[llvm] [WIP][AMDGPU][ASAN] Add amdgpu-asan-instrument-lds pass to instrument LDS (PR #83287)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 28 08:35:02 PST 2024
https://github.com/skc7 created https://github.com/llvm/llvm-project/pull/83287
This is a very early WIP patch for AddressSanitizer instrumentation of LDS variables on AMDGPU. More changes are incoming.
This PR takes the following approach to instrumenting LDS (a simplified IR sketch follows the list):
- Create a new "amdgpu-asan-instrument-lds" pass that runs after the lower-module-lds pass.
- For each kernel, the lower-module-lds pass creates a grouped LDS global. The new pass replaces that global with a new LDS global which stores the malloc pointer.
- A single thread from the workgroup calls malloc in the kernel prologue and stores the returned pointer in the newly created LDS global. In the epilogue, the memory is freed.
- All LDS accesses in the kernel are replaced with offsets into this new global.
- The global-instrumentation logic implemented in the AddressSanitizer pass will be ported to this new pass.
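Roughly, the malloc/free rewrite looks like this. This is a simplified sketch adapted from the k1 kernel in the lds_globals_replace_with_malloc.ll test added below; the llvm.donothing calls, attributes, and the ASan shadow checks emitted in the default mode are omitted, and the i64 32 malloc size is k1's 3 bytes of LDS plus its redzone.

Input (after lower-module-lds):

  define amdgpu_kernel void @k1() #1 {
    store i8 3, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, align 4
    ret void
  }

Output of this pass:

  define amdgpu_kernel void @k1() #1 {
  WId:
    %0 = call i32 @llvm.amdgcn.workitem.id.x()
    %1 = call i32 @llvm.amdgcn.workitem.id.y()
    %2 = call i32 @llvm.amdgcn.workitem.id.z()
    %3 = or i32 %0, %1
    %4 = or i32 %3, %2
    %5 = icmp eq i32 %4, 0
    br i1 %5, label %Malloc, label %7

  Malloc:                            ; a single lane allocates and publishes the pointer
    %6 = call ptr @malloc(i64 32)
    store ptr %6, ptr addrspace(3) @llvm.amdgcn.asan.k1.lds, align 8
    br label %7

  7:
    %xyzCond = phi i1 [ false, %WId ], [ true, %Malloc ]
    call void @llvm.amdgcn.s.barrier()
    store i8 3, ptr addrspace(3) @llvm.amdgcn.asan.k1.lds, align 4
    br label %CondFree

  CondFree:
    call void @llvm.amdgcn.s.barrier()
    br i1 %xyzCond, label %Free, label %End

  Free:                              ; the allocating lane frees the buffer
    %8 = load ptr, ptr addrspace(3) @llvm.amdgcn.asan.k1.lds, align 8
    call void @free(ptr %8)
    br label %End

  End:
    ret void
  }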
>From c1fd827d901ca0e819efdc5f1b19d16ae65817fc Mon Sep 17 00:00:00 2001
From: skc7 <Krishna.Sankisa at amd.com>
Date: Fri, 10 Nov 2023 15:18:23 +0530
Subject: [PATCH] [AMDGPU][ASAN] Add amdgpu-asan-instrument-lds pass to
instrument LDS
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 8 +
.../AMDGPU/AMDGPUAsanInstrumentLDSPass.cpp | 482 ++++++++++++++++++
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 17 +
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 1 +
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
.../AMDGPU/lds_globals_instrument.ll | 216 ++++++++
.../AMDGPU/lds_globals_replace_with_malloc.ll | 109 ++++
7 files changed, 834 insertions(+)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentLDSPass.cpp
create mode 100644 llvm/test/Instrumentation/AddressSanitizer/AMDGPU/lds_globals_instrument.ll
create mode 100644 llvm/test/Instrumentation/AddressSanitizer/AMDGPU/lds_globals_replace_with_malloc.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 36af767a70b0a8..12e9392bac8760 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -86,6 +86,7 @@ void initializeAMDGPUMachineCFGStructurizerPass(PassRegistry&);
extern char &AMDGPUMachineCFGStructurizerID;
void initializeAMDGPUAlwaysInlinePass(PassRegistry&);
+void initializeAMDGPUAsanInstrumentLDSPass(PassRegistry &);
Pass *createAMDGPUAnnotateKernelFeaturesPass();
Pass *createAMDGPUAttributorLegacyPass();
@@ -250,6 +251,13 @@ struct AMDGPUAlwaysInlinePass : PassInfoMixin<AMDGPUAlwaysInlinePass> {
bool GlobalOpt;
};
+ModulePass *createAMDGPUAsanInstrumentLDSPass();
+struct AMDGPUAsanInstrumentLDSPass
+ : PassInfoMixin<AMDGPUAsanInstrumentLDSPass> {
+ AMDGPUAsanInstrumentLDSPass() {}
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
class AMDGPUCodeGenPreparePass
: public PassInfoMixin<AMDGPUCodeGenPreparePass> {
private:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentLDSPass.cpp
new file mode 100644
index 00000000000000..f7a44387ffe952
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentLDSPass.cpp
@@ -0,0 +1,482 @@
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDGPUMemoryUtils.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/ReplaceConstant.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/OptimizedStructLayout.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+#include <iostream>
+#define DEBUG_TYPE "amdgpu-asan-instrument-lds"
+
+using namespace llvm;
+using DomTreeCallback = function_ref<DominatorTree *(Function &F)>;
+
+namespace {
+// TODO: Just for testing purposes. Will be removed.
+cl::opt<bool> ReplaceLDSAndInstrument(
+ "amdgpu-replace-lds-and-instrument",
+ cl::desc("Replace LDS accesses with malloc and ASan-instrument them; when "
+ "disabled, only replace LDS accesses with malloc."),
+ cl::init(true), cl::Hidden);
+
+const char kAMDGPUBallotName[] = "llvm.amdgcn.ballot.i64";
+const char kAMDGPUUnreachableName[] = "llvm.amdgcn.unreachable";
+static const uint64_t kSmallX86_64ShadowOffsetBase = 0x7FFFFFFF;
+static const uint64_t kSmallX86_64ShadowOffsetAlignMask = ~0xFFFULL;
+const bool Recover = true;
+const uint32_t AsanMappingScale = 3;
+const uint32_t AsanMappingOffset =
+ (kSmallX86_64ShadowOffsetBase &
+ (kSmallX86_64ShadowOffsetAlignMask << AsanMappingScale));
+
+class AMDGPUAsanInstrumentLDS : public ModulePass {
+
+public:
+ static char ID;
+ AMDGPUAsanInstrumentLDS() : ModulePass(ID) {}
+
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequiredID(AMDGPULowerModuleLDSLegacyPassID);
+ }
+};
+} // namespace
+
+INITIALIZE_PASS(AMDGPUAsanInstrumentLDS, "amdgpu-asan-instrument-lds",
+ "AMDGPU AddressSanitizer instrument LDS", false, false)
+
+char AMDGPUAsanInstrumentLDS::ID = 0;
+
+static uint64_t getRedzoneSizeForGlobal(uint64_t SizeInBytes) {
+ constexpr uint64_t kMaxRZ = 1 << 18;
+ // TODO: get scale from asan-mapping-scale
+ const int MappingScale = AsanMappingScale;
+ const uint64_t MinRZ = std::max(32U, 1U << MappingScale);
+
+ uint64_t RZ = 0;
+ if (SizeInBytes <= MinRZ / 2) {
+ // Reduce redzone size for small size objects, e.g. int, char[1]. MinRZ is
+ // at least 32 bytes, optimize when SizeInBytes is less than or equal to
+ // half of MinRZ.
+ RZ = MinRZ - SizeInBytes;
+ } else {
+ // Calculate RZ, where MinRZ <= RZ <= MaxRZ, and RZ ~ 1/4 * SizeInBytes.
+ RZ = std::clamp((SizeInBytes / MinRZ / 4) * MinRZ, MinRZ, kMaxRZ);
+
+ // Round up to multiple of MinRZ.
+ if (SizeInBytes % MinRZ)
+ RZ += MinRZ - (SizeInBytes % MinRZ);
+ }
+
+ assert((RZ + SizeInBytes) % MinRZ == 0);
+
+ return RZ;
+}
+
+static Instruction *genAMDGPUReportBlock(Module &M, IRBuilder<> &IRB,
+ Value *Cond, bool Recover) {
+ Value *ReportCond = Cond;
+ if (!Recover) {
+ auto Ballot = M.getOrInsertFunction(kAMDGPUBallotName, IRB.getInt64Ty(),
+ IRB.getInt1Ty());
+ ReportCond = IRB.CreateIsNotNull(IRB.CreateCall(Ballot, {Cond}));
+ }
+
+ auto *Trm = SplitBlockAndInsertIfThen(
+ ReportCond, &*IRB.GetInsertPoint(), false,
+ MDBuilder(M.getContext()).createBranchWeights(1, 100000));
+ Trm->getParent()->setName("asan.report");
+
+ if (Recover)
+ return Trm;
+
+ Trm = SplitBlockAndInsertIfThen(Cond, Trm, false);
+ IRB.SetInsertPoint(Trm);
+ return IRB.CreateCall(
+ M.getOrInsertFunction(kAMDGPUUnreachableName, IRB.getVoidTy()), {});
+}
+
+static Value *createSlowPathCmp(Module &M, IRBuilder<> &IRB, Value *AddrLong,
+ Value *ShadowValue, uint32_t TypeStoreSize) {
+
+ unsigned int LongSize = M.getDataLayout().getPointerSizeInBits();
+ IntegerType *IntptrTy = Type::getIntNTy(M.getContext(), LongSize);
+ size_t Granularity = static_cast<size_t>(1) << AsanMappingScale;
+ // Addr & (Granularity - 1)
+ Value *LastAccessedByte =
+ IRB.CreateAnd(AddrLong, ConstantInt::get(IntptrTy, Granularity - 1));
+ // (Addr & (Granularity - 1)) + size - 1
+ if (TypeStoreSize / 8 > 1)
+ LastAccessedByte = IRB.CreateAdd(
+ LastAccessedByte, ConstantInt::get(IntptrTy, TypeStoreSize / 8 - 1));
+ // (uint8_t) ((Addr & (Granularity-1)) + size - 1)
+ LastAccessedByte =
+ IRB.CreateIntCast(LastAccessedByte, ShadowValue->getType(), false);
+ // ((uint8_t) ((Addr & (Granularity-1)) + size - 1)) >= ShadowValue
+ return IRB.CreateICmpSGE(LastAccessedByte, ShadowValue);
+}
+
+static size_t TypeStoreSizeToSizeIndex(uint32_t TypeSize) {
+ size_t Res = llvm::countr_zero(TypeSize / 8);
+ return Res;
+}
+
+static Instruction *generateCrashCode(Module &M, IRBuilder<> &IRB,
+ Instruction *InsertBefore, Value *Addr,
+ bool IsWrite, size_t AccessSizeIndex,
+ Value *SizeArgument) {
+ IRB.SetInsertPoint(InsertBefore);
+ CallInst *Call = nullptr;
+ int LongSize = M.getDataLayout().getPointerSizeInBits();
+ Type *IntptrTy = Type::getIntNTy(M.getContext(), LongSize);
+ const char kAsanReportErrorTemplate[] = "__asan_report_";
+ const std::string TypeStr = IsWrite ? "store" : "load";
+ const std::string EndingStr = Recover ? "_noabort" : "";
+ SmallVector<Type *, 3> Args2 = {IntptrTy, IntptrTy};
+ AttributeList AL2;
+ FunctionCallee AsanErrorCallbackSized = M.getOrInsertFunction(
+ kAsanReportErrorTemplate + TypeStr + "_n" + EndingStr,
+ FunctionType::get(IRB.getVoidTy(), Args2, false), AL2);
+ const std::string Suffix = TypeStr + llvm::itostr(1ULL << AccessSizeIndex);
+ SmallVector<Type *, 2> Args1{1, IntptrTy};
+ AttributeList AL1;
+ FunctionCallee AsanErrorCallback = M.getOrInsertFunction(
+ kAsanReportErrorTemplate + Suffix + EndingStr,
+ FunctionType::get(IRB.getVoidTy(), Args1, false), AL1);
+ if (SizeArgument) {
+ Call = IRB.CreateCall(AsanErrorCallbackSized, {Addr, SizeArgument});
+ } else {
+ Call = IRB.CreateCall(AsanErrorCallback, Addr);
+ }
+
+ Call->setCannotMerge();
+ return Call;
+}
+
+static Value *memToShadow(Module &M, Value *Shadow, IRBuilder<> &IRB) {
+ int LongSize = M.getDataLayout().getPointerSizeInBits();
+ Type *IntptrTy = Type::getIntNTy(M.getContext(), LongSize);
+ // Shadow >> scale
+ Shadow = IRB.CreateLShr(Shadow, AsanMappingScale);
+ if (AsanMappingOffset == 0)
+ return Shadow;
+ // (Shadow >> scale) | offset
+ Value *ShadowBase = ConstantInt::get(IntptrTy, AsanMappingOffset);
+ return IRB.CreateAdd(Shadow, ShadowBase);
+}
+
+static void InstrumentAddress(Module &M, IRBuilder<> &IRB, Instruction *OrigIns,
+ Instruction *InsertBefore, Value *Addr,
+ MaybeAlign Alignment, uint32_t TypeStoreSize,
+ bool IsWrite, Value *SizeArgument,
+ bool UseCalls) {
+ int LongSize = M.getDataLayout().getPointerSizeInBits();
+ Type *IntptrTy = Type::getIntNTy(M.getContext(), LongSize);
+ IRB.SetInsertPoint(InsertBefore);
+ size_t AccessSizeIndex = TypeStoreSizeToSizeIndex(TypeStoreSize);
+ Type *ShadowTy = IntegerType::get(
+ M.getContext(), std::max(8U, TypeStoreSize >> AsanMappingScale));
+ Type *ShadowPtrTy = PointerType::get(ShadowTy, 0);
+ Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
+ Value *ShadowPtr = memToShadow(M, AddrLong, IRB);
+ const uint64_t ShadowAlign =
+ std::max<uint64_t>(Alignment.valueOrOne().value() >> AsanMappingScale, 1);
+ Value *ShadowValue = IRB.CreateAlignedLoad(
+ ShadowTy, IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy), Align(ShadowAlign));
+ Value *Cmp = IRB.CreateIsNotNull(ShadowValue);
+ size_t Granularity = 1ULL << AsanMappingScale;
+ Instruction *CrashTerm = nullptr;
+ auto *Cmp2 = createSlowPathCmp(M, IRB, AddrLong, ShadowValue, TypeStoreSize);
+ Cmp = IRB.CreateAnd(Cmp, Cmp2);
+ CrashTerm = genAMDGPUReportBlock(M, IRB, Cmp, Recover);
+ generateCrashCode(M, IRB, CrashTerm, AddrLong, IsWrite, AccessSizeIndex,
+ SizeArgument);
+}
+
+static GlobalVariable *ReplaceLDSWithMalloc(IRBuilder<> &IRB, Module &M,
+ Function *Func,
+ GlobalVariable *LoweredLDSGlobal,
+ DomTreeUpdater &DTU) {
+ // TODO:
+ // Do a single malloc covering all members of the grouped LDS global and
+ // record each member's offset into the allocation. Store the malloc pointer
+ // in the new LDS global and replace LDS accesses with malloc pointer + offset.
+ StructType *LoweredLDSGlobalTy =
+ dyn_cast<StructType>(LoweredLDSGlobal->getValueType());
+ if (!LoweredLDSGlobalTy)
+ return nullptr;
+
+ auto &Ctx = M.getContext();
+ // Store previous entry block
+ auto *PrevEntryBlock = &Func->getEntryBlock();
+
+ // Create malloc block
+ auto *MallocBlock = BasicBlock::Create(Ctx, "Malloc", Func, PrevEntryBlock);
+
+ // Create WIdBlock block
+ auto *WIdBlock = BasicBlock::Create(Ctx, "WId", Func, MallocBlock);
+ IRB.SetInsertPoint(WIdBlock, WIdBlock->begin());
+ auto *const WIdx =
+ IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {}, {});
+ auto *const WIdy =
+ IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_y, {}, {});
+ auto *const WIdz =
+ IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_z, {}, {});
+ auto *const XYOr = IRB.CreateOr(WIdx, WIdy);
+ auto *const XYZOr = IRB.CreateOr(XYOr, WIdz);
+ auto *const WIdzCond = IRB.CreateICmpEQ(XYZOr, IRB.getInt32(0));
+ IRB.CreateCondBr(WIdzCond, MallocBlock, PrevEntryBlock);
+
+ // Malloc block
+ IRB.SetInsertPoint(MallocBlock, MallocBlock->begin());
+ const char kAsanMallocImplName[] = "malloc";
+ FunctionCallee AsanAMDGPUMallocReturn = M.getOrInsertFunction(
+ kAsanMallocImplName,
+ FunctionType::get(IRB.getPtrTy(), {IRB.getInt64Ty()}, false));
+
+ auto &DL = M.getDataLayout();
+ uint64_t MallocOffset = 0;
+ uint64_t MemberOffset = 0;
+ DenseMap<uint64_t, uint64_t> MemberOffsetToMallocOffsetMap;
+ for (auto I : llvm::enumerate(LoweredLDSGlobalTy->elements())) {
+ Type *Ty = I.value();
+ MemberOffsetToMallocOffsetMap[MemberOffset] = MallocOffset;
+ const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
+ const uint64_t RightRedzoneSize = getRedzoneSizeForGlobal(SizeInBytes);
+ MallocOffset += SizeInBytes + RightRedzoneSize;
+ ++MemberOffset;
+ }
+
+ ConstantInt *MallocSizeArg =
+ ConstantInt::get(Type::getInt64Ty(Ctx), MallocOffset);
+ Value *MCI = IRB.CreateCall(AsanAMDGPUMallocReturn, {MallocSizeArg});
+ // Create a new LDS global that holds the malloc pointer.
+ GlobalVariable *NewGlobal = new GlobalVariable(
+ M, IRB.getPtrTy(), false, GlobalValue::InternalLinkage,
+ PoisonValue::get(IRB.getPtrTy()),
+ Twine("llvm.amdgcn.asan." + Func->getName() + ".lds"), nullptr,
+ GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
+ // Store the malloc pointer to the new global.
+ IRB.CreateStore(MCI, NewGlobal);
+
+ // Replace lds accesses with malloc ptr + offsets
+ for (Use &U : make_early_inc_range(LoweredLDSGlobal->uses())) {
+ if (GEPOperator *GEP = dyn_cast<GEPOperator>(U.getUser())) {
+ Instruction *UserI = dyn_cast<Instruction>(GEP->use_begin()->getUser());
+ unsigned Indices = GEP->getNumIndices();
+ MemberOffset =
+ cast<ConstantInt>(GEP->getOperand(Indices))->getZExtValue();
+ Type *Ty = LoweredLDSGlobalTy->elements()[MemberOffset];
+ MallocOffset = MemberOffsetToMallocOffsetMap[MemberOffset];
+ ConstantInt *OffsetConst =
+ ConstantInt::get(Type::getInt64Ty(Ctx), MallocOffset);
+ Constant *AddrPlusOffset =
+ ConstantExpr::getGetElementPtr(Ty, NewGlobal, {OffsetConst}, true);
+ U.getUser()->replaceAllUsesWith(AddrPlusOffset);
+ continue;
+ } else {
+ MemberOffset = 0;
+ Type *Ty = LoweredLDSGlobalTy->elements()[MemberOffset];
+ MallocOffset = MemberOffsetToMallocOffsetMap[MemberOffset];
+ ConstantInt *OffsetConst =
+ ConstantInt::get(Type::getInt64Ty(Ctx), MallocOffset);
+ Constant *AddrPlusOffset =
+ ConstantExpr::getGetElementPtr(Ty, NewGlobal, {OffsetConst}, true);
+ U.set(AddrPlusOffset);
+ }
+ }
+
+ // Add the new global to the llvm.compiler.used list to prevent LTO from
+ // ConstantMerge'ing it.
+ appendToCompilerUsed(M, {NewGlobal});
+
+ // Create branch to PrevEntryBlock
+ IRB.CreateBr(PrevEntryBlock);
+
+ // Create a workgroup barrier at the start of the previous entry block.
+ Type *Int1Ty = IRB.getInt1Ty();
+ IRB.SetInsertPoint(PrevEntryBlock, PrevEntryBlock->begin());
+ auto *XYZCondPhi = IRB.CreatePHI(Int1Ty, 2, "xyzCond");
+ XYZCondPhi->addIncoming(IRB.getInt1(0), WIdBlock);
+ XYZCondPhi->addIncoming(IRB.getInt1(1), MallocBlock);
+
+ IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}, {});
+
+ auto *CondFreeBlock = BasicBlock::Create(Ctx, "CondFree", Func);
+ auto *FreeBlock = BasicBlock::Create(Ctx, "Free", Func);
+ auto *EndBlock = BasicBlock::Create(Ctx, "End", Func);
+ DenseMap<BasicBlock *, Value *> BBToRetValMap;
+ for (BasicBlock &BB : *Func) {
+ if (!BB.empty()) {
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(&BB.back())) {
+ BasicBlock *Block = &BB;
+ Value *Val = RI->getReturnValue();
+ BBToRetValMap[Block] = Val;
+ RI->eraseFromParent();
+ IRB.SetInsertPoint(&BB, BB.end());
+ IRB.CreateBr(CondFreeBlock);
+ }
+ }
+ }
+
+ // assert(!BBToRetValMap.empty() && "Function has no return");
+ IRB.SetInsertPoint(CondFreeBlock, CondFreeBlock->begin());
+
+ const uint64_t Size = BBToRetValMap.size();
+ auto First = BBToRetValMap.begin();
+ auto Pair = *First;
+ Value *Val = Pair.second;
+ Type *FPhiTy = Val ? Pair.second->getType() : IRB.getVoidTy();
+ auto *CFPhi = Val ? IRB.CreatePHI(FPhiTy, Size) : nullptr;
+
+ for (auto &Entry : BBToRetValMap) {
+ BasicBlock *BB = Entry.first;
+ Value *Val = Entry.second;
+ if (Val)
+ CFPhi->addIncoming(Val, BB);
+ }
+ IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}, {});
+
+ IRB.CreateCondBr(XYZCondPhi, FreeBlock, EndBlock);
+
+ // Free Block
+ IRB.SetInsertPoint(FreeBlock, FreeBlock->begin());
+ const char kAsanFreeImplName[] = "free";
+ FunctionCallee AsanAMDGPUFreeReturn = M.getOrInsertFunction(
+ kAsanFreeImplName,
+ FunctionType::get(IRB.getVoidTy(), {IRB.getPtrTy()}, false));
+
+ Value *MallocPtr = IRB.CreateLoad(IRB.getPtrTy(), NewGlobal);
+ IRB.CreateCall(AsanAMDGPUFreeReturn, {MallocPtr});
+
+ IRB.CreateBr(EndBlock);
+
+ // End Block
+ IRB.SetInsertPoint(EndBlock, EndBlock->begin());
+ if (CFPhi)
+ IRB.CreateRet(CFPhi);
+ else
+ IRB.CreateRetVoid();
+
+ DTU.applyUpdates({{DominatorTree::Insert, WIdBlock, MallocBlock},
+ {DominatorTree::Insert, MallocBlock, PrevEntryBlock},
+ {DominatorTree::Insert, PrevEntryBlock, CondFreeBlock},
+ {DominatorTree::Insert, CondFreeBlock, FreeBlock},
+ {DominatorTree::Insert, FreeBlock, EndBlock}});
+ return NewGlobal;
+}
+
+static uint32_t getKernelLdsSizeAttributeAsInt(Function &F) {
+ StringRef S = F.getFnAttribute("amdgpu-lds-size").getValueAsString();
+ uint32_t LdsSize = 0;
+ if (!S.empty())
+ S.consumeInteger(0, LdsSize);
+ return LdsSize;
+}
+
+static GlobalVariable *getKernelLDSGlobal(Module &M, Function &F) {
+ SmallString<64> KernelLDSName("llvm.amdgcn.kernel.");
+ KernelLDSName += F.getName();
+ KernelLDSName += ".lds";
+ return M.getNamedGlobal(KernelLDSName);
+}
+
+static bool AMDGPUAsanInstrumentLDSImpl(Module &M, DomTreeCallback DTCallback) {
+ if (!AMDGPUTargetMachine::EnableLowerModuleLDS)
+ return false;
+ LLVMContext &Ctx = M.getContext();
+ IRBuilder<> IRB(Ctx);
+ SmallVector<GlobalVariable *, 16> NewLdsMallocGlobals;
+ for (Function &F : M) {
+ bool hasSanitizeAddrAttr = F.hasFnAttribute(Attribute::SanitizeAddress);
+ GlobalVariable *GV = getKernelLDSGlobal(M, F);
+ uint32_t StaticLdsSize = getKernelLdsSizeAttributeAsInt(F);
+ if (hasSanitizeAddrAttr && (GV || (StaticLdsSize != 0))) {
+ DomTreeUpdater DTU(DTCallback(F), DomTreeUpdater::UpdateStrategy::Lazy);
+ GlobalVariable *NewGlobal = ReplaceLDSWithMalloc(IRB, M, &F, {GV}, DTU);
+ if (NewGlobal)
+ NewLdsMallocGlobals.push_back(NewGlobal);
+ }
+ }
+
+ if (ReplaceLDSAndInstrument) {
+ for (GlobalVariable *GV : NewLdsMallocGlobals) {
+ // Iterate over the uses of the global.
+ for (Use &U : make_early_inc_range(GV->uses())) {
+ if (GEPOperator *GEP = dyn_cast<GEPOperator>(U.getUser())) {
+ Instruction *UserI = dyn_cast<Instruction>(GEP->use_begin()->getUser());
+ bool IsStore = UserI ? isa<StoreInst>(UserI) : false;
+ InstrumentAddress(M, IRB, UserI, UserI, U, {}, 8, IsStore, nullptr,
+ false);
+ } else {
+ Instruction *UserI = dyn_cast<Instruction>(U.getUser());
+ if (!UserI)
+ continue;
+ bool IsStore = UserI ? isa<StoreInst>(UserI) : false;
+ InstrumentAddress(M, IRB, UserI, UserI, U, {}, 8, IsStore, nullptr,
+ false);
+ }
+ }
+ }
+ }
+ return false;
+}
+
+bool AMDGPUAsanInstrumentLDS::runOnModule(Module &M) {
+ DominatorTreeWrapperPass *const DTW =
+ getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ auto DTCallback = [&DTW](Function &F) -> DominatorTree * {
+ return DTW ? &DTW->getDomTree() : nullptr;
+ };
+ bool IsChanged = AMDGPUAsanInstrumentLDSImpl(M, DTCallback);
+ return IsChanged;
+}
+
+ModulePass *llvm::createAMDGPUAsanInstrumentLDSPass() {
+ return new AMDGPUAsanInstrumentLDS();
+}
+
+PreservedAnalyses AMDGPUAsanInstrumentLDSPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto DTCallback = [&FAM](Function &F) -> DominatorTree * {
+ return &FAM.getResult<DominatorTreeAnalysis>(F);
+ };
+ bool IsChanged = AMDGPUAsanInstrumentLDSImpl(M, DTCallback);
+ if (!IsChanged)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ return PA;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e26b4cf820a52b..f287e35a39eb75 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -330,6 +330,12 @@ static cl::opt<bool, true> EnableLowerModuleLDS(
cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
cl::Hidden);
+static cl::opt<bool, true> EnableAsanInstrumentLDS(
+ "amdgpu-enable-asan-instrument-lds",
+ cl::desc("Enable asan instrument lds pass"),
+ cl::location(AMDGPUTargetMachine::EnableAsanInstrumentLDS), cl::init(true),
+ cl::Hidden);
+
static cl::opt<bool> EnablePreRAOptimizations(
"amdgpu-enable-pre-ra-optimizations",
cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
@@ -394,6 +400,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSILoadStoreOptimizerPass(*PR);
initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
initializeAMDGPUAlwaysInlinePass(*PR);
+ initializeAMDGPUAsanInstrumentLDSPass(*PR);
initializeAMDGPUAttributorLegacyPass(*PR);
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesPass(*PR);
@@ -591,6 +598,7 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;
bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
+bool AMDGPUTargetMachine::EnableAsanInstrumentLDS = true;
AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
@@ -646,6 +654,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(
PM.addPass(AMDGPULowerModuleLDSPass(*this));
return true;
}
+ if (PassName == "amdgpu-asan-instrument-lds") {
+ PM.addPass(AMDGPUAsanInstrumentLDSPass());
+ return true;
+ }
if (PassName == "amdgpu-lower-ctor-dtor") {
PM.addPass(AMDGPUCtorDtorLoweringPass());
return true;
@@ -1040,6 +1052,11 @@ void AMDGPUPassConfig::addIRPasses() {
addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
}
+ // Add this pass only if asan is enabled
+ if (EnableAsanInstrumentLDS) {
+ addPass(createAMDGPUAsanInstrumentLDSPass());
+ }
+
// AMDGPUAttributor infers lack of llvm.amdgcn.lds.kernel.id calls, so run
// after their introduction
if (TM.getOptLevel() > CodeGenOptLevel::None)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index ce2dd2947daf65..44dfb9ca0036e0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -37,6 +37,7 @@ class AMDGPUTargetMachine : public LLVMTargetMachine {
static bool EnableLateStructurizeCFG;
static bool EnableFunctionCalls;
static bool EnableLowerModuleLDS;
+ static bool EnableAsanInstrumentLDS;
AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 9a974eaf50d235..0c3ff779db1995 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -72,6 +72,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPULowerKernelArguments.cpp
AMDGPULowerKernelAttributes.cpp
AMDGPULowerModuleLDSPass.cpp
+ AMDGPUAsanInstrumentLDSPass.cpp
AMDGPUMachineCFGStructurizer.cpp
AMDGPUMachineFunction.cpp
AMDGPUMachineModuleInfo.cpp
diff --git a/llvm/test/Instrumentation/AddressSanitizer/AMDGPU/lds_globals_instrument.ll b/llvm/test/Instrumentation/AddressSanitizer/AMDGPU/lds_globals_instrument.ll
new file mode 100644
index 00000000000000..85f7a0a806a384
--- /dev/null
+++ b/llvm/test/Instrumentation/AddressSanitizer/AMDGPU/lds_globals_instrument.ll
@@ -0,0 +1,216 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt < %s -passes=amdgpu-asan-instrument-lds -S -mtriple=amdgcn-- | FileCheck %s
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-unknown-unknown"
+
+%llvm.amdgcn.module.lds.t = type { [5 x i32] }
+%llvm.amdgcn.kernel.k0.lds.t = type { [3 x i8], [1 x i8] }
+%llvm.amdgcn.kernel.k1.lds.t = type { [3 x i8] }
+
+ at llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 4, !absolute_symbol !0
+ at llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata"
+ at llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t poison, align 4, !absolute_symbol !1
+ at llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t poison, align 4, !absolute_symbol !1
+
+; Function Attrs: sanitize_address
+define amdgpu_kernel void @k0() #0 {
+; CHECK-LABEL: define amdgpu_kernel void @k0(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP15:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr @malloc(i64 64)
+; CHECK-NEXT: [[TMP7:%.*]] = lshr i64 ptrtoint (ptr addrspace(3) @llvm.amdgcn.asan.k0.lds to i64), 3
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 2147450880
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP10]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = icmp sge i8 0, [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = and i1 [[TMP11]], [[TMP12]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[ASAN_REPORT4:%.*]], label [[TMP14:%.*]], !prof [[PROF2:![0-9]+]]
+; CHECK: asan.report4:
+; CHECK-NEXT: call void @__asan_report_store1_noabort(i64 ptrtoint (ptr addrspace(3) @llvm.amdgcn.asan.k0.lds to i64)) #[[ATTR5:[0-9]+]]
+; CHECK-NEXT: br label [[TMP14]]
+; CHECK: 14:
+; CHECK-NEXT: store ptr [[TMP6]], ptr addrspace(3) @llvm.amdgcn.asan.k0.lds, align 8
+; CHECK-NEXT: br label [[TMP15]]
+; CHECK: 15:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[TMP14]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP16:%.*]] = lshr i64 ptrtoint (ptr addrspace(3) @llvm.amdgcn.asan.k0.lds to i64), 3
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 2147450880
+; CHECK-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP19]], 0
+; CHECK-NEXT: [[TMP21:%.*]] = icmp sge i8 0, [[TMP19]]
+; CHECK-NEXT: [[TMP22:%.*]] = and i1 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: br i1 [[TMP22]], label [[ASAN_REPORT1:%.*]], label [[TMP23:%.*]], !prof [[PROF2]]
+; CHECK: asan.report1:
+; CHECK-NEXT: call void @__asan_report_load1_noabort(i64 ptrtoint (ptr addrspace(3) @llvm.amdgcn.asan.k0.lds to i64)) #[[ATTR5]]
+; CHECK-NEXT: br label [[TMP23]]
+; CHECK: 23:
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.asan.k0.lds) ], !alias.scope [[META3:![0-9]+]], !noalias [[META6:![0-9]+]]
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
+; CHECK-NEXT: [[TMP24:%.*]] = lshr i64 ptrtoint (ptr addrspace(3) @llvm.amdgcn.asan.k0.lds to i64), 3
+; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[TMP24]], 2147450880
+; CHECK-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP25]] to ptr
+; CHECK-NEXT: [[TMP27:%.*]] = load i8, ptr [[TMP26]], align 1
+; CHECK-NEXT: [[TMP28:%.*]] = icmp ne i8 [[TMP27]], 0
+; CHECK-NEXT: [[TMP29:%.*]] = icmp sge i8 0, [[TMP27]]
+; CHECK-NEXT: [[TMP30:%.*]] = and i1 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: br i1 [[TMP30]], label [[ASAN_REPORT2:%.*]], label [[TMP31:%.*]], !prof [[PROF2]]
+; CHECK: asan.report2:
+; CHECK-NEXT: call void @__asan_report_store1_noabort(i64 ptrtoint (ptr addrspace(3) @llvm.amdgcn.asan.k0.lds to i64)) #[[ATTR5]]
+; CHECK-NEXT: br label [[TMP31]]
+; CHECK: 31:
+; CHECK-NEXT: store i8 1, ptr addrspace(3) getelementptr inbounds ([1 x i8], ptr addrspace(3) @llvm.amdgcn.asan.k0.lds, i64 32), align 1, !alias.scope [[META6]], !noalias [[META3]]
+; CHECK-NEXT: [[TMP32:%.*]] = lshr i64 ptrtoint (ptr addrspace(3) @llvm.amdgcn.asan.k0.lds to i64), 3
+; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[TMP32]], 2147450880
+; CHECK-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
+; CHECK-NEXT: [[TMP35:%.*]] = load i8, ptr [[TMP34]], align 1
+; CHECK-NEXT: [[TMP36:%.*]] = icmp ne i8 [[TMP35]], 0
+; CHECK-NEXT: [[TMP37:%.*]] = icmp sge i8 0, [[TMP35]]
+; CHECK-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
+; CHECK-NEXT: br i1 [[TMP38]], label [[ASAN_REPORT3:%.*]], label [[TMP39:%.*]], !prof [[PROF2]]
+; CHECK: asan.report3:
+; CHECK-NEXT: call void @__asan_report_store1_noabort(i64 ptrtoint (ptr addrspace(3) @llvm.amdgcn.asan.k0.lds to i64)) #[[ATTR5]]
+; CHECK-NEXT: br label [[TMP39]]
+; CHECK: 39:
+; CHECK-NEXT: store i8 2, ptr addrspace(3) @llvm.amdgcn.asan.k0.lds, align 4, !alias.scope [[META3]], !noalias [[META6]]
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP40:%.*]] = lshr i64 ptrtoint (ptr addrspace(3) @llvm.amdgcn.asan.k0.lds to i64), 3
+; CHECK-NEXT: [[TMP41:%.*]] = add i64 [[TMP40]], 2147450880
+; CHECK-NEXT: [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr
+; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr [[TMP42]], align 1
+; CHECK-NEXT: [[TMP44:%.*]] = icmp ne i8 [[TMP43]], 0
+; CHECK-NEXT: [[TMP45:%.*]] = icmp sge i8 0, [[TMP43]]
+; CHECK-NEXT: [[TMP46:%.*]] = and i1 [[TMP44]], [[TMP45]]
+; CHECK-NEXT: br i1 [[TMP46]], label [[ASAN_REPORT:%.*]], label [[TMP47:%.*]], !prof [[PROF2]]
+; CHECK: asan.report:
+; CHECK-NEXT: call void @__asan_report_load1_noabort(i64 ptrtoint (ptr addrspace(3) @llvm.amdgcn.asan.k0.lds to i64)) #[[ATTR5]]
+; CHECK-NEXT: br label [[TMP47]]
+; CHECK: 47:
+; CHECK-NEXT: [[TMP48:%.*]] = load ptr, ptr addrspace(3) @llvm.amdgcn.asan.k0.lds, align 8
+; CHECK-NEXT: call void @free(ptr [[TMP48]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds) ], !alias.scope !2, !noalias !5
+ call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
+ store i8 1, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1), align 1, !alias.scope !5, !noalias !2
+ store i8 2, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 4, !alias.scope !2, !noalias !5
+ ret void
+}
+
+; Function Attrs: sanitize_address
+define amdgpu_kernel void @k1() #1 {
+; CHECK-LABEL: define amdgpu_kernel void @k1(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP15:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr @malloc(i64 32)
+; CHECK-NEXT: [[TMP7:%.*]] = lshr i64 ptrtoint (ptr addrspace(3) @llvm.amdgcn.asan.k1.lds to i64), 3
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 2147450880
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP10]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = icmp sge i8 0, [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = and i1 [[TMP11]], [[TMP12]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[ASAN_REPORT3:%.*]], label [[TMP14:%.*]], !prof [[PROF2]]
+; CHECK: asan.report3:
+; CHECK-NEXT: call void @__asan_report_store1_noabort(i64 ptrtoint (ptr addrspace(3) @llvm.amdgcn.asan.k1.lds to i64)) #[[ATTR5]]
+; CHECK-NEXT: br label [[TMP14]]
+; CHECK: 14:
+; CHECK-NEXT: store ptr [[TMP6]], ptr addrspace(3) @llvm.amdgcn.asan.k1.lds, align 8
+; CHECK-NEXT: br label [[TMP15]]
+; CHECK: 15:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[TMP14]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: [[TMP16:%.*]] = lshr i64 ptrtoint (ptr addrspace(3) @llvm.amdgcn.asan.k1.lds to i64), 3
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 2147450880
+; CHECK-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr
+; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP19]], 0
+; CHECK-NEXT: [[TMP21:%.*]] = icmp sge i8 0, [[TMP19]]
+; CHECK-NEXT: [[TMP22:%.*]] = and i1 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: br i1 [[TMP22]], label [[ASAN_REPORT1:%.*]], label [[TMP23:%.*]], !prof [[PROF2]]
+; CHECK: asan.report1:
+; CHECK-NEXT: call void @__asan_report_load1_noabort(i64 ptrtoint (ptr addrspace(3) @llvm.amdgcn.asan.k1.lds to i64)) #[[ATTR5]]
+; CHECK-NEXT: br label [[TMP23]]
+; CHECK: 23:
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.asan.k1.lds) ]
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
+; CHECK-NEXT: [[TMP24:%.*]] = lshr i64 ptrtoint (ptr addrspace(3) @llvm.amdgcn.asan.k1.lds to i64), 3
+; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[TMP24]], 2147450880
+; CHECK-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP25]] to ptr
+; CHECK-NEXT: [[TMP27:%.*]] = load i8, ptr [[TMP26]], align 1
+; CHECK-NEXT: [[TMP28:%.*]] = icmp ne i8 [[TMP27]], 0
+; CHECK-NEXT: [[TMP29:%.*]] = icmp sge i8 0, [[TMP27]]
+; CHECK-NEXT: [[TMP30:%.*]] = and i1 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: br i1 [[TMP30]], label [[ASAN_REPORT2:%.*]], label [[TMP31:%.*]], !prof [[PROF2]]
+; CHECK: asan.report2:
+; CHECK-NEXT: call void @__asan_report_store1_noabort(i64 ptrtoint (ptr addrspace(3) @llvm.amdgcn.asan.k1.lds to i64)) #[[ATTR5]]
+; CHECK-NEXT: br label [[TMP31]]
+; CHECK: 31:
+; CHECK-NEXT: store i8 3, ptr addrspace(3) @llvm.amdgcn.asan.k1.lds, align 4
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP32:%.*]] = lshr i64 ptrtoint (ptr addrspace(3) @llvm.amdgcn.asan.k1.lds to i64), 3
+; CHECK-NEXT: [[TMP33:%.*]] = add i64 [[TMP32]], 2147450880
+; CHECK-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr
+; CHECK-NEXT: [[TMP35:%.*]] = load i8, ptr [[TMP34]], align 1
+; CHECK-NEXT: [[TMP36:%.*]] = icmp ne i8 [[TMP35]], 0
+; CHECK-NEXT: [[TMP37:%.*]] = icmp sge i8 0, [[TMP35]]
+; CHECK-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
+; CHECK-NEXT: br i1 [[TMP38]], label [[ASAN_REPORT:%.*]], label [[TMP39:%.*]], !prof [[PROF2]]
+; CHECK: asan.report:
+; CHECK-NEXT: call void @__asan_report_load1_noabort(i64 ptrtoint (ptr addrspace(3) @llvm.amdgcn.asan.k1.lds to i64)) #[[ATTR5]]
+; CHECK-NEXT: br label [[TMP39]]
+; CHECK: 39:
+; CHECK-NEXT: [[TMP40:%.*]] = load ptr, ptr addrspace(3) @llvm.amdgcn.asan.k1.lds, align 8
+; CHECK-NEXT: call void @free(ptr [[TMP40]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds) ]
+ call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
+ store i8 3, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare void @llvm.donothing() #2
+
+attributes #0 = { sanitize_address "amdgpu-lds-size"="24" }
+attributes #1 = { sanitize_address "amdgpu-lds-size"="23" }
+attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) }
+
+!0 = !{i32 0, i32 1}
+!1 = !{i32 20, i32 21}
+!2 = !{!3}
+!3 = distinct !{!3, !4}
+!4 = distinct !{!4}
+!5 = !{!6}
+!6 = distinct !{!6, !4}
diff --git a/llvm/test/Instrumentation/AddressSanitizer/AMDGPU/lds_globals_replace_with_malloc.ll b/llvm/test/Instrumentation/AddressSanitizer/AMDGPU/lds_globals_replace_with_malloc.ll
new file mode 100644
index 00000000000000..1e32ea136c3979
--- /dev/null
+++ b/llvm/test/Instrumentation/AddressSanitizer/AMDGPU/lds_globals_replace_with_malloc.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt < %s -passes=amdgpu-asan-instrument-lds -amdgpu-replace-lds-and-instrument=0 -S -mtriple=amdgcn-- | FileCheck %s
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-unknown-unknown"
+
+%llvm.amdgcn.module.lds.t = type { [5 x i32] }
+%llvm.amdgcn.kernel.k0.lds.t = type { [3 x i8], [1 x i8] }
+%llvm.amdgcn.kernel.k1.lds.t = type { [3 x i8] }
+
+ at llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 4, !absolute_symbol !0
+ at llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata"
+ at llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t poison, align 4, !absolute_symbol !1
+ at llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t poison, align 4, !absolute_symbol !1
+
+; Function Attrs: sanitize_address
+define amdgpu_kernel void @k0() #0 {
+; CHECK-LABEL: define amdgpu_kernel void @k0(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr @malloc(i64 64)
+; CHECK-NEXT: store ptr [[TMP6]], ptr addrspace(3) @llvm.amdgcn.asan.k0.lds, align 8
+; CHECK-NEXT: br label [[TMP7]]
+; CHECK: 7:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.asan.k0.lds) ], !alias.scope [[META2:![0-9]+]], !noalias [[META5:![0-9]+]]
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
+; CHECK-NEXT: store i8 1, ptr addrspace(3) getelementptr inbounds ([1 x i8], ptr addrspace(3) @llvm.amdgcn.asan.k0.lds, i64 32), align 1, !alias.scope [[META5]], !noalias [[META2]]
+; CHECK-NEXT: store i8 2, ptr addrspace(3) @llvm.amdgcn.asan.k0.lds, align 4, !alias.scope [[META2]], !noalias [[META5]]
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr addrspace(3) @llvm.amdgcn.asan.k0.lds, align 8
+; CHECK-NEXT: call void @free(ptr [[TMP8]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds) ], !alias.scope !2, !noalias !5
+ call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
+ store i8 1, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1), align 1, !alias.scope !5, !noalias !2
+ store i8 2, ptr addrspace(3) @llvm.amdgcn.kernel.k0.lds, align 4, !alias.scope !2, !noalias !5
+ ret void
+}
+
+; Function Attrs: sanitize_address
+define amdgpu_kernel void @k1() #1 {
+; CHECK-LABEL: define amdgpu_kernel void @k1(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: WId:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]]
+; CHECK: Malloc:
+; CHECK-NEXT: [[TMP6:%.*]] = call ptr @malloc(i64 32)
+; CHECK-NEXT: store ptr [[TMP6]], ptr addrspace(3) @llvm.amdgcn.asan.k1.lds, align 8
+; CHECK-NEXT: br label [[TMP7]]
+; CHECK: 7:
+; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ]
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.asan.k1.lds) ]
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
+; CHECK-NEXT: store i8 3, ptr addrspace(3) @llvm.amdgcn.asan.k1.lds, align 4
+; CHECK-NEXT: br label [[CONDFREE:%.*]]
+; CHECK: CondFree:
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]]
+; CHECK: Free:
+; CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr addrspace(3) @llvm.amdgcn.asan.k1.lds, align 8
+; CHECK-NEXT: call void @free(ptr [[TMP8]])
+; CHECK-NEXT: br label [[END]]
+; CHECK: End:
+; CHECK-NEXT: ret void
+;
+ call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds) ]
+ call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
+ store i8 3, ptr addrspace(3) @llvm.amdgcn.kernel.k1.lds, align 4
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare void @llvm.donothing() #2
+
+attributes #0 = { sanitize_address "amdgpu-lds-size"="24" }
+attributes #1 = { sanitize_address "amdgpu-lds-size"="23" }
+attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) }
+
+!0 = !{i32 0, i32 1}
+!1 = !{i32 20, i32 21}
+!2 = !{!3}
+!3 = distinct !{!3, !4}
+!4 = distinct !{!4}
+!5 = !{!6}
+!6 = distinct !{!6, !4}
+