[llvm] [llvm][ctx_profile] Add instrumentation lowering (PR #90821)
Mircea Trofin via llvm-commits
llvm-commits at lists.llvm.org
Wed May 1 22:44:28 PDT 2024
https://github.com/mtrofin updated https://github.com/llvm/llvm-project/pull/90821
>From 8aaef96563bfc5812ec6317b9d5d1cda52bef80d Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin at google.com>
Date: Wed, 1 May 2024 22:31:17 -0700
Subject: [PATCH 1/2] [llvm][ctx_profile] Add instrumentation lowering
This adds the instrumentation lowering pass.
(Tracking Issue: #89287, RFC referenced there)
---
.../Instrumentation/PGOCtxProfLowering.h | 5 +-
llvm/lib/Passes/PassBuilder.cpp | 1 +
llvm/lib/Passes/PassBuilderPipelines.cpp | 5 +
llvm/lib/Passes/PassRegistry.def | 1 +
.../Instrumentation/PGOCtxProfLowering.cpp | 301 ++++++++++++++++++
.../PGOProfile/ctx-instrumentation.ll | 161 ++++++++++
6 files changed, 473 insertions(+), 1 deletion(-)
diff --git a/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h b/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h
index 38afa0c6fd3294..5256aff56205ba 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h
@@ -12,13 +12,16 @@
#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_PGOCTXPROFLOWERING_H
#define LLVM_TRANSFORMS_INSTRUMENTATION_PGOCTXPROFLOWERING_H
+#include "llvm/IR/PassManager.h"
namespace llvm {
class Type;
-class PGOCtxProfLoweringPass {
+class PGOCtxProfLoweringPass : public PassInfoMixin<PGOCtxProfLoweringPass> {
public:
explicit PGOCtxProfLoweringPass() = default;
static bool isContextualIRPGOEnabled();
+
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
};
} // namespace llvm
#endif
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 30d3e7a1ec05b8..22fd2aef4ea684 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -175,6 +175,7 @@
#include "llvm/Transforms/Instrumentation/LowerAllowCheckPass.h"
#include "llvm/Transforms/Instrumentation/MemProfiler.h"
#include "llvm/Transforms/Instrumentation/MemorySanitizer.h"
+#include "llvm/Transforms/Instrumentation/PGOCtxProfLowering.h"
#include "llvm/Transforms/Instrumentation/PGOForceFunctionAttrs.h"
#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
#include "llvm/Transforms/Instrumentation/PoisonChecking.h"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 100889c0845bc3..1d7f0510450c95 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -74,6 +74,7 @@
#include "llvm/Transforms/Instrumentation/InstrOrderFile.h"
#include "llvm/Transforms/Instrumentation/InstrProfiling.h"
#include "llvm/Transforms/Instrumentation/MemProfiler.h"
+#include "llvm/Transforms/Instrumentation/PGOCtxProfLowering.h"
#include "llvm/Transforms/Instrumentation/PGOForceFunctionAttrs.h"
#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
#include "llvm/Transforms/Scalar/ADCE.h"
@@ -834,6 +835,10 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM,
PTO.EagerlyInvalidateAnalyses));
}
+ if (PGOCtxProfLoweringPass::isContextualIRPGOEnabled()) {
+ MPM.addPass(PGOCtxProfLoweringPass());
+ return;
+ }
// Add the profile lowering pass.
InstrProfOptions Options;
if (!ProfileFile.empty())
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 9b670e4e3a44bb..8f79601d0351cf 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -77,6 +77,7 @@ MODULE_PASS("inliner-wrapper-no-mandatory-first",
MODULE_PASS("insert-gcov-profiling", GCOVProfilerPass())
MODULE_PASS("instrorderfile", InstrOrderFilePass())
MODULE_PASS("instrprof", InstrProfilingLoweringPass())
+MODULE_PASS("pgo-ctx-instr-lower", PGOCtxProfLoweringPass())
MODULE_PASS("internalize", InternalizePass())
MODULE_PASS("invalidate<all>", InvalidateAllAnalysesPass())
MODULE_PASS("iroutliner", IROutlinerPass())
diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
index 9d6dd5ccb38b8d..7442d8010ab0d3 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
@@ -8,10 +8,19 @@
//
#include "llvm/Transforms/Instrumentation/PGOCtxProfLowering.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/Support/CommandLine.h"
+#include <utility>
using namespace llvm;
+#define DEBUG_TYPE "ctx-profile-lower"
+
static cl::list<std::string> ContextRoots(
"profile-context-root", cl::Hidden,
cl::desc(
@@ -22,3 +31,295 @@ static cl::list<std::string> ContextRoots(
bool PGOCtxProfLoweringPass::isContextualIRPGOEnabled() {
return !ContextRoots.empty();
}
+
+// Names of the symbols (functions and TLS variables) expected from
+// compiler-rt. They are grouped in a namespace for readability.
+namespace CompilerRtAPINames {
+static auto StartCtx = "__llvm_ctx_profile_start_context";
+static auto ReleaseCtx = "__llvm_ctx_profile_release_context";
+static auto GetCtx = "__llvm_ctx_profile_get_context";
+static auto ExpectedCalleeTLS = "__llvm_ctx_profile_expected_callee";
+static auto CallsiteTLS = "__llvm_ctx_profile_callsite";
+} // namespace CompilerRtAPINames
+
+namespace {
+// Per-module state for the lowering, plus the per-function lowering logic.
+class CtxInstrumentationLowerer final {
+ Module &M;
+ ModuleAnalysisManager &MAM;
+ Type *ContextNodeTy = nullptr; // header shared by all context objects
+ Type *ContextRootTy = nullptr; // type of the per-entrypoint root globals
+
+ DenseMap<const Function *, Constant *> ContextRootMap; // entrypoint -> its "<name>_ctx_root" global
+ Function *StartCtx = nullptr; // called on entry to an entrypoint
+ Function *GetCtx = nullptr; // called on entry to any other instrumented function
+ Function *ReleaseCtx = nullptr; // called before each `ret` of an entrypoint
+ GlobalVariable *ExpectedCalleeTLS = nullptr; // TLS written with the callee before a callsite
+ GlobalVariable *CallsiteInfoTLS = nullptr; // TLS written with the callsite's sub-context slot
+
+public:
+ CtxInstrumentationLowerer(Module &M, ModuleAnalysisManager &MAM);
+ void lowerFunction(Function &F);
+};
+
+// Scans F for instrumentation intrinsics and returns the pair
+// {number of counters, number of callsites} they advertise. Each value is
+// read from the first increment (respectively callsite) intrinsic found, and
+// is 0 if F contains no intrinsic of that kind.
+std::pair<uint32_t, uint32_t> getNrCountersAndCallsites(const Function &F) {
+  uint32_t NrCounters = 0;
+  uint32_t NrCallsites = 0;
+  for (const auto &BB : F) {
+    for (const auto &I : BB) {
+      if (const auto *Incr = dyn_cast<InstrProfIncrementInst>(&I)) {
+        if (!NrCounters)
+          NrCounters =
+              static_cast<uint32_t>(Incr->getNumCounters()->getZExtValue());
+      } else if (const auto *CSIntr = dyn_cast<InstrProfCallsite>(&I)) {
+        if (!NrCallsites)
+          NrCallsites =
+              static_cast<uint32_t>(CSIntr->getNumCounters()->getZExtValue());
+      }
+      // Both values are known: no need to look at the rest of the function.
+      if (NrCounters && NrCallsites)
+        return {NrCounters, NrCallsites};
+    }
+  }
+  // A function may have counters but no callsites. Report whatever was found
+  // rather than {0, 0}: the caller sizes the context's counter array from
+  // NrCounters and would otherwise index into a zero-length array.
+  return {NrCounters, NrCallsites};
+}
+} // namespace
+
+// Set up the tie-in with compiler-rt.
+// NOTE: the types and function signatures declared here have to match
+// compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
+CtxInstrumentationLowerer::CtxInstrumentationLowerer(Module &M,
+ ModuleAnalysisManager &MAM)
+ : M(M), MAM(MAM) {
+ auto *PointerTy = PointerType::get(M.getContext(), 0);
+ auto *SanitizerMutexType = Type::getInt8Ty(M.getContext());
+ auto *I32Ty = Type::getInt32Ty(M.getContext());
+ auto *I64Ty = Type::getInt64Ty(M.getContext());
+
+ // The ContextRoot type
+ ContextRootTy =
+ StructType::get(M.getContext(), {
+ PointerTy, /*FirstNode*/
+ PointerTy, /*FirstMemBlock*/
+ PointerTy, /*CurrentMem*/
+ SanitizerMutexType, /*Taken*/
+ });
+ // The Context header.
+ ContextNodeTy = StructType::get(M.getContext(), {
+ I64Ty, /*Guid*/
+ PointerTy, /*Next*/
+ I32Ty, /*NrCounters*/
+ I32Ty, /*NrCallsites*/
+ });
+
+ // Define a global for each entrypoint. We'll reuse the entrypoint's name as
+ // prefix. We assume the entrypoint names to be unique.
+ for (const auto &Fname : ContextRoots) {
+ if (const auto *F = M.getFunction(Fname)) {
+ if (F->isDeclaration())
+ continue;
+ auto *G = M.getOrInsertGlobal(Fname + "_ctx_root", ContextRootTy);
+ cast<GlobalVariable>(G)->setInitializer(
+ Constant::getNullValue(ContextRootTy));
+ ContextRootMap.insert(std::make_pair(F, G));
+ }
+ }
+
+ // Declare the functions we will call.
+ StartCtx = cast<Function>(
+ M.getOrInsertFunction(
+ CompilerRtAPINames::StartCtx,
+ FunctionType::get(ContextNodeTy->getPointerTo(),
+ {ContextRootTy->getPointerTo(), /*ContextRoot*/
+ I64Ty, /*Guid*/ I32Ty,
+ /*NrCounters*/ I32Ty /*NrCallsites*/},
+ false))
+ .getCallee());
+ GetCtx = cast<Function>(
+ M.getOrInsertFunction(CompilerRtAPINames::GetCtx,
+ FunctionType::get(ContextNodeTy->getPointerTo(),
+ {PointerTy, /*Callee*/
+ I64Ty, /*Guid*/
+ I32Ty, /*NrCounters*/
+ I32Ty}, /*NrCallsites*/
+ false))
+ .getCallee());
+ ReleaseCtx = cast<Function>(
+ M.getOrInsertFunction(
+ CompilerRtAPINames::ReleaseCtx,
+ FunctionType::get(Type::getVoidTy(M.getContext()),
+ {
+ ContextRootTy->getPointerTo(), /*ContextRoot*/
+ },
+ false))
+ .getCallee());
+
+ // Declare the TLSes we will need to use.
+ CallsiteInfoTLS =
+ new GlobalVariable(M, PointerTy, false, GlobalValue::ExternalLinkage,
+ nullptr, CompilerRtAPINames::CallsiteTLS);
+ CallsiteInfoTLS->setThreadLocal(true);
+ CallsiteInfoTLS->setVisibility(llvm::GlobalValue::HiddenVisibility);
+ ExpectedCalleeTLS =
+ new GlobalVariable(M, PointerTy, false, GlobalValue::ExternalLinkage,
+ nullptr, CompilerRtAPINames::ExpectedCalleeTLS);
+ ExpectedCalleeTLS->setThreadLocal(true);
+ ExpectedCalleeTLS->setVisibility(llvm::GlobalValue::HiddenVisibility);
+}
+
+PreservedAnalyses PGOCtxProfLoweringPass::run(Module &M,
+ ModuleAnalysisManager &MAM) {
+ CtxInstrumentationLowerer Lowerer(M, MAM);
+ for (auto &F : M)
+ Lowerer.lowerFunction(F);
+ return PreservedAnalyses::none();
+}
+
+void CtxInstrumentationLowerer::lowerFunction(Function &F) {
+ if (F.isDeclaration())
+ return;
+ auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+ Value *Guid = nullptr;
+ auto [NrCounters, NrCallsites] = getNrCountersAndCallsites(F);
+
+ Value *Context = nullptr;
+ Value *RealContext = nullptr;
+
+ StructType *ThisContextType = nullptr;
+ Value *TheRootContext = nullptr;
+ Value *ExpectedCalleeTLSAddr = nullptr;
+ Value *CallsiteInfoTLSAddr = nullptr;
+
+ auto &Head = F.getEntryBlock();
+ for (auto &I : Head) {
+ // Find the increment intrinsic in the entry basic block.
+ if (auto *Mark = dyn_cast<InstrProfIncrementInst>(&I)) {
+ assert(Mark->getIndex()->isZero());
+
+ IRBuilder<> Builder(Mark);
+ // FIXME(mtrofin): use InstrProfSymtab::getCanonicalName
+ Guid = Builder.getInt64(F.getGUID());
+ // The type of the context of this function is now knowable since we have
+ // NrCallsites and NrCounters. We declare it here because it's more
+ // convenient - we have the Builder.
+ ThisContextType = StructType::get(
+ F.getContext(),
+ {ContextNodeTy, ArrayType::get(Builder.getInt64Ty(), NrCounters),
+ ArrayType::get(Builder.getPtrTy(), NrCallsites)});
+ // Figure out which way we obtain the context object for this function -
+ // if it's an entrypoint, then we call StartCtx, otherwise GetCtx. In the
+ // former case, we also set TheRootContext since we need it to release it
+ // at the end (plus it can be used to know if we have an entrypoint or a
+ // regular function)
+ auto Iter = ContextRootMap.find(&F);
+ if (Iter != ContextRootMap.end()) {
+ TheRootContext = Iter->second;
+ Context = Builder.CreateCall(StartCtx, {TheRootContext, Guid,
+ Builder.getInt32(NrCounters),
+ Builder.getInt32(NrCallsites)});
+ ORE.emit(
+ [&] { return OptimizationRemark(DEBUG_TYPE, "Entrypoint", &F); });
+ } else {
+ Context =
+ Builder.CreateCall(GetCtx, {&F, Guid, Builder.getInt32(NrCounters),
+ Builder.getInt32(NrCallsites)});
+ ORE.emit([&] {
+ return OptimizationRemark(DEBUG_TYPE, "RegularFunction", &F);
+ });
+ }
+ // The context could be scratch.
+ auto *CtxAsInt = Builder.CreatePtrToInt(Context, Builder.getInt64Ty());
+ if (NrCallsites > 0) {
+ // Figure out which index of the TLS 2-element buffers to use.
+ // Scratch context => we use index == 1. Real contexts => index == 0.
+ auto *Index = Builder.CreateAnd(CtxAsInt, Builder.getInt64(1));
+ // The GEPs corresponding to that index, in the respective TLS.
+ ExpectedCalleeTLSAddr = Builder.CreateGEP(
+ Builder.getInt8Ty()->getPointerTo(),
+ Builder.CreateThreadLocalAddress(ExpectedCalleeTLS), {Index});
+ CallsiteInfoTLSAddr = Builder.CreateGEP(
+ Builder.getInt32Ty(),
+ Builder.CreateThreadLocalAddress(CallsiteInfoTLS), {Index});
+ }
+ // Because the context pointer may have LSB set (to indicate scratch),
+ // clear it for the value we use as base address for the counter vector.
+ // This way, if later we want to have "real" (not clobbered) buffers
+ // acting as scratch, the lowering (at least this part of it that deals
+ // with counters) stays the same.
+ RealContext = Builder.CreateIntToPtr(
+ Builder.CreateAnd(CtxAsInt, Builder.getInt64(-2)),
+ ThisContextType->getPointerTo());
+ I.eraseFromParent();
+ break;
+ }
+ }
+ if (!Context) {
+ ORE.emit([&] {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "Skip", &F)
+ << "Function doesn't have instrumentation, skipping";
+ });
+ return;
+ }
+
+ bool ContextWasReleased = false;
+ for (auto &BB : F) {
+ for (auto &I : llvm::make_early_inc_range(BB)) {
+ if (auto *Instr = dyn_cast<InstrProfCntrInstBase>(&I)) {
+ IRBuilder<> Builder(Instr);
+ switch (Instr->getIntrinsicID()) {
+ case llvm::Intrinsic::instrprof_increment:
+ case llvm::Intrinsic::instrprof_increment_step: {
+ // Increments (or increment-steps) are just a typical load - increment
+ // - store in the RealContext.
+ auto *AsStep = cast<InstrProfIncrementInst>(Instr);
+ auto *GEP = Builder.CreateGEP(
+ ThisContextType, RealContext,
+ {Builder.getInt32(0), Builder.getInt32(1), AsStep->getIndex()});
+ Builder.CreateStore(
+ Builder.CreateAdd(Builder.CreateLoad(Builder.getInt64Ty(), GEP),
+ AsStep->getStep()),
+ GEP);
+ } break;
+ case llvm::Intrinsic::instrprof_callsite:
+ // Callsite lowering: write the called value in the expected callee
+ // TLS. We treat the TLS as volatile because of signal handlers and to
+ // avoid these stores being moved away from the callsite they decorate.
+ auto *CSIntrinsic = dyn_cast<InstrProfCallsite>(Instr);
+ Builder.CreateStore(CSIntrinsic->getCallee(), ExpectedCalleeTLSAddr,
+ true);
+ // write the GEP of the slot in the sub-contexts portion of the
+ // context in TLS. Now, here, we use the actual Context value - as
+ // returned from compiler-rt - which may have the LSB set if the
+ // Context was scratch. Since the header of the context object and
+ // then the values are all 8-aligned (or, really, insofar as we care,
+ // they are even) - if the context is scratch (meaning, an odd value),
+ // so will the GEP. This is important because this is then visible to
+ // compiler-rt which will produce scratch contexts for callers that
+ // have a scratch context.
+ Builder.CreateStore(
+ Builder.CreateGEP(ThisContextType, Context,
+ {Builder.getInt32(0), Builder.getInt32(2),
+ CSIntrinsic->getIndex()}),
+ CallsiteInfoTLSAddr, true);
+ break;
+ }
+ I.eraseFromParent();
+ } else if (TheRootContext && isa<ReturnInst>(I)) {
+ // Remember to release the context if we are an entrypoint.
+ IRBuilder<> Builder(&I);
+ Builder.CreateCall(ReleaseCtx, {TheRootContext});
+ ContextWasReleased = true;
+ }
+ }
+ }
+ // FIXME: This would happen if the entrypoint tailcalls. A way to fix would be
+ // to disallow this, (so this then stays as an error), another is to detect
+ // that and then do a wrapper or disallow the tail call. This only affects
+ // instrumentation, when we want to detect the call graph.
+ if (TheRootContext && !ContextWasReleased)
+ F.getContext().emitError(
+ "[ctx_prof] An entrypoint was instrumented but it has no `ret` "
+ "instructions above which to release the context: " +
+ F.getName());
+}
diff --git a/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll b/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll
index 2ad95ab51cc696..7fa14f6cd30b7c 100644
--- a/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll
+++ b/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll
@@ -1,11 +1,27 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
; RUN: opt -passes=pgo-instr-gen -profile-context-root=an_entrypoint \
; RUN: -S < %s | FileCheck --check-prefix=INSTRUMENT %s
+; RUN: opt -passes=pgo-instr-gen,pgo-ctx-instr-lower -profile-context-root=an_entrypoint \
+; RUN: -profile-context-root=another_entrypoint_no_callees \
+; RUN: -S < %s | FileCheck --check-prefix=LOWERING %s
+
declare void @bar()
;.
; INSTRUMENT: @__profn_foo = private constant [3 x i8] c"foo"
+; INSTRUMENT: @__profn_an_entrypoint = private constant [13 x i8] c"an_entrypoint"
+; INSTRUMENT: @__profn_another_entrypoint_no_callees = private constant [29 x i8] c"another_entrypoint_no_callees"
+; INSTRUMENT: @__profn_simple = private constant [6 x i8] c"simple"
+;.
+; LOWERING: @__profn_foo = private constant [3 x i8] c"foo"
+; LOWERING: @__profn_an_entrypoint = private constant [13 x i8] c"an_entrypoint"
+; LOWERING: @__profn_another_entrypoint_no_callees = private constant [29 x i8] c"another_entrypoint_no_callees"
+; LOWERING: @__profn_simple = private constant [6 x i8] c"simple"
+; LOWERING: @an_entrypoint_ctx_root = global { ptr, ptr, ptr, i8 } zeroinitializer
+; LOWERING: @another_entrypoint_no_callees_ctx_root = global { ptr, ptr, ptr, i8 } zeroinitializer
+; LOWERING: @__llvm_ctx_profile_callsite = external hidden thread_local global ptr
+; LOWERING: @__llvm_ctx_profile_expected_callee = external hidden thread_local global ptr
;.
define void @foo(i32 %a, ptr %fct) {
; INSTRUMENT-LABEL: define void @foo(
@@ -24,6 +40,38 @@ define void @foo(i32 %a, ptr %fct) {
; INSTRUMENT-NEXT: br label [[EXIT]]
; INSTRUMENT: exit:
; INSTRUMENT-NEXT: ret void
+;
+; LOWERING-LABEL: define void @foo(
+; LOWERING-SAME: i32 [[A:%.*]], ptr [[FCT:%.*]]) {
+; LOWERING-NEXT: [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @foo, i64 6699318081062747564, i32 2, i32 2)
+; LOWERING-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; LOWERING-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], 1
+; LOWERING-NEXT: [[TMP4:%.*]] = call ptr @llvm.threadlocal.address.p0(ptr @__llvm_ctx_profile_expected_callee)
+; LOWERING-NEXT: [[TMP5:%.*]] = getelementptr ptr, ptr [[TMP4]], i64 [[TMP3]]
+; LOWERING-NEXT: [[TMP6:%.*]] = call ptr @llvm.threadlocal.address.p0(ptr @__llvm_ctx_profile_callsite)
+; LOWERING-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i64 [[TMP3]]
+; LOWERING-NEXT: [[TMP8:%.*]] = and i64 [[TMP2]], -2
+; LOWERING-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; LOWERING-NEXT: [[T:%.*]] = icmp eq i32 [[A]], 0
+; LOWERING-NEXT: br i1 [[T]], label [[YES:%.*]], label [[NO:%.*]]
+; LOWERING: yes:
+; LOWERING-NEXT: [[TMP10:%.*]] = getelementptr { { i64, ptr, i32, i32 }, [2 x i64], [2 x ptr] }, ptr [[TMP9]], i32 0, i32 1, i32 1
+; LOWERING-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP10]], align 4
+; LOWERING-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 1
+; LOWERING-NEXT: store i64 [[TMP12]], ptr [[TMP10]], align 4
+; LOWERING-NEXT: store volatile ptr [[FCT]], ptr [[TMP5]], align 8
+; LOWERING-NEXT: [[TMP13:%.*]] = getelementptr { { i64, ptr, i32, i32 }, [2 x i64], [2 x ptr] }, ptr [[TMP1]], i32 0, i32 2, i32 0
+; LOWERING-NEXT: store volatile ptr [[TMP13]], ptr [[TMP7]], align 8
+; LOWERING-NEXT: call void [[FCT]](i32 [[A]])
+; LOWERING-NEXT: br label [[EXIT:%.*]]
+; LOWERING: no:
+; LOWERING-NEXT: store volatile ptr @bar, ptr [[TMP5]], align 8
+; LOWERING-NEXT: [[TMP14:%.*]] = getelementptr { { i64, ptr, i32, i32 }, [2 x i64], [2 x ptr] }, ptr [[TMP1]], i32 0, i32 2, i32 1
+; LOWERING-NEXT: store volatile ptr [[TMP14]], ptr [[TMP7]], align 8
+; LOWERING-NEXT: call void @bar()
+; LOWERING-NEXT: br label [[EXIT]]
+; LOWERING: exit:
+; LOWERING-NEXT: ret void
;
%t = icmp eq i32 %a, 0
br i1 %t, label %yes, label %no
@@ -36,6 +84,119 @@ no:
exit:
ret void
}
+
+define void @an_entrypoint(i32 %a) {
+; INSTRUMENT-LABEL: define void @an_entrypoint(
+; INSTRUMENT-SAME: i32 [[A:%.*]]) {
+; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @__profn_an_entrypoint, i64 784007058953177093, i32 2, i32 0)
+; INSTRUMENT-NEXT: [[T:%.*]] = icmp eq i32 [[A]], 0
+; INSTRUMENT-NEXT: br i1 [[T]], label [[YES:%.*]], label [[NO:%.*]]
+; INSTRUMENT: yes:
+; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @__profn_an_entrypoint, i64 784007058953177093, i32 2, i32 1)
+; INSTRUMENT-NEXT: call void @llvm.instrprof.callsite(ptr @__profn_an_entrypoint, i64 784007058953177093, i32 1, i32 0, ptr @foo)
+; INSTRUMENT-NEXT: call void @foo(i32 1, ptr null)
+; INSTRUMENT-NEXT: ret void
+; INSTRUMENT: no:
+; INSTRUMENT-NEXT: ret void
+;
+; LOWERING-LABEL: define void @an_entrypoint(
+; LOWERING-SAME: i32 [[A:%.*]]) {
+; LOWERING-NEXT: [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_start_context(ptr @an_entrypoint_ctx_root, i64 4909520559318251808, i32 2, i32 1)
+; LOWERING-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; LOWERING-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], 1
+; LOWERING-NEXT: [[TMP4:%.*]] = call ptr @llvm.threadlocal.address.p0(ptr @__llvm_ctx_profile_expected_callee)
+; LOWERING-NEXT: [[TMP5:%.*]] = getelementptr ptr, ptr [[TMP4]], i64 [[TMP3]]
+; LOWERING-NEXT: [[TMP6:%.*]] = call ptr @llvm.threadlocal.address.p0(ptr @__llvm_ctx_profile_callsite)
+; LOWERING-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i64 [[TMP3]]
+; LOWERING-NEXT: [[TMP8:%.*]] = and i64 [[TMP2]], -2
+; LOWERING-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; LOWERING-NEXT: [[T:%.*]] = icmp eq i32 [[A]], 0
+; LOWERING-NEXT: br i1 [[T]], label [[YES:%.*]], label [[NO:%.*]]
+; LOWERING: yes:
+; LOWERING-NEXT: [[TMP10:%.*]] = getelementptr { { i64, ptr, i32, i32 }, [2 x i64], [1 x ptr] }, ptr [[TMP9]], i32 0, i32 1, i32 1
+; LOWERING-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP10]], align 4
+; LOWERING-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 1
+; LOWERING-NEXT: store i64 [[TMP12]], ptr [[TMP10]], align 4
+; LOWERING-NEXT: store volatile ptr @foo, ptr [[TMP5]], align 8
+; LOWERING-NEXT: [[TMP13:%.*]] = getelementptr { { i64, ptr, i32, i32 }, [2 x i64], [1 x ptr] }, ptr [[TMP1]], i32 0, i32 2, i32 0
+; LOWERING-NEXT: store volatile ptr [[TMP13]], ptr [[TMP7]], align 8
+; LOWERING-NEXT: call void @foo(i32 1, ptr null)
+; LOWERING-NEXT: call void @__llvm_ctx_profile_release_context(ptr @an_entrypoint_ctx_root)
+; LOWERING-NEXT: ret void
+; LOWERING: no:
+; LOWERING-NEXT: call void @__llvm_ctx_profile_release_context(ptr @an_entrypoint_ctx_root)
+; LOWERING-NEXT: ret void
+;
+ %t = icmp eq i32 %a, 0
+ br i1 %t, label %yes, label %no
+
+yes:
+ call void @foo(i32 1, ptr null)
+ ret void
+no:
+ ret void
+}
+
+define void @another_entrypoint_no_callees(i32 %a) {
+; INSTRUMENT-LABEL: define void @another_entrypoint_no_callees(
+; INSTRUMENT-SAME: i32 [[A:%.*]]) {
+; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @__profn_another_entrypoint_no_callees, i64 784007058953177093, i32 2, i32 0)
+; INSTRUMENT-NEXT: [[T:%.*]] = icmp eq i32 [[A]], 0
+; INSTRUMENT-NEXT: br i1 [[T]], label [[YES:%.*]], label [[NO:%.*]]
+; INSTRUMENT: yes:
+; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @__profn_another_entrypoint_no_callees, i64 784007058953177093, i32 2, i32 1)
+; INSTRUMENT-NEXT: ret void
+; INSTRUMENT: no:
+; INSTRUMENT-NEXT: ret void
+;
+; LOWERING-LABEL: define void @another_entrypoint_no_callees(
+; LOWERING-SAME: i32 [[A:%.*]]) {
+; LOWERING-NEXT: [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_start_context(ptr @another_entrypoint_no_callees_ctx_root, i64 -6371873725078000974, i32 0, i32 0)
+; LOWERING-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; LOWERING-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], -2
+; LOWERING-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; LOWERING-NEXT: [[T:%.*]] = icmp eq i32 [[A]], 0
+; LOWERING-NEXT: br i1 [[T]], label [[YES:%.*]], label [[NO:%.*]]
+; LOWERING: yes:
+; LOWERING-NEXT: [[TMP5:%.*]] = getelementptr { { i64, ptr, i32, i32 }, [0 x i64], [0 x ptr] }, ptr [[TMP4]], i32 0, i32 1, i32 1
+; LOWERING-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 4
+; LOWERING-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 1
+; LOWERING-NEXT: store i64 [[TMP7]], ptr [[TMP5]], align 4
+; LOWERING-NEXT: call void @__llvm_ctx_profile_release_context(ptr @another_entrypoint_no_callees_ctx_root)
+; LOWERING-NEXT: ret void
+; LOWERING: no:
+; LOWERING-NEXT: call void @__llvm_ctx_profile_release_context(ptr @another_entrypoint_no_callees_ctx_root)
+; LOWERING-NEXT: ret void
+;
+ %t = icmp eq i32 %a, 0
+ br i1 %t, label %yes, label %no
+
+yes:
+ ret void
+no:
+ ret void
+}
+
+define void @simple(i32 %a) {
+; INSTRUMENT-LABEL: define void @simple(
+; INSTRUMENT-SAME: i32 [[A:%.*]]) {
+; INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @__profn_simple, i64 742261418966908927, i32 1, i32 0)
+; INSTRUMENT-NEXT: ret void
+;
+; LOWERING-LABEL: define void @simple(
+; LOWERING-SAME: i32 [[A:%.*]]) {
+; LOWERING-NEXT: [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @simple, i64 -3006003237940970099, i32 0, i32 0)
+; LOWERING-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
+; LOWERING-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], -2
+; LOWERING-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; LOWERING-NEXT: ret void
+;
+ ret void
+}
+
;.
; INSTRUMENT: attributes #[[ATTR0:[0-9]+]] = { nounwind }
;.
+; LOWERING: attributes #[[ATTR0:[0-9]+]] = { nounwind }
+; LOWERING: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+;.
>From ecaad76f812aefcefe930e0b3a32dc6dd15611eb Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin at google.com>
Date: Wed, 1 May 2024 22:43:53 -0700
Subject: [PATCH 2/2] don't report changes if none happened.
---
.../Instrumentation/PGOCtxProfLowering.cpp | 16 ++++++++++------
1 file changed, 10 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
index 7442d8010ab0d3..b3b0197a775cf9 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
@@ -9,6 +9,7 @@
#include "llvm/Transforms/Instrumentation/PGOCtxProfLowering.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/IR/Analysis.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
@@ -59,7 +60,8 @@ class CtxInstrumentationLowerer final {
public:
CtxInstrumentationLowerer(Module &M, ModuleAnalysisManager &MAM);
- void lowerFunction(Function &F);
+ // return true if lowering happened (i.e. a change was made)
+ bool lowerFunction(Function &F);
};
std::pair<uint32_t, uint32_t> getNrCountersAndCallsites(const Function &F) {
@@ -169,14 +171,15 @@ CtxInstrumentationLowerer::CtxInstrumentationLowerer(Module &M,
PreservedAnalyses PGOCtxProfLoweringPass::run(Module &M,
ModuleAnalysisManager &MAM) {
CtxInstrumentationLowerer Lowerer(M, MAM);
+ bool Changed = false;
for (auto &F : M)
- Lowerer.lowerFunction(F);
- return PreservedAnalyses::none();
+ Changed |= Lowerer.lowerFunction(F);
+ return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
-void CtxInstrumentationLowerer::lowerFunction(Function &F) {
+bool CtxInstrumentationLowerer::lowerFunction(Function &F) {
if (F.isDeclaration())
- return;
+ return false;
auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
@@ -259,7 +262,7 @@ void CtxInstrumentationLowerer::lowerFunction(Function &F) {
return OptimizationRemarkMissed(DEBUG_TYPE, "Skip", &F)
<< "Function doesn't have instrumentation, skipping";
});
- return;
+ return false;
}
bool ContextWasReleased = false;
@@ -322,4 +325,5 @@ void CtxInstrumentationLowerer::lowerFunction(Function &F) {
"[ctx_prof] An entrypoint was instrumented but it has no `ret` "
"instructions above which to release the context: " +
F.getName());
+ return true;
}
More information about the llvm-commits
mailing list