[llvm] [MemProf] Support cloning for indirect calls with ThinLTO (PR #110625)
Teresa Johnson via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 10 20:15:36 PDT 2024
https://github.com/teresajohnson updated https://github.com/llvm/llvm-project/pull/110625
>From ba5c45f2d0c9cd8dbdb5b47339cac7ad78b5c6b4 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson at google.com>
Date: Fri, 27 Sep 2024 18:40:28 -0700
Subject: [PATCH 1/6] [MemProf] Support cloning for indirect calls with ThinLTO
This patch enables support for cloning in indirect callsites.
This is done by synthesizing callsite records for each virtual call
target from the profile metadata. In the thin link all the synthesized
records for a particular indirect callsite initially share the same
context node, but support is added to partition the callsites and
outgoing edges based on the callee function, creating a separate node
for each target.
In the LTO backend, when cloning is needed we first perform indirect
call promotion, then change the target of the new direct call to the
desired clone.
Note this is ThinLTO-specific, since for regular LTO indirect call
promotion should have already occurred.
---
llvm/include/llvm/IR/ModuleSummaryIndex.h | 5 +
.../IPO/MemProfContextDisambiguation.h | 7 +-
llvm/lib/Analysis/ModuleSummaryAnalysis.cpp | 59 ++-
llvm/lib/IR/AsmWriter.cpp | 4 +-
llvm/lib/Passes/PassBuilderPipelines.cpp | 7 +-
.../IPO/MemProfContextDisambiguation.cpp | 497 +++++++++++++++++-
llvm/test/ThinLTO/X86/memprof-icp.ll | 309 +++++++++++
7 files changed, 842 insertions(+), 46 deletions(-)
create mode 100644 llvm/test/ThinLTO/X86/memprof-icp.ll
diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h
index 288aba16702f56..1cfe7c15f97dbc 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndex.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h
@@ -200,7 +200,12 @@ struct ValueInfo {
return getRef()->second.SummaryList;
}
+ // Even if the index is built with GVs available, we may not have one for
+ // summary entries synthesized for profiled indirect call targets.
+ bool hasName() const { return !haveGVs() || getValue(); }
+
StringRef name() const {
+ assert(!haveGVs() || getRef()->second.U.GV);
return haveGVs() ? getRef()->second.U.GV->getName()
: getRef()->second.U.Name;
}
diff --git a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
index f5dce01c34e7aa..fa067ad62ed1b3 100644
--- a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
+++ b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
@@ -43,8 +43,13 @@ class MemProfContextDisambiguation
// ThinLTO backend via opt (to simulate distributed ThinLTO).
std::unique_ptr<ModuleSummaryIndex> ImportSummaryForTesting;
+ // Whether we are building with SamplePGO. This is needed for correctly
+ // updating profile metadata on speculatively promoted calls.
+ bool SamplePGO;
+
public:
- MemProfContextDisambiguation(const ModuleSummaryIndex *Summary = nullptr);
+ MemProfContextDisambiguation(const ModuleSummaryIndex *Summary = nullptr,
+ bool SamplePGO = false);
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
index 2d4961dade9db1..1bd9ee651d2b0b 100644
--- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -81,6 +81,11 @@ static cl::opt<std::string> ModuleSummaryDotFile(
"module-summary-dot-file", cl::Hidden, cl::value_desc("filename"),
cl::desc("File to emit dot graph of new summary into"));
+static cl::opt<bool> EnableMemProfIndirectCallSupport(
+ "enable-memprof-indirect-call-support", cl::init(true), cl::Hidden,
+ cl::desc(
+ "Enable MemProf support for summarizing and cloning indirect calls"));
+
extern cl::opt<bool> ScalePartialSampleProfileWorkingSetSize;
extern cl::opt<unsigned> MaxNumVTableAnnotations;
@@ -404,6 +409,11 @@ static void computeFunctionSummary(
if (HasLocalsInUsedOrAsm && CI && CI->isInlineAsm())
HasInlineAsmMaybeReferencingInternal = true;
+ // Compute this once per indirect call.
+ uint32_t NumCandidates = 0;
+ uint64_t TotalCount = 0;
+ MutableArrayRef<InstrProfValueData> CandidateProfileData;
+
auto *CalledValue = CB->getCalledOperand();
auto *CalledFunction = CB->getCalledFunction();
if (CalledValue && !CalledFunction) {
@@ -481,9 +491,7 @@ static void computeFunctionSummary(
}
}
- uint32_t NumCandidates;
- uint64_t TotalCount;
- auto CandidateProfileData =
+ CandidateProfileData =
ICallAnalysis.getPromotionCandidatesForInstruction(&I, TotalCount,
NumCandidates);
for (const auto &Candidate : CandidateProfileData)
@@ -495,16 +503,6 @@ static void computeFunctionSummary(
if (!IsThinLTO)
continue;
- // TODO: Skip indirect calls for now. Need to handle these better, likely
- // by creating multiple Callsites, one per target, then speculatively
- // devirtualize while applying clone info in the ThinLTO backends. This
- // will also be important because we will have a different set of clone
- // versions per target. This handling needs to match that in the ThinLTO
- // backend so we handle things consistently for matching of callsite
- // summaries to instructions.
- if (!CalledFunction)
- continue;
-
// Ensure we keep this analysis in sync with the handling in the ThinLTO
// backend (see MemProfContextDisambiguation::applyImport). Save this call
// so that we can skip it in checking the reverse case later.
@@ -555,13 +553,24 @@ static void computeFunctionSummary(
SmallVector<unsigned> StackIdIndices;
for (auto StackId : InstCallsite)
StackIdIndices.push_back(Index.addOrGetStackIdIndex(StackId));
- // Use the original CalledValue, in case it was an alias. We want
- // to record the call edge to the alias in that case. Eventually
- // an alias summary will be created to associate the alias and
- // aliasee.
- auto CalleeValueInfo =
- Index.getOrInsertValueInfo(cast<GlobalValue>(CalledValue));
- Callsites.push_back({CalleeValueInfo, StackIdIndices});
+ if (CalledFunction) {
+ // Use the original CalledValue, in case it was an alias. We want
+ // to record the call edge to the alias in that case. Eventually
+ // an alias summary will be created to associate the alias and
+ // aliasee.
+ auto CalleeValueInfo =
+ Index.getOrInsertValueInfo(cast<GlobalValue>(CalledValue));
+ Callsites.push_back({CalleeValueInfo, StackIdIndices});
+ } else if (EnableMemProfIndirectCallSupport) {
+ // For indirect callsites, create multiple Callsites, one per target.
+ // This enables having a different set of clone versions per target,
+ // and we will apply the cloning decisions while speculatively
+ // devirtualizing in the ThinLTO backends.
+ for (const auto &Candidate : CandidateProfileData) {
+ auto CalleeValueInfo = Index.getOrInsertValueInfo(Candidate.Value);
+ Callsites.push_back({CalleeValueInfo, StackIdIndices});
+ }
+ }
}
}
}
@@ -1214,9 +1223,13 @@ bool llvm::mayHaveMemprofSummary(const CallBase *CB) {
if (CI && CalledFunction->isIntrinsic())
return false;
} else {
- // TODO: For now skip indirect calls. See comments in
- // computeFunctionSummary for what is needed to handle this.
- return false;
+ // Skip inline assembly calls.
+ if (CI && CI->isInlineAsm())
+ return false;
+ // Skip direct calls via Constant.
+ if (!CalledValue || isa<Constant>(CalledValue))
+ return false;
+ return true;
}
return true;
}
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 70e3af941bf77b..f07cfcaa0124da 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -3613,7 +3613,7 @@ void AssemblyWriter::printSummary(const GlobalValueSummary &Summary) {
void AssemblyWriter::printSummaryInfo(unsigned Slot, const ValueInfo &VI) {
Out << "^" << Slot << " = gv: (";
- if (!VI.name().empty())
+ if (VI.hasName() && !VI.name().empty())
Out << "name: \"" << VI.name() << "\"";
else
Out << "guid: " << VI.getGUID();
@@ -3627,7 +3627,7 @@ void AssemblyWriter::printSummaryInfo(unsigned Slot, const ValueInfo &VI) {
Out << ")";
}
Out << ")";
- if (!VI.name().empty())
+ if (VI.hasName() && !VI.name().empty())
Out << " ; guid = " << VI.getGUID();
Out << "\n";
}
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 8f151a99b11709..916967f8ee02b9 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1714,7 +1714,8 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline(
// For ThinLTO we must apply the context disambiguation decisions early, to
// ensure we can correctly match the callsites to summary data.
if (EnableMemProfContextDisambiguation)
- MPM.addPass(MemProfContextDisambiguation(ImportSummary));
+ MPM.addPass(MemProfContextDisambiguation(
+ ImportSummary, PGOOpt && PGOOpt->Action == PGOOptions::SampleUse));
// These passes import type identifier resolutions for whole-program
// devirtualization and CFI. They must run early because other passes may
@@ -1927,7 +1928,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
// amount of additional cloning required to distinguish the allocation
// contexts.
if (EnableMemProfContextDisambiguation)
- MPM.addPass(MemProfContextDisambiguation());
+ MPM.addPass(MemProfContextDisambiguation(
+ /*Summary=*/nullptr,
+ PGOOpt && PGOOpt->Action == PGOOptions::SampleUse));
// Optimize globals again after we ran the inliner.
MPM.addPass(GlobalOptPass());
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 5e7d2c3c713d32..84e177355003d4 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -29,6 +29,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
@@ -43,6 +44,8 @@
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/Instrumentation.h"
+#include "llvm/Transforms/Utils/CallPromotionUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <deque>
#include <sstream>
@@ -449,9 +452,10 @@ class CallsiteContextGraph {
}
};
- /// Helpers to remove callee edges that have allocation type None (due to not
+ /// Helpers to remove edges that have allocation type None (due to not
/// carrying any context ids) after transformations.
void removeNoneTypeCalleeEdges(ContextNode *Node);
+ void removeNoneTypeCallerEdges(ContextNode *Node);
void
recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
DenseSet<const ContextNode *> &Visited);
@@ -483,6 +487,14 @@ class CallsiteContextGraph {
/// multiple different callee target functions.
void handleCallsitesWithMultipleTargets();
+ // Try to partition calls on the given node (already placed into the AllCalls
+ // array) by callee function, creating new copies of Node as needed to hold
+ // calls with different callees, and moving the callee edges appropriately.
+ // Returns true if partitioning was successful.
+ bool partitionCallsByCallee(
+ ContextNode *Node, ArrayRef<CallInfo> AllCalls,
+ std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
+
/// Save lists of calls with MemProf metadata in each function, for faster
/// iteration.
MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
@@ -563,6 +575,12 @@ class CallsiteContextGraph {
calleesMatch(CallTy Call, EdgeIter &EI,
MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
+ // Return the callee function of the given call, or nullptr if it can't be
+ // determined
+ const FuncTy *getCalleeFunc(CallTy Call) {
+ return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
+ }
+
/// Returns true if the given call targets the given function, or if we were
/// able to identify the call chain through intermediate tail calls (in which
/// case FoundCalleeChain will be populated).
@@ -670,6 +688,14 @@ class CallsiteContextGraph {
bool NewClone = false,
DenseSet<uint32_t> ContextIdsToMove = {});
+ /// Change the caller of the edge at the given callee edge iterator to be
+ /// NewCaller, performing the necessary context id and allocation type
+ /// updates. The iterator is updated as the edge is removed from the list of
+ /// callee edges in the original caller. This is similar to the above
+ /// moveEdgeToExistingCalleeClone, but a simplified version of it as we always
+ /// move the given edge and all of its context ids.
+ void moveCalleeEdgeToNewCaller(EdgeIter &CalleeEdgeI, ContextNode *NewCaller);
+
/// Recursively perform cloning on the graph for the given Node and its
/// callers, in order to uniquely identify the allocation behavior of an
/// allocation given its context. The context ids of the allocation being
@@ -731,6 +757,7 @@ class ModuleCallsiteContextGraph
Instruction *>;
uint64_t getStackId(uint64_t IdOrIndex) const;
+ const Function *getCalleeFunc(Instruction *Call);
bool calleeMatchesFunc(
Instruction *Call, const Function *Func, const Function *CallerFunc,
std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
@@ -808,6 +835,7 @@ class IndexCallsiteContextGraph
IndexCall>;
uint64_t getStackId(uint64_t IdOrIndex) const;
+ const FunctionSummary *getCalleeFunc(IndexCall &Call);
bool calleeMatchesFunc(
IndexCall &Call, const FunctionSummary *Func,
const FunctionSummary *CallerFunc,
@@ -1003,6 +1031,20 @@ void CallsiteContextGraph<
}
}
+template <typename DerivedCCG, typename FuncTy, typename CallTy>
+void CallsiteContextGraph<
+ DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
+ for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
+ auto Edge = *EI;
+ if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
+ assert(Edge->ContextIds.empty());
+ Edge->Caller->eraseCalleeEdge(Edge.get());
+ EI = Node->CallerEdges.erase(EI);
+ } else
+ ++EI;
+ }
+}
+
template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
@@ -2025,6 +2067,21 @@ void CallsiteContextGraph<DerivedCCG, FuncTy,
AllCalls.push_back(Node->Call);
AllCalls.insert(AllCalls.end(), Node->MatchingCalls.begin(),
Node->MatchingCalls.end());
+
+ // First see if we can partition the calls by callee function, creating new
+ // nodes to host each set of calls calling the same callees. This is
+ // necessary for support indirect calls with ThinLTO, for which we
+ // synthesized CallsiteInfo records for each target. They will all have the
+ // same callsite stack ids and would be sharing a context node at this
+ // point. We need to perform separate cloning for each, which will be
+ // applied along with speculative devirtualization in the ThinLTO backends
+ // as needed. Note this does not currently support looking through tail
+ // calls, it is unclear if we need that for indirect call targets.
+ // First partition calls by callee func. Map indexed by func, value is
+ // struct with list of matching calls, assigned node.
+ if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
+ continue;
+
auto It = AllCalls.begin();
// Iterate through the calls until we find the first that matches.
for (; It != AllCalls.end(); ++It) {
@@ -2098,6 +2155,130 @@ void CallsiteContextGraph<DerivedCCG, FuncTy,
NonAllocationCallToContextNodeMap[Call] = Node;
}
+template <typename DerivedCCG, typename FuncTy, typename CallTy>
+bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
+ ContextNode *Node, ArrayRef<CallInfo> AllCalls,
+ std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
+ // Struct to keep track of all the calls having the same callee function,
+ // and the node we eventually assign to them. Eventually we will record the
+ // context node assigned to this group of calls.
+ struct CallsWithSameCallee {
+ std::vector<CallInfo> Calls;
+ ContextNode *Node = nullptr;
+ };
+
+ // First partition calls by callee function. Build map from each function
+ // to the list of matching calls.
+ DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
+ for (auto ThisCall : AllCalls) {
+ auto *F = getCalleeFunc(ThisCall.call());
+ if (F)
+ CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
+ }
+
+ // Next, walk through all callee edges. For each callee node, get its
+ // containing function and see if it was recorded in the above map (meaning we
+ // have at least one matching call). Build another map from each callee node
+ // with a matching call to the structure instance created above containing all
+ // the calls.
+ DenseMap<ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
+ for (const auto &Edge : Node->CalleeEdges) {
+ if (!Edge->Callee->hasCall())
+ continue;
+ const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
+ if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
+ CalleeNodeToCallInfo[Edge->Callee] =
+ &CalleeFuncToCallInfo[ProfiledCalleeFunc];
+ }
+
+ // If there are entries in the second map, then there were no matching
+ // calls/callees, nothing to do here. Return so we can go to the handling that
+ // looks through tail calls.
+ if (CalleeNodeToCallInfo.empty())
+ return false;
+
+ // Walk through all callee edges again. Any and all callee edges that didn't
+ // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
+ // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
+ // ignored during cloning. If it is in the map, then we use the node recorded
+ // in that entry (creating it if needed), and move the callee edge to it.
+ // The first callee will use the original node instead of creating a new one.
+ // Note that any of the original calls on this node (in AllCalls) that didn't
+ // have a callee function automatically get dropped from the node as part of
+ // this process.
+ ContextNode *UnmatchedCalleesNode = nullptr;
+ // Track whether we already assigned original node to a callee.
+ bool UsedOrigNode = false;
+ assert(NodeToCallingFunc[Node]);
+ for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
+ auto Edge = *EI;
+ if (!Edge->Callee->hasCall()) {
+ ++EI;
+ continue;
+ }
+
+ // Will be updated below to point to whatever (caller) node this callee edge
+ // should be moved to.
+ ContextNode *CallerNodeToUse = nullptr;
+
+ // Handle the case where there were no matching calls first. Move this
+ // callee edge to the UnmatchedCalleesNode, creating it if needed.
+ if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
+ if (!UnmatchedCalleesNode)
+ UnmatchedCalleesNode =
+ createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
+ CallerNodeToUse = UnmatchedCalleesNode;
+ } else {
+ // Look up the information recorded for this callee node, and use the
+ // recorded caller node (creating it if needed).
+ auto *Info = CalleeNodeToCallInfo[Edge->Callee];
+ if (!Info->Node) {
+ // If we haven't assigned any callees to the original node use it.
+ if (!UsedOrigNode) {
+ Info->Node = Node;
+ // Clear the set of matching calls which will be updated below.
+ Node->MatchingCalls.clear();
+ UsedOrigNode = true;
+ } else
+ Info->Node =
+ createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
+ assert(!Info->Calls.empty());
+ // The first call becomes the primary call for this caller node, and the
+ // rest go in the matching calls list.
+ Info->Node->setCall(Info->Calls.front());
+ Info->Node->MatchingCalls.insert(Info->Node->MatchingCalls.end(),
+ Info->Calls.begin() + 1,
+ Info->Calls.end());
+ // Save the primary call to node correspondence so that we can update
+ // the NonAllocationCallToContextNodeMap, which is being iterated in the
+ // caller of this function.
+ NewCallToNode.push_back({Info->Node->Call, Info->Node});
+ }
+ CallerNodeToUse = Info->Node;
+ }
+
+ // Don't need to move edge if we are using the original node;
+ if (CallerNodeToUse == Node) {
+ ++EI;
+ continue;
+ }
+
+ moveCalleeEdgeToNewCaller(EI, CallerNodeToUse);
+ }
+ // Now that we are done moving edges, clean up any caller edges that ended
+ // up with no type or context ids. During moveCalleeEdgeToNewCaller all
+ // caller edges from Node are replicated onto the new callers, and it
+ // simplifies the handling to leave them until we have moved all
+ // edges/context ids.
+ for (auto &I : CalleeNodeToCallInfo)
+ removeNoneTypeCallerEdges(I.second->Node);
+ if (UnmatchedCalleesNode)
+ removeNoneTypeCallerEdges(UnmatchedCalleesNode);
+ removeNoneTypeCallerEdges(Node);
+
+ return true;
+}
+
uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
// In the Module (IR) case this is already the Id.
return IdOrIndex;
@@ -2276,6 +2457,17 @@ bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
return FoundSingleCalleeChain;
}
+const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
+ auto *CB = dyn_cast<CallBase>(Call);
+ if (!CB->getCalledOperand() || CB->isIndirectCall())
+ return nullptr;
+ auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
+ auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
+ if (Alias)
+ return dyn_cast<Function>(Alias->getAliasee());
+ return dyn_cast<Function>(CalleeVal);
+}
+
bool ModuleCallsiteContextGraph::calleeMatchesFunc(
Instruction *Call, const Function *Func, const Function *CallerFunc,
std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
@@ -2411,6 +2603,15 @@ bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
return FoundSingleCalleeChain;
}
+const FunctionSummary *
+IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
+ ValueInfo Callee =
+ dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
+ if (Callee.getSummaryList().empty())
+ return nullptr;
+ return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
+}
+
bool IndexCallsiteContextGraph::calleeMatchesFunc(
IndexCall &Call, const FunctionSummary *Func,
const FunctionSummary *CallerFunc,
@@ -2871,6 +3072,81 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
}
}
+template <typename DerivedCCG, typename FuncTy, typename CallTy>
+void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
+ moveCalleeEdgeToNewCaller(EdgeIter &CalleeEdgeI, ContextNode *NewCaller) {
+ auto Edge = *CalleeEdgeI;
+
+ ContextNode *OldCaller = Edge->Caller;
+
+ // We might already have an edge to the new caller. If one exists we will
+ // reuse it.
+ auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(Edge->Callee);
+
+ CalleeEdgeI = OldCaller->CalleeEdges.erase(CalleeEdgeI);
+ if (ExistingEdgeToNewCaller) {
+ // Since we already have an edge to NewCaller, simply move the ids
+ // onto it, and remove the existing Edge.
+ ExistingEdgeToNewCaller->getContextIds().insert(
+ Edge->getContextIds().begin(), Edge->getContextIds().end());
+ ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
+ Edge->ContextIds.clear();
+ Edge->AllocTypes = (uint8_t)AllocationType::None;
+ Edge->Callee->eraseCallerEdge(Edge.get());
+ } else {
+ // Otherwise just reconnect Edge to NewCaller.
+ Edge->Caller = NewCaller;
+ NewCaller->CalleeEdges.push_back(Edge);
+ // Don't need to update Edge's context ids since we are simply
+ // reconnecting it.
+ }
+ // In either case, need to update the alloc types on New Caller.
+ NewCaller->AllocTypes |= Edge->AllocTypes;
+
+ // Now walk the old caller node's caller edges and move Edge's context ids
+ // over to the corresponding edge into the node (which is created here if
+ // this is a newly created node). We can tell whether this is a newly created
+ // node by seeing if it has any caller edges yet.
+#ifndef NDEBUG
+ bool IsNewNode = NewCaller->CallerEdges.empty();
+#endif
+ for (auto &OldCallerEdge : OldCaller->CallerEdges) {
+ // The context ids moving to the new caller are the subset of this edge's
+ // context ids and the context ids on the callee edge being moved.
+ DenseSet<uint32_t> EdgeContextIdsToMove =
+ set_intersection(OldCallerEdge->getContextIds(), Edge->getContextIds());
+ set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
+ OldCallerEdge->AllocTypes =
+ computeAllocType(OldCallerEdge->getContextIds());
+ // In this function we expect that any pre-existing node already has edges
+ // from the same callers as the old node. That should be true in the current
+ // use case, where we will remove None-type edges after copying over all
+ // caller edges from the callee.
+ assert(IsNewNode || NewCaller->findEdgeFromCaller(OldCallerEdge->Caller));
+ auto NewEdge = std::make_shared<ContextEdge>(
+ NewCaller, OldCallerEdge->Caller,
+ computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
+ NewCaller->CallerEdges.push_back(NewEdge);
+ NewEdge->Caller->CalleeEdges.push_back(NewEdge);
+ }
+ // Recompute the node alloc type now that its caller edges have been
+ // updated (since we will compute from those edges).
+ OldCaller->AllocTypes = OldCaller->computeAllocType();
+ // OldCaller alloc type should be None iff its context id set is now empty.
+ assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
+ OldCaller->emptyContextIds());
+ if (VerifyCCG) {
+ checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
+ checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
+ for (const auto &OldCallerEdge : OldCaller->CallerEdges)
+ checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
+ /*CheckEdges=*/false);
+ for (const auto &NewCallerEdge : NewCaller->CallerEdges)
+ checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
+ /*CheckEdges=*/false);
+ }
+}
+
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
recursivelyRemoveNoneTypeCalleeEdges(
@@ -3792,6 +4068,7 @@ static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
bool MemProfContextDisambiguation::applyImport(Module &M) {
assert(ImportSummary);
bool Changed = false;
+ ICallPromotionAnalysis ICallAnalysis;
auto IsMemProfClone = [](const Function &F) {
return F.getName().contains(MemProfCloneSuffix);
@@ -3808,6 +4085,13 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
FuncToAliasMap[F].insert(&A);
}
+ InstrProfSymtab Symtab;
+ if (Error E = Symtab.create(M, /*InLTO=*/true)) {
+ std::string SymtabFailure = toString(std::move(E));
+ M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
+ return false;
+ }
+
for (auto &F : M) {
if (F.isDeclaration() || IsMemProfClone(F))
continue;
@@ -3844,8 +4128,6 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
// Perform cloning if not yet done.
CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
- // Should have skipped indirect calls via mayHaveMemprofSummary.
- assert(CalledFunction);
assert(!IsMemProfClone(*CalledFunction));
// Update the calls per the summary info.
@@ -3933,6 +4215,16 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
}
+ // Data structures for saving indirect call profile info for use in ICP with
+ // cloning.
+ struct ICallAnalysisData {
+ std::vector<InstrProfValueData> CandidateProfileData;
+ uint32_t NumCandidates;
+ uint64_t TotalCount;
+ unsigned CallsiteInfoStartIndex;
+ };
+ std::map<CallBase *, ICallAnalysisData> ICallAnalysisMap;
+
// Assume for now that the instructions are in the exact same order
// as when the summary was created, but confirm this is correct by
// matching the stack ids.
@@ -4075,23 +4367,79 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
<< ore::NV("Attribute", AllocTypeString));
}
} else if (!CallsiteContext.empty()) {
- // Consult the next callsite node.
- assert(SI != FS->callsites().end());
- auto &StackNode = *(SI++);
-
+ if (!CalledFunction) {
+ // This is an indirect call, see if we have profile information and
+ // whether any clones were recorded for the profiled targets (that
+ // we synthesized CallsiteInfo summary records for when building the
+ // index).
#ifndef NDEBUG
- // Sanity check that the stack ids match between the summary and
- // instruction metadata.
- auto StackIdIndexIter = StackNode.StackIdIndices.begin();
- for (auto StackId : CallsiteContext) {
- assert(StackIdIndexIter != StackNode.StackIdIndices.end());
- assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
- StackId);
- StackIdIndexIter++;
+ // We should have skipped inline assembly calls.
+ auto *CI = dyn_cast<CallInst>(CB);
+ assert(!CI || !CI->isInlineAsm());
+#endif
+ // We should have skipped direct calls via a Constant.
+ assert(CalledValue && !isa<Constant>(CalledValue));
+
+ uint32_t NumCandidates;
+ uint64_t TotalCount;
+ auto CandidateProfileData =
+ ICallAnalysis.getPromotionCandidatesForInstruction(
+ CB, TotalCount, NumCandidates);
+ if (!CandidateProfileData.empty()) {
+ unsigned CallsiteInfoStartStartIndex =
+ static_cast<unsigned int>(SI - FS->callsites().begin());
+ // Iterate past all of the associated callsites nodes and check
+ // them.
+ for (const auto &Candidate : CandidateProfileData) {
+#ifndef NDEBUG
+ auto CalleeValueInfo =
+#endif
+ ImportSummary->getValueInfo(Candidate.Value);
+ // We might not have a ValueInfo if this is a distributed
+ // ThinLTO backend and decided not to import that function.
+ assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
+ assert(SI != FS->callsites().end());
+ auto &StackNode = *(SI++);
+ // See if any of the clones of the indirect callsite for this
+ // profiled target should call a cloned version of the profiled
+ // target. We only need to do the ICP here if so.
+ for (auto CloneNo : StackNode.Clones) {
+ if (!CloneNo)
+ continue;
+ // Save information for ICP, which is performed later to avoid
+ // messing up the current function traversal.
+ ICallAnalysisMap[CB] = {CandidateProfileData.vec(),
+ NumCandidates, TotalCount,
+ CallsiteInfoStartStartIndex};
+ break;
+ }
+ // Perform cloning if not yet done. This is done here in case
+ // we don't need to do ICP, but might need to clone this
+ // function as it is the target of other cloned calls.
+ CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
+ }
+ }
}
+
+ else {
+ // Consult the next callsite node.
+ assert(SI != FS->callsites().end());
+ auto &StackNode = *(SI++);
+
+#ifndef NDEBUG
+ // Sanity check that the stack ids match between the summary and
+ // instruction metadata.
+ auto StackIdIndexIter = StackNode.StackIdIndices.begin();
+ for (auto StackId : CallsiteContext) {
+ assert(StackIdIndexIter != StackNode.StackIdIndices.end());
+ assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
+ StackId);
+ StackIdIndexIter++;
+ }
#endif
- CloneCallsite(StackNode, CB, CalledFunction);
+ CloneCallsite(StackNode, CB, CalledFunction);
+ }
} else if (CB->isTailCall()) {
// Locate the synthesized callsite info for the callee VI, if any was
// created, and use that for cloning.
@@ -4108,6 +4456,119 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
I.setMetadata(LLVMContext::MD_callsite, nullptr);
}
}
+
+ // Now do any promotion required for cloning. Specifically, for each
+ // recorded ICP candidate (which was only recorded because one clone of that
+ // candidate should call a cloned target), we perform ICP (speculative
+ // devirtualization) for each clone of the callsite, and update its callee
+ // to the appropriate clone. Note that the ICP compares against the original
+ // version of the target, which is what is in the vtable.
+ for (auto &ICallInfo : ICallAnalysisMap) {
+ auto *CB = ICallInfo.first;
+ auto &Info = ICallInfo.second;
+ auto CallsiteIndex = Info.CallsiteInfoStartIndex;
+ auto TotalCount = Info.TotalCount;
+ unsigned NumPromoted = 0;
+ unsigned NumClones = 0;
+
+ for (auto &Candidate : Info.CandidateProfileData) {
+ auto &StackNode = FS->callsites()[CallsiteIndex++];
+
+ // All calls in the same function must have the same number of clones.
+ assert(!NumClones || NumClones == StackNode.Clones.size());
+ NumClones = StackNode.Clones.size();
+
+ // See if the target is in the module. If it wasn't imported, it is
+ // possible that this profile could have been collected on a different
+ // target (or version of the code), and we need to be conservative
+ // (similar to what is done in the ICP pass).
+ Function *TargetFunction = Symtab.getFunction(Candidate.Value);
+ if (TargetFunction == nullptr || TargetFunction->isDeclaration()) {
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget",
+ CB)
+ << "Cannot promote indirect call: target with md5sum "
+ << ore::NV("target md5sum", Candidate.Value) << " not found";
+ });
+ // FIXME: See if we can use the new declaration importing support to
+ // at least get the declarations imported for this case. Hot indirect
+ // targets should have been imported normally, however.
+ continue;
+ }
+
+ // Check if legal to promote
+ const char *Reason = nullptr;
+ if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
+ << "Cannot promote indirect call to "
+ << ore::NV("TargetFunction", TargetFunction)
+ << " with count of " << ore::NV("TotalCount", TotalCount)
+ << ": " << Reason;
+ });
+ continue;
+ }
+
+ assert(!IsMemProfClone(*TargetFunction));
+
+ // Handle each call clone, applying ICP so that each clone directly
+ // calls the specified callee clone, guarded by the appropriate ICP
+ // check.
+ auto CalleeOrigName = TargetFunction->getName();
+ for (unsigned J = 0; J < NumClones; J++) {
+ CallBase *CBClone;
+ // Copy 0 is the original function.
+ if (!J)
+ CBClone = CB;
+ else
+ CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
+ // We do the promotion using the original name, so that the comparison
+ // is against the name in the vtable. Then just below, change the new
+ // direct call to call the cloned function.
+ auto &DirectCall = pgo::promoteIndirectCall(
+ *CBClone, TargetFunction, Candidate.Count, TotalCount, SamplePGO,
+ &ORE);
+ auto *TargetToUse = TargetFunction;
+ // Call original if this version calls the original version of its
+ // callee.
+ if (StackNode.Clones[J])
+ TargetToUse = cast<Function>(
+ M.getOrInsertFunction(
+ getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
+ TargetFunction->getFunctionType())
+ .getCallee());
+ DirectCall.setCalledFunction(TargetToUse);
+ ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
+ << ore::NV("Call", CBClone) << " in clone "
+ << ore::NV("Caller", CBClone->getFunction())
+ << " promoted and assigned to call function clone "
+ << ore::NV("Callee", TargetToUse));
+ }
+
+ // Update TotalCount (all clones should get same count above)
+ TotalCount -= Candidate.Count;
+ NumPromoted++;
+ }
+ // Adjust the MD.prof metadata for all clones, now that we have the new
+ // TotalCount and the number promoted.
+ for (unsigned J = 0; J < NumClones; J++) {
+ CallBase *CBClone;
+ // Copy 0 is the original function.
+ if (!J)
+ CBClone = CB;
+ else
+ CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
+ // First delete the old one.
+ CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
+ // If all promoted, we don't need the MD.prof metadata.
+ // Otherwise we need update with the un-promoted records back.
+ if (TotalCount != 0)
+ annotateValueSite(
+ M, *CBClone,
+ ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
+ TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
+ }
+ }
}
return Changed;
@@ -4179,8 +4640,8 @@ bool MemProfContextDisambiguation::processModule(
}
MemProfContextDisambiguation::MemProfContextDisambiguation(
- const ModuleSummaryIndex *Summary)
- : ImportSummary(Summary) {
+ const ModuleSummaryIndex *Summary, bool SamplePGO)
+ : ImportSummary(Summary), SamplePGO(SamplePGO) {
if (ImportSummary) {
// The MemProfImportSummary should only be used for testing ThinLTO
// distributed backend handling via opt, in which case we don't have a
diff --git a/llvm/test/ThinLTO/X86/memprof-icp.ll b/llvm/test/ThinLTO/X86/memprof-icp.ll
new file mode 100644
index 00000000000000..4b8dc454026d2c
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-icp.ll
@@ -0,0 +1,309 @@
+;; Test that cloning of an indirect call works. We should perform ICP and update
+;; promoted call to the correct clone.
+
+;; This was created from the following source code, then the IR was reduced
+;; using llvm-reduce with the expected FileCheck input.
+
+;; -- virtfunc.h: --
+;; #include <unistd.h>
+;;
+;; void external(int *x);
+;;
+;; class B0 {
+;; public:
+;; virtual int bar(unsigned s);
+;; };
+;;
+;; class B : public B0 {
+;; public:
+;; int bar(unsigned s) override;
+;; };
+;;
+;; int foo(B0 &b, unsigned s);
+
+;; -- virtfunc.cc: --
+;; #include "virtfunc.h"
+;;
+;; int foo(B0 &b, unsigned s) {
+;; return b.bar(s);
+;; }
+
+;; -- virtfunc_main.cc: --
+;; #include "virtfunc.h"
+;; #include <stdio.h>
+;;
+;; int main() {
+;; B b;
+;; int x = foo(b, 1);
+;; printf("%d\n", x);
+;; int y = foo(b, 10);
+;; printf("%d\n", y);
+;; B0 b0;
+;; x = foo(b0, 1);
+;; printf("%d\n", x);
+;; y = foo(b0, 10);
+;; printf("%d\n", y);
+;; return 0;
+;; }
+;;
+;; int B0::bar(unsigned s) {
+;; int *x = new int;
+;; sleep(s);
+;; external(x);
+;; delete x;
+;; return 1;
+;; }
+;;
+;; int B::bar(unsigned s) {
+;; int *x = new int;
+;; sleep(s);
+;; external(x);
+;; delete x;
+;; return 2;
+;; }
+
+;; -stats requires asserts
+; REQUIRES: asserts
+
+; RUN: split-file %s %t
+
+; RUN: opt -thinlto-bc %t/main.ll >%t/main.o
+; RUN: opt -thinlto-bc %t/foo.ll >%t/foo.o
+
+;; First perform in-process ThinLTO
+; RUN: llvm-lto2 run %t/main.o %t/foo.o -enable-memprof-context-disambiguation \
+; RUN: -supports-hot-cold-new \
+; RUN: -r=%t/foo.o,_Z3fooR2B0j,plx \
+; RUN: -r=%t/main.o,_Z3fooR2B0j, \
+; RUN: -r=%t/main.o,_Znwm, \
+; RUN: -r=%t/main.o,_ZdlPvm, \
+; RUN: -r=%t/main.o,_Z8externalPi, \
+; RUN: -r=%t/main.o,main,plx \
+; RUN: -r=%t/main.o,_ZN2B03barEj,plx \
+; RUN: -r=%t/main.o,_ZN1B3barEj,plx \
+; RUN: -r=%t/main.o,_ZTV1B,plx \
+; RUN: -r=%t/main.o,_ZTVN10__cxxabiv120__si_class_type_infoE,plx \
+; RUN: -r=%t/main.o,_ZTS1B,plx \
+; RUN: -r=%t/main.o,_ZTVN10__cxxabiv117__class_type_infoE,plx \
+; RUN: -r=%t/main.o,_ZTS2B0,plx \
+; RUN: -r=%t/main.o,_ZTI2B0,plx \
+; RUN: -r=%t/main.o,_ZTI1B,plx \
+; RUN: -r=%t/main.o,_ZTV2B0,plx \
+; RUN: -thinlto-threads=1 \
+; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats \
+; RUN: -pass-remarks=. -save-temps \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=STATS \
+; RUN: --check-prefix=STATS-BE --check-prefix=REMARKS-MAIN \
+; RUN: --check-prefix=REMARKS-FOO
+
+; RUN: llvm-dis %t.out.2.4.opt.bc -o - | FileCheck %s --check-prefix=IR
+
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t/main.o %t/foo.o -enable-memprof-context-disambiguation \
+; RUN: -supports-hot-cold-new \
+; RUN: -thinlto-distributed-indexes \
+; RUN: -r=%t/foo.o,_Z3fooR2B0j,plx \
+; RUN: -r=%t/main.o,_Z3fooR2B0j, \
+; RUN: -r=%t/main.o,_Znwm, \
+; RUN: -r=%t/main.o,_ZdlPvm, \
+; RUN: -r=%t/main.o,_Z8externalPi, \
+; RUN: -r=%t/main.o,main,plx \
+; RUN: -r=%t/main.o,_ZN2B03barEj,plx \
+; RUN: -r=%t/main.o,_ZN1B3barEj,plx \
+; RUN: -r=%t/main.o,_ZTV1B,plx \
+; RUN: -r=%t/main.o,_ZTVN10__cxxabiv120__si_class_type_infoE,plx \
+; RUN: -r=%t/main.o,_ZTS1B,plx \
+; RUN: -r=%t/main.o,_ZTVN10__cxxabiv117__class_type_infoE,plx \
+; RUN: -r=%t/main.o,_ZTS2B0,plx \
+; RUN: -r=%t/main.o,_ZTI2B0,plx \
+; RUN: -r=%t/main.o,_ZTI1B,plx \
+; RUN: -r=%t/main.o,_ZTV2B0,plx \
+; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=STATS
+
+;; Run ThinLTO backend
+; RUN: opt -import-all-index -passes=function-import,memprof-context-disambiguation,inline \
+; RUN: -summary-file=%t/foo.o.thinlto.bc -memprof-import-summary=%t/foo.o.thinlto.bc \
+; RUN: -enable-import-metadata -stats -pass-remarks=. \
+; RUN: %t/foo.o -S 2>&1 | FileCheck %s --check-prefix=IR \
+; RUN: --check-prefix=STATS-BE-DISTRIB --check-prefix=REMARKS-FOO
+
+; REMARKS-MAIN: call in clone main assigned to call function clone _Z3fooR2B0j.memprof.1
+; REMARKS-MAIN: call in clone main assigned to call function clone _Z3fooR2B0j.memprof.1
+; REMARKS-MAIN: created clone _ZN2B03barEj.memprof.1
+; REMARKS-MAIN: call in clone _ZN2B03barEj marked with memprof allocation attribute notcold
+; REMARKS-MAIN: call in clone _ZN2B03barEj.memprof.1 marked with memprof allocation attribute cold
+; REMARKS-MAIN: created clone _ZN1B3barEj.memprof.1
+; REMARKS-MAIN: call in clone _ZN1B3barEj marked with memprof allocation attribute notcold
+; REMARKS-MAIN: call in clone _ZN1B3barEj.memprof.1 marked with memprof allocation attribute cold
+; REMARKS-FOO: created clone _Z3fooR2B0j.memprof.1
+;; In each version of foo we should have promoted the indirect call to two conditional
+;; direct calls, one to B::bar and one to B0::bar. The cloned version of foo should call
+;; the cloned versions of bar for both promotions.
+; REMARKS-FOO: Promote indirect call to _ZN1B3barEj with count 2 out of 4
+; REMARKS-FOO: call in clone _Z3fooR2B0j promoted and assigned to call function clone _ZN1B3barEj
+; REMARKS-FOO: Promote indirect call to _ZN1B3barEj with count 2 out of 4
+; REMARKS-FOO: call in clone _Z3fooR2B0j.memprof.1 promoted and assigned to call function clone _ZN1B3barEj.memprof.1
+; REMARKS-FOO: Promote indirect call to _ZN2B03barEj with count 2 out of 2
+; REMARKS-FOO: call in clone _Z3fooR2B0j promoted and assigned to call function clone _ZN2B03barEj
+; REMARKS-FOO: Promote indirect call to _ZN2B03barEj with count 2 out of 2
+; REMARKS-FOO: call in clone _Z3fooR2B0j.memprof.1 promoted and assigned to call function clone _ZN2B03barEj.memprof.1
+; REMARKS-FOO: created clone _ZN2B03barEj.memprof.1
+; REMARKS-FOO: call in clone _ZN2B03barEj marked with memprof allocation attribute notcold
+; REMARKS-FOO: call in clone _ZN2B03barEj.memprof.1 marked with memprof allocation attribute cold
+; REMARKS-FOO: created clone _ZN1B3barEj.memprof.1
+; REMARKS-FOO: call in clone _ZN1B3barEj marked with memprof allocation attribute notcold
+; REMARKS-FOO: call in clone _ZN1B3barEj.memprof.1 marked with memprof allocation attribute cold
+
+; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during whole program analysis
+; STATS-BE: 4 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
+; STATS: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during whole program analysis
+; STATS-BE: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
+; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
+; STATS-BE: 5 memprof-context-disambiguation - Number of function clones created during ThinLTO backend
+
+; IR: define {{.*}} @_Z3fooR2B0j(
+; IR: %1 = icmp eq ptr %0, @_ZN1B3barEj
+; IR: br i1 %1, label %if.true.direct_targ, label %if.false.orig_indirect
+; IR: if.true.direct_targ:
+; IR: call {{.*}} @_Znwm(i64 noundef 4) #[[NOTCOLD:[0-9]+]]
+; IR: if.false.orig_indirect:
+; IR: %2 = icmp eq ptr %0, @_ZN2B03barEj
+; IR: br i1 %2, label %if.true.direct_targ1, label %if.false.orig_indirect2
+; IR: if.true.direct_targ1:
+; IR: call {{.*}} @_Znwm(i64 noundef 4) #[[NOTCOLD]]
+; IR: if.false.orig_indirect2:
+; IR: call {{.*}} %0
+
+; IR: define {{.*}} @_Z3fooR2B0j.memprof.1(
+;; We should still compare against the original versions of bar since that is
+;; what is in the vtable. However, we should have called the cloned versions
+;; that perform cold allocations, which were subsequently inlined.
+; IR: %1 = icmp eq ptr %0, @_ZN1B3barEj
+; IR: br i1 %1, label %if.true.direct_targ, label %if.false.orig_indirect
+; IR: if.true.direct_targ:
+; IR: call {{.*}} @_Znwm(i64 noundef 4) #[[COLD:[0-9]+]]
+; IR: if.false.orig_indirect:
+; IR: %2 = icmp eq ptr %0, @_ZN2B03barEj
+; IR: br i1 %2, label %if.true.direct_targ1, label %if.false.orig_indirect2
+; IR: if.true.direct_targ1:
+; IR: call {{.*}} @_Znwm(i64 noundef 4) #[[COLD]]
+; IR: if.false.orig_indirect2:
+; IR: call {{.*}} %0
+
+; IR: attributes #[[NOTCOLD]] = {{.*}} "memprof"="notcold"
+; IR: attributes #[[COLD]] = {{.*}} "memprof"="cold"
+
+; STATS-BE-DISTRIB: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
+; STATS-BE-DISTRIB: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
+; STATS-BE-DISTRIB: 3 memprof-context-disambiguation - Number of function clones created during ThinLTO backend
+
+;--- foo.ll
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @_Z3fooR2B0j(ptr %b) {
+entry:
+ %0 = load ptr, ptr %b, align 8
+ %call = tail call i32 %0(ptr null, i32 0), !prof !0, !callsite !1
+ ret i32 0
+}
+
+!0 = !{!"VP", i32 0, i64 4, i64 4445083295448962937, i64 2, i64 -2718743882639408571, i64 2}
+!1 = !{i64 -2101080423462424381}
+
+;--- main.ll
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at _ZTV1B = external constant { [3 x ptr] }
+ at _ZTVN10__cxxabiv120__si_class_type_infoE = external global [0 x ptr]
+ at _ZTS1B = external constant [3 x i8]
+ at _ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr]
+ at _ZTS2B0 = external constant [4 x i8]
+ at _ZTI2B0 = external constant { ptr, ptr }
+ at _ZTI1B = external constant { ptr, ptr, ptr }
+ at _ZTV2B0 = external constant { [3 x ptr] }
+
+define i32 @main() !prof !29 {
+entry:
+ %call2 = call i32 @_Z3fooR2B0j(ptr null, i32 0), !callsite !30
+ %call4 = call i32 @_Z3fooR2B0j(ptr null, i32 0), !callsite !31
+ %call6 = call i32 @_Z3fooR2B0j(ptr null, i32 0), !callsite !32
+ ret i32 0
+}
+
+declare i32 @_Z3fooR2B0j(ptr, i32)
+
+define i32 @_ZN2B03barEj(ptr %this, i32 %s) {
+entry:
+ %call = tail call ptr @_Znwm(i64 noundef 4) #0, !memprof !33, !callsite !38
+ store volatile i32 0, ptr %call, align 4
+ ret i32 0
+}
+
+declare ptr @_Znwm(i64)
+
+declare void @_Z8externalPi()
+
+declare void @_ZdlPvm()
+
+define i32 @_ZN1B3barEj(ptr %this, i32 %s) {
+entry:
+ %call = tail call ptr @_Znwm(i64 noundef 4) #0, !memprof !39, !callsite !44
+ store volatile i32 0, ptr %call, align 4
+ ret i32 0
+}
+
+; uselistorder directives
+uselistorder ptr @_Z3fooR2B0j, { 2, 1, 0 }
+
+attributes #0 = { builtin allocsize(0) }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9, !10, !11}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 13}
+!4 = !{!"MaxCount", i64 4}
+!5 = !{!"MaxInternalCount", i64 0}
+!6 = !{!"MaxFunctionCount", i64 4}
+!7 = !{!"NumCounts", i64 5}
+!8 = !{!"NumFunctions", i64 5}
+!9 = !{!"IsPartialProfile", i64 0}
+!10 = !{!"PartialProfileRatio", double 0.000000e+00}
+!11 = !{!"DetailedSummary", !12}
+!12 = !{!13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28}
+!13 = !{i32 10000, i64 0, i32 0}
+!14 = !{i32 100000, i64 4, i32 2}
+!15 = !{i32 200000, i64 4, i32 2}
+!16 = !{i32 300000, i64 4, i32 2}
+!17 = !{i32 400000, i64 4, i32 2}
+!18 = !{i32 500000, i64 4, i32 2}
+!19 = !{i32 600000, i64 4, i32 2}
+!20 = !{i32 700000, i64 2, i32 4}
+!21 = !{i32 800000, i64 2, i32 4}
+!22 = !{i32 900000, i64 2, i32 4}
+!23 = !{i32 950000, i64 2, i32 4}
+!24 = !{i32 990000, i64 2, i32 4}
+!25 = !{i32 999000, i64 2, i32 4}
+!26 = !{i32 999900, i64 2, i32 4}
+!27 = !{i32 999990, i64 2, i32 4}
+!28 = !{i32 999999, i64 2, i32 4}
+!29 = !{!"function_entry_count", i64 1}
+!30 = !{i64 -6490791336773930154}
+!31 = !{i64 5188446645037944434}
+!32 = !{i64 5583420417449503557}
+!33 = !{!34, !36}
+!34 = !{!35, !"notcold"}
+!35 = !{i64 -852997907418798798, i64 -2101080423462424381, i64 5188446645037944434}
+!36 = !{!37, !"cold"}
+!37 = !{i64 -852997907418798798, i64 -2101080423462424381, i64 5583420417449503557}
+!38 = !{i64 -852997907418798798}
+!39 = !{!40, !42}
+!40 = !{!41, !"notcold"}
+!41 = !{i64 4457553070050523782, i64 -2101080423462424381, i64 132626519179914298}
+!42 = !{!43, !"cold"}
+!43 = !{i64 4457553070050523782, i64 -2101080423462424381, i64 -6490791336773930154}
+!44 = !{i64 4457553070050523782}
>From 6c6121a7c7404722d04ef99bbbe14a044cedd886 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson at google.com>
Date: Mon, 30 Sep 2024 20:01:45 -0700
Subject: [PATCH 2/6] Test the -enable-memprof-indirect-call-support flag
---
llvm/test/ThinLTO/X86/memprof-icp.ll | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/llvm/test/ThinLTO/X86/memprof-icp.ll b/llvm/test/ThinLTO/X86/memprof-icp.ll
index 4b8dc454026d2c..e0f1e9cc453b0f 100644
--- a/llvm/test/ThinLTO/X86/memprof-icp.ll
+++ b/llvm/test/ThinLTO/X86/memprof-icp.ll
@@ -70,6 +70,17 @@
; RUN: opt -thinlto-bc %t/main.ll >%t/main.o
; RUN: opt -thinlto-bc %t/foo.ll >%t/foo.o
+;; Check that we get the synthesized callsite records. There should be 2, one
+;; for each profiled target in the VP metadata. They will have the same stackIds
+;; since the debug information for the callsite is the same.
+; RUN: llvm-dis %t/foo.o -o - | FileCheck %s --check-prefix=CALLSITES
+; CALLSITES: gv: (name: "_Z3fooR2B0j", {{.*}} callsites: ((callee: ^{{[0-9]+}}, clones: (0), stackIds: (16345663650247127235)), (callee: ^{{[0-9]+}}, clones: (0), stackIds: (16345663650247127235)))
+
+;; Make sure that we don't get the synthesized callsite records if the
+;; -enable-memprof-indirect-call-support flag is false.
+; RUN: opt -thinlto-bc %t/foo.ll -enable-memprof-indirect-call-support=false -o - \
+; RUN: | llvm-dis -o - | FileCheck %s --implicit-check-not callsites
+
;; First perform in-process ThinLTO
; RUN: llvm-lto2 run %t/main.o %t/foo.o -enable-memprof-context-disambiguation \
; RUN: -supports-hot-cold-new \
>From 16e66f6728e2560239b8f984d43aed003e34dc36 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson at google.com>
Date: Wed, 2 Oct 2024 07:07:21 -0700
Subject: [PATCH 3/6] Add missing check for null function (indirect call) when
applying cloning decisions.
---
llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 84e177355003d4..12741ff4797240 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -4440,7 +4440,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
CloneCallsite(StackNode, CB, CalledFunction);
}
- } else if (CB->isTailCall()) {
+ } else if (CB->isTailCall() && CalledFunction) {
// Locate the synthesized callsite info for the callee VI, if any was
// created, and use that for cloning.
ValueInfo CalleeVI =
>From f86d45b46434a02e88d705b33e844b059a09a833 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson at google.com>
Date: Wed, 9 Oct 2024 20:35:55 -0700
Subject: [PATCH 4/6] Address most of the review comments, remaining are TODO
---
.../IPO/MemProfContextDisambiguation.h | 4 +-
.../IPO/MemProfContextDisambiguation.cpp | 66 +++++++++----------
llvm/test/ThinLTO/X86/memprof-icp.ll | 8 ++-
3 files changed, 39 insertions(+), 39 deletions(-)
diff --git a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
index fa067ad62ed1b3..00598b597ba97a 100644
--- a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
+++ b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
@@ -45,11 +45,11 @@ class MemProfContextDisambiguation
// Whether we are building with SamplePGO. This is needed for correctly
// updating profile metadata on speculatively promoted calls.
- bool SamplePGO;
+ bool isSamplePGO;
public:
MemProfContextDisambiguation(const ModuleSummaryIndex *Summary = nullptr,
- bool SamplePGO = false);
+ bool isSamplePGO = false);
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 12741ff4797240..67af0979a65025 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -4218,12 +4218,13 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
// Data structures for saving indirect call profile info for use in ICP with
// cloning.
struct ICallAnalysisData {
+ CallBase *CB;
std::vector<InstrProfValueData> CandidateProfileData;
uint32_t NumCandidates;
uint64_t TotalCount;
- unsigned CallsiteInfoStartIndex;
+ size_t CallsiteInfoStartIndex;
};
- std::map<CallBase *, ICallAnalysisData> ICallAnalysisMap;
+ SmallVector<ICallAnalysisData> ICallAnalysisInfo;
// Assume for now that the instructions are in the exact same order
// as when the summary was created, but confirm this is correct by
@@ -4386,10 +4387,11 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
ICallAnalysis.getPromotionCandidatesForInstruction(
CB, TotalCount, NumCandidates);
if (!CandidateProfileData.empty()) {
- unsigned CallsiteInfoStartStartIndex =
- static_cast<unsigned int>(SI - FS->callsites().begin());
+ size_t CallsiteInfoStartIndex =
+ std::distance(FS->callsites().begin(), SI);
// Iterate past all of the associated callsites nodes and check
// them.
+ bool ICPNeeded = false;
for (const auto &Candidate : CandidateProfileData) {
#ifndef NDEBUG
auto CalleeValueInfo =
@@ -4403,21 +4405,21 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
// See if any of the clones of the indirect callsite for this
// profiled target should call a cloned version of the profiled
// target. We only need to do the ICP here if so.
- for (auto CloneNo : StackNode.Clones) {
- if (!CloneNo)
- continue;
- // Save information for ICP, which is performed later to avoid
- // messing up the current function traversal.
- ICallAnalysisMap[CB] = {CandidateProfileData.vec(),
- NumCandidates, TotalCount,
- CallsiteInfoStartStartIndex};
- break;
- }
+ ICPNeeded |=
+ llvm::any_of(StackNode.Clones,
+ [](unsigned CloneNo) { return CloneNo != 0; });
// Perform cloning if not yet done. This is done here in case
// we don't need to do ICP, but might need to clone this
// function as it is the target of other cloned calls.
CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
}
+ if (ICPNeeded) {
+ // Save information for ICP, which is performed later to avoid
+ // messing up the current function traversal.
+ ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(),
+ NumCandidates, TotalCount,
+ CallsiteInfoStartIndex});
+ }
}
}
@@ -4463,9 +4465,8 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
// devirtualization) for each clone of the callsite, and update its callee
// to the appropriate clone. Note that the ICP compares against the original
// version of the target, which is what is in the vtable.
- for (auto &ICallInfo : ICallAnalysisMap) {
- auto *CB = ICallInfo.first;
- auto &Info = ICallInfo.second;
+ for (auto &Info : ICallAnalysisInfo) {
+ auto *CB = Info.CB;
auto CallsiteIndex = Info.CallsiteInfoStartIndex;
auto TotalCount = Info.TotalCount;
unsigned NumPromoted = 0;
@@ -4484,7 +4485,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
// (similar to what is done in the ICP pass).
Function *TargetFunction = Symtab.getFunction(Candidate.Value);
if (TargetFunction == nullptr || TargetFunction->isDeclaration()) {
- ORE.emit([&]() {
+ ORE.emit([&, CB = CB]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget",
CB)
<< "Cannot promote indirect call: target with md5sum "
@@ -4499,7 +4500,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
// Check if legal to promote
const char *Reason = nullptr;
if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
- ORE.emit([&]() {
+ ORE.emit([&, CB = CB]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
<< "Cannot promote indirect call to "
<< ore::NV("TargetFunction", TargetFunction)
@@ -4514,29 +4515,28 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
// Handle each call clone, applying ICP so that each clone directly
// calls the specified callee clone, guarded by the appropriate ICP
// check.
- auto CalleeOrigName = TargetFunction->getName();
+ CallBase *CBClone = CB;
for (unsigned J = 0; J < NumClones; J++) {
- CallBase *CBClone;
// Copy 0 is the original function.
- if (!J)
- CBClone = CB;
- else
+ if (J > 0)
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
// We do the promotion using the original name, so that the comparison
// is against the name in the vtable. Then just below, change the new
// direct call to call the cloned function.
auto &DirectCall = pgo::promoteIndirectCall(
- *CBClone, TargetFunction, Candidate.Count, TotalCount, SamplePGO,
- &ORE);
+ *CBClone, TargetFunction, Candidate.Count, TotalCount,
+ isSamplePGO, &ORE);
auto *TargetToUse = TargetFunction;
// Call original if this version calls the original version of its
// callee.
- if (StackNode.Clones[J])
+ if (StackNode.Clones[J]) {
TargetToUse = cast<Function>(
M.getOrInsertFunction(
- getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
+ getMemProfFuncName(TargetFunction->getName(),
+ StackNode.Clones[J]),
TargetFunction->getFunctionType())
.getCallee());
+ }
DirectCall.setCalledFunction(TargetToUse);
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
<< ore::NV("Call", CBClone) << " in clone "
@@ -4551,12 +4551,10 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
}
// Adjust the MD.prof metadata for all clones, now that we have the new
// TotalCount and the number promoted.
+ CallBase *CBClone = CB;
for (unsigned J = 0; J < NumClones; J++) {
- CallBase *CBClone;
// Copy 0 is the original function.
- if (!J)
- CBClone = CB;
- else
+ if (J > 0)
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
// First delete the old one.
CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
@@ -4640,8 +4638,8 @@ bool MemProfContextDisambiguation::processModule(
}
MemProfContextDisambiguation::MemProfContextDisambiguation(
- const ModuleSummaryIndex *Summary, bool SamplePGO)
- : ImportSummary(Summary), SamplePGO(SamplePGO) {
+ const ModuleSummaryIndex *Summary, bool isSamplePGO)
+ : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
if (ImportSummary) {
// The MemProfImportSummary should only be used for testing ThinLTO
// distributed backend handling via opt, in which case we don't have a
diff --git a/llvm/test/ThinLTO/X86/memprof-icp.ll b/llvm/test/ThinLTO/X86/memprof-icp.ll
index e0f1e9cc453b0f..5c6d4e383d3217 100644
--- a/llvm/test/ThinLTO/X86/memprof-icp.ll
+++ b/llvm/test/ThinLTO/X86/memprof-icp.ll
@@ -1,12 +1,13 @@
;; Test that cloning of an indirect call works. We should perform ICP and update
;; promoted call to the correct clone.
-;; This was created from the following source code, then the IR was reduced
+;; This was created from the following source code, for which both memprof and
+;; instrumentation PGO was collected and then applied, then the IR was reduced
;; using llvm-reduce with the expected FileCheck input.
+;; TODO: Consider adding a sample PGO based test, however, that uses the same VP
+;; metadata and should behave the same.
;; -- virtfunc.h: --
-;; #include <unistd.h>
-;;
;; void external(int *x);
;;
;; class B0 {
@@ -31,6 +32,7 @@
;; -- virtfunc_main.cc: --
;; #include "virtfunc.h"
;; #include <stdio.h>
+;; #include <unistd.h>
;;
;; int main() {
;; B b;
>From f5b8df36a02f627491de764d3e8ee10f04719eb0 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson at google.com>
Date: Wed, 9 Oct 2024 20:42:02 -0700
Subject: [PATCH 5/6] Remove a couple of changes only needed with the
structured binding that was no longer applicable with the change from a map
to a vector.
---
llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 67af0979a65025..b6f004d6b87ada 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -4485,7 +4485,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
// (similar to what is done in the ICP pass).
Function *TargetFunction = Symtab.getFunction(Candidate.Value);
if (TargetFunction == nullptr || TargetFunction->isDeclaration()) {
- ORE.emit([&, CB = CB]() {
+ ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget",
CB)
<< "Cannot promote indirect call: target with md5sum "
@@ -4500,7 +4500,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
// Check if legal to promote
const char *Reason = nullptr;
if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
- ORE.emit([&, CB = CB]() {
+ ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
<< "Cannot promote indirect call to "
<< ore::NV("TargetFunction", TargetFunction)
>From fe04fd206c0d6d6ed61b0a3a49357abfc5eea71b Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson at google.com>
Date: Thu, 10 Oct 2024 20:15:15 -0700
Subject: [PATCH 6/6] Address remaining comments
---
.../IPO/MemProfContextDisambiguation.h | 38 ++
.../IPO/MemProfContextDisambiguation.cpp | 353 ++++++++++--------
2 files changed, 225 insertions(+), 166 deletions(-)
diff --git a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
index 00598b597ba97a..7ac7e14ab4b81d 100644
--- a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
+++ b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
@@ -15,9 +15,11 @@
#ifndef LLVM_TRANSFORMS_IPO_MEMPROF_CONTEXT_DISAMBIGUATION_H
#define LLVM_TRANSFORMS_IPO_MEMPROF_CONTEXT_DISAMBIGUATION_H
+#include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
#include <functional>
namespace llvm {
@@ -36,6 +38,37 @@ class MemProfContextDisambiguation
/// the IR.
bool applyImport(Module &M);
+ // Builds the symtab and analysis used for ICP during ThinLTO backends.
+ bool initializeIndirectCallPromotionInfo(Module &M);
+
+ // Data structure for saving indirect call profile info for use in ICP with
+ // cloning.
+ struct ICallAnalysisData {
+ CallBase *CB;
+ std::vector<InstrProfValueData> CandidateProfileData;
+ uint32_t NumCandidates;
+ uint64_t TotalCount;
+ size_t CallsiteInfoStartIndex;
+ };
+
+ // Record information needed for ICP of an indirect call, depending on its
+ // profile information and the clone information recorded in the corresponding
+ // CallsiteInfo records. The SI iterator point to the current iteration point
+ // through AllCallsites in this function, and will be updated in this method
+ // as we iterate through profiled targets. The number of clones recorded for
+ // this indirect call is returned. The necessary information is recorded in
+ // the ICallAnalysisInfo list for later ICP.
+ unsigned recordICPInfo(CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
+ ArrayRef<CallsiteInfo>::iterator &SI,
+ SmallVector<ICallAnalysisData> &ICallAnalysisInfo);
+
+ // Actually performs any needed ICP in the function, using the information
+ // recorded in the ICallAnalysisInfo list.
+ void performICP(Module &M, ArrayRef<CallsiteInfo> AllCallsites,
+ SmallVectorImpl<std::unique_ptr<ValueToValueMapTy>> &VMaps,
+ SmallVector<ICallAnalysisData> &ICallAnalysisInfo,
+ OptimizationRemarkEmitter &ORE);
+
/// Import summary containing cloning decisions for the ThinLTO backend.
const ModuleSummaryIndex *ImportSummary;
@@ -47,6 +80,11 @@ class MemProfContextDisambiguation
// updating profile metadata on speculatively promoted calls.
bool isSamplePGO;
+ // Used when performing indirect call analysis and promotion when cloning in
+ // the ThinLTO backend during applyImport.
+ std::unique_ptr<InstrProfSymtab> Symtab;
+ std::unique_ptr<ICallPromotionAnalysis> ICallAnalysis;
+
public:
MemProfContextDisambiguation(const ModuleSummaryIndex *Summary = nullptr,
bool isSamplePGO = false);
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index b6f004d6b87ada..be81b1ef6b1289 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -29,7 +29,6 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
@@ -44,9 +43,9 @@
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/Instrumentation.h"
#include "llvm/Transforms/Utils/CallPromotionUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Instrumentation.h"
#include <deque>
#include <sstream>
#include <unordered_map>
@@ -1826,6 +1825,10 @@ static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
}
+static bool isMemProfClone(const Function &F) {
+ return F.getName().contains(MemProfCloneSuffix);
+}
+
std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
const Instruction *Call,
unsigned CloneNo) const {
@@ -4065,14 +4068,21 @@ static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
return TheFnVI;
}
+bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
+ Module &M) {
+ ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
+ Symtab = std::make_unique<InstrProfSymtab>();
+ if (Error E = Symtab->create(M, /*InLTO=*/true)) {
+ std::string SymtabFailure = toString(std::move(E));
+ M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
+ return false;
+ }
+ return true;
+}
+
bool MemProfContextDisambiguation::applyImport(Module &M) {
assert(ImportSummary);
bool Changed = false;
- ICallPromotionAnalysis ICallAnalysis;
-
- auto IsMemProfClone = [](const Function &F) {
- return F.getName().contains(MemProfCloneSuffix);
- };
// We also need to clone any aliases that reference cloned functions, because
// the modified callsites may invoke via the alias. Keep track of the aliases
@@ -4085,15 +4095,11 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
FuncToAliasMap[F].insert(&A);
}
- InstrProfSymtab Symtab;
- if (Error E = Symtab.create(M, /*InLTO=*/true)) {
- std::string SymtabFailure = toString(std::move(E));
- M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
+ if (!initializeIndirectCallPromotionInfo(M))
return false;
- }
for (auto &F : M) {
- if (F.isDeclaration() || IsMemProfClone(F))
+ if (F.isDeclaration() || isMemProfClone(F))
continue;
OptimizationRemarkEmitter ORE(&F);
@@ -4128,7 +4134,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
// Perform cloning if not yet done.
CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
- assert(!IsMemProfClone(*CalledFunction));
+ assert(!isMemProfClone(*CalledFunction));
// Update the calls per the summary info.
// Save orig name since it gets updated in the first iteration
@@ -4215,15 +4221,7 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
}
- // Data structures for saving indirect call profile info for use in ICP with
- // cloning.
- struct ICallAnalysisData {
- CallBase *CB;
- std::vector<InstrProfValueData> CandidateProfileData;
- uint32_t NumCandidates;
- uint64_t TotalCount;
- size_t CallsiteInfoStartIndex;
- };
+ // Keeps track of needed ICP for the function.
SmallVector<ICallAnalysisData> ICallAnalysisInfo;
// Assume for now that the instructions are in the exact same order
@@ -4369,10 +4367,6 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
}
} else if (!CallsiteContext.empty()) {
if (!CalledFunction) {
- // This is an indirect call, see if we have profile information and
- // whether any clones were recorded for the profiled targets (that
- // we synthesized CallsiteInfo summary records for when building the
- // index).
#ifndef NDEBUG
// We should have skipped inline assembly calls.
auto *CI = dyn_cast<CallInst>(CB);
@@ -4381,46 +4375,18 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
// We should have skipped direct calls via a Constant.
assert(CalledValue && !isa<Constant>(CalledValue));
- uint32_t NumCandidates;
- uint64_t TotalCount;
- auto CandidateProfileData =
- ICallAnalysis.getPromotionCandidatesForInstruction(
- CB, TotalCount, NumCandidates);
- if (!CandidateProfileData.empty()) {
- size_t CallsiteInfoStartIndex =
- std::distance(FS->callsites().begin(), SI);
- // Iterate past all of the associated callsites nodes and check
- // them.
- bool ICPNeeded = false;
- for (const auto &Candidate : CandidateProfileData) {
-#ifndef NDEBUG
- auto CalleeValueInfo =
-#endif
- ImportSummary->getValueInfo(Candidate.Value);
- // We might not have a ValueInfo if this is a distributed
- // ThinLTO backend and decided not to import that function.
- assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
- assert(SI != FS->callsites().end());
- auto &StackNode = *(SI++);
- // See if any of the clones of the indirect callsite for this
- // profiled target should call a cloned version of the profiled
- // target. We only need to do the ICP here if so.
- ICPNeeded |=
- llvm::any_of(StackNode.Clones,
- [](unsigned CloneNo) { return CloneNo != 0; });
- // Perform cloning if not yet done. This is done here in case
- // we don't need to do ICP, but might need to clone this
- // function as it is the target of other cloned calls.
- CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
- }
- if (ICPNeeded) {
- // Save information for ICP, which is performed later to avoid
- // messing up the current function traversal.
- ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(),
- NumCandidates, TotalCount,
- CallsiteInfoStartIndex});
- }
- }
+ // This is an indirect call, see if we have profile information and
+ // whether any clones were recorded for the profiled targets (that
+ // we synthesized CallsiteInfo summary records for when building the
+ // index).
+ auto NumClones =
+ recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
+
+ // Perform cloning if not yet done. This is done here in case
+ // we don't need to do ICP, but might need to clone this
+ // function as it is the target of other cloned calls.
+ if (NumClones)
+ CloneFuncIfNeeded(NumClones);
}
else {
@@ -4459,117 +4425,172 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
}
}
- // Now do any promotion required for cloning. Specifically, for each
- // recorded ICP candidate (which was only recorded because one clone of that
- // candidate should call a cloned target), we perform ICP (speculative
- // devirtualization) for each clone of the callsite, and update its callee
- // to the appropriate clone. Note that the ICP compares against the original
- // version of the target, which is what is in the vtable.
- for (auto &Info : ICallAnalysisInfo) {
- auto *CB = Info.CB;
- auto CallsiteIndex = Info.CallsiteInfoStartIndex;
- auto TotalCount = Info.TotalCount;
- unsigned NumPromoted = 0;
- unsigned NumClones = 0;
-
- for (auto &Candidate : Info.CandidateProfileData) {
- auto &StackNode = FS->callsites()[CallsiteIndex++];
-
- // All calls in the same function must have the same number of clones.
- assert(!NumClones || NumClones == StackNode.Clones.size());
- NumClones = StackNode.Clones.size();
-
- // See if the target is in the module. If it wasn't imported, it is
- // possible that this profile could have been collected on a different
- // target (or version of the code), and we need to be conservative
- // (similar to what is done in the ICP pass).
- Function *TargetFunction = Symtab.getFunction(Candidate.Value);
- if (TargetFunction == nullptr || TargetFunction->isDeclaration()) {
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget",
- CB)
- << "Cannot promote indirect call: target with md5sum "
- << ore::NV("target md5sum", Candidate.Value) << " not found";
- });
- // FIXME: See if we can use the new declaration importing support to
- // at least get the declarations imported for this case. Hot indirect
- // targets should have been imported normally, however.
- continue;
- }
+ // Now do any promotion required for cloning.
+ performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
+ }
- // Check if legal to promote
- const char *Reason = nullptr;
- if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
- << "Cannot promote indirect call to "
- << ore::NV("TargetFunction", TargetFunction)
- << " with count of " << ore::NV("TotalCount", TotalCount)
- << ": " << Reason;
- });
- continue;
- }
+ return Changed;
+}
- assert(!IsMemProfClone(*TargetFunction));
-
- // Handle each call clone, applying ICP so that each clone directly
- // calls the specified callee clone, guarded by the appropriate ICP
- // check.
- CallBase *CBClone = CB;
- for (unsigned J = 0; J < NumClones; J++) {
- // Copy 0 is the original function.
- if (J > 0)
- CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
- // We do the promotion using the original name, so that the comparison
- // is against the name in the vtable. Then just below, change the new
- // direct call to call the cloned function.
- auto &DirectCall = pgo::promoteIndirectCall(
- *CBClone, TargetFunction, Candidate.Count, TotalCount,
- isSamplePGO, &ORE);
- auto *TargetToUse = TargetFunction;
- // Call original if this version calls the original version of its
- // callee.
- if (StackNode.Clones[J]) {
- TargetToUse = cast<Function>(
- M.getOrInsertFunction(
- getMemProfFuncName(TargetFunction->getName(),
- StackNode.Clones[J]),
- TargetFunction->getFunctionType())
- .getCallee());
- }
- DirectCall.setCalledFunction(TargetToUse);
- ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
- << ore::NV("Call", CBClone) << " in clone "
- << ore::NV("Caller", CBClone->getFunction())
- << " promoted and assigned to call function clone "
- << ore::NV("Callee", TargetToUse));
- }
+unsigned MemProfContextDisambiguation::recordICPInfo(
+ CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
+ ArrayRef<CallsiteInfo>::iterator &SI,
+ SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
+ // First see if we have profile information for this indirect call.
+ uint32_t NumCandidates;
+ uint64_t TotalCount;
+ auto CandidateProfileData =
+ ICallAnalysis->getPromotionCandidatesForInstruction(CB, TotalCount,
+ NumCandidates);
+ if (CandidateProfileData.empty())
+ return 0;
+
+ // Iterate through all of the candidate profiled targets along with the
+ // CallsiteInfo summary records synthesized for them when building the index,
+ // and see if any are cloned and/or refer to clones.
+ bool ICPNeeded = false;
+ unsigned NumClones = 0;
+ size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
+ for (const auto &Candidate : CandidateProfileData) {
+#ifndef NDEBUG
+ auto CalleeValueInfo =
+#endif
+ ImportSummary->getValueInfo(Candidate.Value);
+ // We might not have a ValueInfo if this is a distributed
+ // ThinLTO backend and decided not to import that function.
+ assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
+ assert(SI != AllCallsites.end());
+ auto &StackNode = *(SI++);
+ // See if any of the clones of the indirect callsite for this
+ // profiled target should call a cloned version of the profiled
+ // target. We only need to do the ICP here if so.
+ ICPNeeded |= llvm::any_of(StackNode.Clones,
+ [](unsigned CloneNo) { return CloneNo != 0; });
+ // Every callsite in the same function should have been cloned the same
+ // number of times.
+ assert(!NumClones || NumClones == StackNode.Clones.size());
+ NumClones = StackNode.Clones.size();
+ }
+ if (!ICPNeeded)
+ return NumClones;
+ // Save information for ICP, which is performed later to avoid messing up the
+ // current function traversal.
+ ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
+ TotalCount, CallsiteInfoStartIndex});
+ return NumClones;
+}
+
+void MemProfContextDisambiguation::performICP(
+ Module &M, ArrayRef<CallsiteInfo> AllCallsites,
+ SmallVectorImpl<std::unique_ptr<ValueToValueMapTy>> &VMaps,
+ SmallVector<ICallAnalysisData> &ICallAnalysisInfo,
+ OptimizationRemarkEmitter &ORE) {
+ // Now do any promotion required for cloning. Specifically, for each
+ // recorded ICP candidate (which was only recorded because one clone of that
+ // candidate should call a cloned target), we perform ICP (speculative
+ // devirtualization) for each clone of the callsite, and update its callee
+ // to the appropriate clone. Note that the ICP compares against the original
+ // version of the target, which is what is in the vtable.
+ for (auto &Info : ICallAnalysisInfo) {
+ auto *CB = Info.CB;
+ auto CallsiteIndex = Info.CallsiteInfoStartIndex;
+ auto TotalCount = Info.TotalCount;
+ unsigned NumPromoted = 0;
+ unsigned NumClones = 0;
+
+ for (auto &Candidate : Info.CandidateProfileData) {
+ auto &StackNode = AllCallsites[CallsiteIndex++];
+
+ // All calls in the same function must have the same number of clones.
+ assert(!NumClones || NumClones == StackNode.Clones.size());
+ NumClones = StackNode.Clones.size();
+
+ // See if the target is in the module. If it wasn't imported, it is
+ // possible that this profile could have been collected on a different
+ // target (or version of the code), and we need to be conservative
+ // (similar to what is done in the ICP pass).
+ Function *TargetFunction = Symtab->getFunction(Candidate.Value);
+ if (TargetFunction == nullptr || TargetFunction->isDeclaration()) {
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
+ << "Cannot promote indirect call: target with md5sum "
+ << ore::NV("target md5sum", Candidate.Value) << " not found";
+ });
+ // FIXME: See if we can use the new declaration importing support to
+ // at least get the declarations imported for this case. Hot indirect
+ // targets should have been imported normally, however.
+ continue;
+ }
- // Update TotalCount (all clones should get same count above)
- TotalCount -= Candidate.Count;
- NumPromoted++;
+ // Check if legal to promote
+ const char *Reason = nullptr;
+ if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
+ << "Cannot promote indirect call to "
+ << ore::NV("TargetFunction", TargetFunction)
+ << " with count of " << ore::NV("TotalCount", TotalCount)
+ << ": " << Reason;
+ });
+ continue;
}
- // Adjust the MD.prof metadata for all clones, now that we have the new
- // TotalCount and the number promoted.
+
+ assert(!isMemProfClone(*TargetFunction));
+
+ // Handle each call clone, applying ICP so that each clone directly
+ // calls the specified callee clone, guarded by the appropriate ICP
+ // check.
CallBase *CBClone = CB;
for (unsigned J = 0; J < NumClones; J++) {
// Copy 0 is the original function.
if (J > 0)
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
- // First delete the old one.
- CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
- // If all promoted, we don't need the MD.prof metadata.
- // Otherwise we need update with the un-promoted records back.
- if (TotalCount != 0)
- annotateValueSite(
- M, *CBClone,
- ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
- TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
+ // We do the promotion using the original name, so that the comparison
+ // is against the name in the vtable. Then just below, change the new
+ // direct call to call the cloned function.
+ auto &DirectCall =
+ pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
+ TotalCount, isSamplePGO, &ORE);
+ auto *TargetToUse = TargetFunction;
+ // Call original if this version calls the original version of its
+ // callee.
+ if (StackNode.Clones[J]) {
+ TargetToUse =
+ cast<Function>(M.getOrInsertFunction(
+ getMemProfFuncName(TargetFunction->getName(),
+ StackNode.Clones[J]),
+ TargetFunction->getFunctionType())
+ .getCallee());
+ }
+ DirectCall.setCalledFunction(TargetToUse);
+ ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
+ << ore::NV("Call", CBClone) << " in clone "
+ << ore::NV("Caller", CBClone->getFunction())
+ << " promoted and assigned to call function clone "
+ << ore::NV("Callee", TargetToUse));
}
+
+ // Update TotalCount (all clones should get same count above)
+ TotalCount -= Candidate.Count;
+ NumPromoted++;
+ }
+ // Adjust the MD.prof metadata for all clones, now that we have the new
+ // TotalCount and the number promoted.
+ CallBase *CBClone = CB;
+ for (unsigned J = 0; J < NumClones; J++) {
+ // Copy 0 is the original function.
+ if (J > 0)
+ CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
+ // First delete the old one.
+ CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
+ // If all promoted, we don't need the MD.prof metadata.
+ // Otherwise we need update with the un-promoted records back.
+ if (TotalCount != 0)
+ annotateValueSite(
+ M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
+ TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
}
}
-
- return Changed;
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
More information about the llvm-commits
mailing list