[llvm] 04f3c5a - Restore again "[MemProf] Context disambiguation cloning pass [patch 3/4]"
Teresa Johnson via llvm-commits
llvm-commits at lists.llvm.org
Fri May 5 13:27:46 PDT 2023
Author: Teresa Johnson
Date: 2023-05-05T13:27:33-07:00
New Revision: 04f3c5a71e8d6cd09843dc4d72be9d96f019c7cc
URL: https://github.com/llvm/llvm-project/commit/04f3c5a71e8d6cd09843dc4d72be9d96f019c7cc
DIFF: https://github.com/llvm/llvm-project/commit/04f3c5a71e8d6cd09843dc4d72be9d96f019c7cc.diff
LOG: Restore again "[MemProf] Context disambiguation cloning pass [patch 3/4]"
This reverts commit f09807ca9dda2f588298d8733e89a81105c88120, restoring
bfe7205975a63a605ff3faacd97fe4c1bf4c19b3 and follow on fix
e3e6bc699574550f2ed1de07f4e5bcdddaa65557, now that the nondeterminism
has been addressed by D149924.
Differential Revision: https://reviews.llvm.org/D141077
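For readers new to this series: the tests below all exercise the same basic
shape, where a single allocation site is reached through call paths with
different lifetimes. A source-level sketch of that pattern (modeled on the
memprof-basic test below; the exact helpers and the sleep-based cold access
are illustrative only, not part of this patch):

  #include <cstring>
  #include <unistd.h>

  char *bar() { return new char[10]; } // one alloc site, two contexts
  char *baz() { return bar(); }
  char *foo() { return baz(); }

  int main() {
    char *x = foo(); // context 1: freed promptly -> "notcold"
    char *y = foo(); // context 2: still live after a long sleep -> "cold"
    memset(x, 0, 10);
    delete[] x;
    sleep(10);
    memset(y, 0, 10); // accessed long after allocation
    delete[] y;
    return 0;
  }

After cloning, bar, baz and foo each get a ".memprof.1" clone for the cold
path, so the allocation call in each function version can carry a distinct
"memprof" attribute (see the REMARKS and IR checks in basic.ll below).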
Added:
Modified:
llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
llvm/test/ThinLTO/X86/memprof-basic.ll
llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll
llvm/test/ThinLTO/X86/memprof-indirectcall.ll
llvm/test/ThinLTO/X86/memprof-inlined.ll
llvm/test/Transforms/MemProfContextDisambiguation/basic.ll
llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll
llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning.ll
llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll
llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
index 475ea48cca932..13f3a7eb7ce3f 100644
--- a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
+++ b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
@@ -25,11 +25,14 @@ namespace llvm {
class GlobalValueSummary;
class Module;
class ModuleSummaryIndex;
+class OptimizationRemarkEmitter;
class MemProfContextDisambiguation
: public PassInfoMixin<MemProfContextDisambiguation> {
/// Run the context disambiguator on \p M; returns true if any changes were made.
- bool processModule(Module &M);
+ bool processModule(
+ Module &M,
+ function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
public:
MemProfContextDisambiguation() {}
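The new parameter uses llvm::function_ref, a non-owning view of a callable
that is cheap to pass by value but must not outlive the callable it wraps.
A minimal standalone sketch of that idiom (the applyTwice helper is
hypothetical, for illustration only):

  #include "llvm/ADT/STLFunctionalExtras.h"
  #include <iostream>

  // function_ref does not own the callable; the lambda below must stay
  // alive for the duration of the call.
  static int applyTwice(llvm::function_ref<int(int)> F, int X) {
    return F(F(X));
  }

  int main() {
    int Bias = 3;
    std::cout << applyTwice([&](int V) { return V + Bias; }, 4); // prints 10
    return 0;
  }

This is why run() in the .cpp below constructs OREGetter as a local lambda
over the function analysis manager and only passes it down for the duration
of processModule.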
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 9b371ea887ad9..d9a433e79b1be 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -28,8 +28,10 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
@@ -40,6 +42,7 @@
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/Cloning.h"
#include <sstream>
#include <vector>
using namespace llvm;
@@ -47,6 +50,13 @@ using namespace llvm::memprof;
#define DEBUG_TYPE "memprof-context-disambiguation"
+STATISTIC(FunctionClonesAnalysis,
+ "Number of function clones created during whole program analysis");
+STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
+ "cloned) during whole program analysis");
+STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
+ "during whole program analysis");
+
static cl::opt<std::string> DotFilePathPrefix(
"memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
cl::value_desc("filename"),
@@ -96,6 +106,13 @@ class CallsiteContextGraph {
/// behavior of an allocation based on its context.
void identifyClones();
+ /// Assign callsite clones to functions, cloning functions as needed to
+ /// accommodate the combinations of their callsite clones reached by callers.
+ /// For regular LTO this clones functions and callsites in the IR, but for
+ /// ThinLTO the cloning decisions are noted in the summaries and applied
+ /// later.
+ bool assignFunctions();
+
void dump() const;
void print(raw_ostream &OS) const;
@@ -376,6 +393,28 @@ class CallsiteContextGraph {
return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
}
+ /// Update the allocation call to record type of allocated memory.
+ void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
+ AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
+ static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
+ }
+
+ /// Update non-allocation call to invoke (possibly cloned) function
+ /// CalleeFunc.
+ void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
+ static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
+ }
+
+ /// Clone the given function for the given callsite, recording a mapping of
+ /// all of the function's tracked calls to their new versions in the CallMap.
+ /// Assigns new clones to clone number CloneNo.
+ FuncInfo cloneFunctionForCallsite(
+ FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
+ std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
+ return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
+ Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
+ }
+
/// Gets a label to use in the dot graph for the given call clone in the given
/// function.
std::string getLabel(const FuncTy *Func, const CallTy Call,
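The updateAllocationCall / updateCall / cloneFunctionForCallsite wrappers
above follow the CRTP static-dispatch pattern used throughout
CallsiteContextGraph: the base template forwards to the derived class
(ModuleCallsiteContextGraph or IndexCallsiteContextGraph) without virtual
calls. A minimal self-contained sketch of the idiom (names hypothetical):

  #include <iostream>

  template <typename Derived> struct GraphBase {
    // Forward to the derived implementation; resolved at compile time.
    void update() { static_cast<Derived *>(this)->updateImpl(); }
  };

  struct ModuleGraph : GraphBase<ModuleGraph> {
    void updateImpl() { std::cout << "update IR\n"; }
  };

  struct IndexGraph : GraphBase<IndexGraph> {
    void updateImpl() { std::cout << "update summary\n"; }
  };

  int main() {
    ModuleGraph().update(); // prints "update IR"
    IndexGraph().update();  // prints "update summary"
    return 0;
  }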
@@ -470,7 +509,9 @@ class ModuleCallsiteContextGraph
: public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
Instruction *> {
public:
- ModuleCallsiteContextGraph(Module &M);
+ ModuleCallsiteContextGraph(
+ Module &M,
+ function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
private:
friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
@@ -480,10 +521,19 @@ class ModuleCallsiteContextGraph
bool calleeMatchesFunc(Instruction *Call, const Function *Func);
uint64_t getLastStackId(Instruction *Call);
std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
+ void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
+ void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
+ CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
+ Instruction *>::FuncInfo
+ cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
+ std::map<CallInfo, CallInfo> &CallMap,
+ std::vector<CallInfo> &CallsWithMetadataInFunc,
+ unsigned CloneNo);
std::string getLabel(const Function *Func, const Instruction *Call,
unsigned CloneNo) const;
const Module &Mod;
+ function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
};
/// Represents a call in the summary index graph, which can either be an
@@ -529,6 +579,14 @@ class IndexCallsiteContextGraph
bool calleeMatchesFunc(IndexCall &Call, const FunctionSummary *Func);
uint64_t getLastStackId(IndexCall &Call);
std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
+ void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
+ void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
+ CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
+ IndexCall>::FuncInfo
+ cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
+ std::map<CallInfo, CallInfo> &CallMap,
+ std::vector<CallInfo> &CallsWithMetadataInFunc,
+ unsigned CloneNo);
std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
unsigned CloneNo) const;
@@ -1298,10 +1356,14 @@ uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
return Index.getStackIdAtIndex(CallsiteContext.back());
}
+static const std::string MemProfCloneSuffix = ".memprof.";
+
static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
+ // We use CloneNo == 0 to refer to the original version, which doesn't get
+ // renamed with a suffix.
if (!CloneNo)
return Base.str();
- return (Base + ".memprof." + Twine(CloneNo)).str();
+ return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
}
std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
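Clone number 0 always keeps the original symbol name; clone N gets a
".memprof.N" suffix. A standalone sketch of the same naming logic (the real
helper uses llvm::Twine to avoid temporary strings):

  #include <iostream>
  #include <string>

  static std::string memProfFuncName(const std::string &Base,
                                     unsigned CloneNo) {
    if (!CloneNo)
      return Base; // original version, no suffix
    return Base + ".memprof." + std::to_string(CloneNo);
  }

  int main() {
    std::cout << memProfFuncName("_Z3foov", 0) << "\n"; // _Z3foov
    std::cout << memProfFuncName("_Z3foov", 1) << "\n"; // _Z3foov.memprof.1
    return 0;
  }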
@@ -1363,7 +1425,9 @@ CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
return StackIds;
}
-ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(Module &M) : Mod(M) {
+ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
+ Module &M, function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
+ : Mod(M), OREGetter(OREGetter) {
for (auto &F : M) {
std::vector<CallInfo> CallsWithMetadata;
for (auto &BB : F) {
@@ -1677,7 +1741,7 @@ static void checkEdge(
template <typename DerivedCCG, typename FuncTy, typename CallTy>
static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
- bool CheckEdges = false) {
+ bool CheckEdges = true) {
if (Node->isRemoved())
return;
// Node's context ids should be the union of both its callee and caller edge
@@ -1717,7 +1781,7 @@ template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
for (const auto Node : nodes<GraphType>(this)) {
- checkNode<DerivedCCG, FuncTy, CallTy>(Node);
+ checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
for (auto &Edge : Node->CallerEdges)
checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
}
@@ -1941,12 +2005,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
NewEdge->Callee->CallerEdges.push_back(NewEdge);
}
if (VerifyCCG) {
- checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee);
- checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee);
+ checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
+ checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
- checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee);
+ checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
+ /*CheckEdges=*/false);
for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
- checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee);
+ checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
+ /*CheckEdges=*/false);
}
}
@@ -1961,7 +2027,7 @@ template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
if (VerifyNodes)
- checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/true);
+ checkNode<DerivedCCG, FuncTy, CallTy>(Node);
assert(!Node->CloneOf);
// If Node has a null call, then either it wasn't found in the module (regular
@@ -2115,7 +2181,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
for (auto *Clone : Node->Clones) {
removeNoneTypeCalleeEdges(Clone);
if (VerifyNodes)
- checkNode<DerivedCCG, FuncTy, CallTy>(Clone, /*CheckEdges=*/true);
+ checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
}
// We should still have some context ids on the original Node.
assert(!Node->ContextIds.empty());
@@ -2136,7 +2202,581 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
}));
if (VerifyNodes)
- checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/true);
+ checkNode<DerivedCCG, FuncTy, CallTy>(Node);
+}
+
+static std::string getAllocTypeAttributeString(AllocationType Type) {
+ switch (Type) {
+ case AllocationType::NotCold:
+ return "notcold";
+ break;
+ case AllocationType::Cold:
+ return "cold";
+ break;
+ default:
+ dbgs() << "Unexpected alloc type " << (uint8_t)Type;
+ assert(false);
+ }
+ llvm_unreachable("invalid alloc type");
+}
+
+void ModuleCallsiteContextGraph::updateAllocationCall(
+ CallInfo &Call, AllocationType AllocType) {
+ std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
+ auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
+ "memprof", AllocTypeString);
+ cast<CallBase>(Call.call())->addFnAttr(A);
+ OREGetter(Call.call()->getFunction())
+ .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
+ << ore::NV("AllocationCall", Call.call()) << " in clone "
+ << ore::NV("Caller", Call.call()->getFunction())
+ << " marked with memprof allocation attribute "
+ << ore::NV("Attribute", AllocTypeString));
+}
+
+void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
+ AllocationType AllocType) {
+ auto *AI = Call.call().dyn_cast<AllocInfo *>();
+ assert(AI);
+ assert(AI->Versions.size() > Call.cloneNo());
+ AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
+}
+
+void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
+ FuncInfo CalleeFunc) {
+ if (CalleeFunc.cloneNo() > 0)
+ cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
+ OREGetter(CallerCall.call()->getFunction())
+ .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
+ << ore::NV("Call", CallerCall.call()) << " in clone "
+ << ore::NV("Caller", CallerCall.call()->getFunction())
+ << " assigned to call function clone "
+ << ore::NV("Callee", CalleeFunc.func()));
+}
+
+void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
+ FuncInfo CalleeFunc) {
+ auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>();
+ assert(CI &&
+ "Caller cannot be an allocation which should not have profiled calls");
+ assert(CI->Clones.size() > CallerCall.cloneNo());
+ CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
+}
+
+CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
+ Instruction *>::FuncInfo
+ModuleCallsiteContextGraph::cloneFunctionForCallsite(
+ FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
+ std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
+ // Use existing LLVM facilities for cloning and obtaining Call in clone
+ ValueToValueMapTy VMap;
+ auto *NewFunc = CloneFunction(Func.func(), VMap);
+ std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
+ assert(!Func.func()->getParent()->getFunction(Name));
+ NewFunc->setName(Name);
+ for (auto &Inst : CallsWithMetadataInFunc) {
+ // This map always has the initial version in it.
+ assert(Inst.cloneNo() == 0);
+ CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
+ }
+ OREGetter(Func.func())
+ .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
+ << "created clone " << ore::NV("NewFunction", NewFunc));
+ return {NewFunc, CloneNo};
+}
+
+CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
+ IndexCall>::FuncInfo
+IndexCallsiteContextGraph::cloneFunctionForCallsite(
+ FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
+ std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
+ // Check how many clones we have of Call (and therefore function).
+ // The next clone number is the current size of versions array.
+ // Confirm this matches the CloneNo provided by the caller, which is based on
+ // the number of function clones we have.
+ assert(CloneNo ==
+ (Call.call().is<AllocInfo *>()
+ ? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
+ : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
+ // Walk all the instructions in this function. Create a new version for
+ // each (by adding an entry to the Versions/Clones summary array), and copy
+ // over the version being called for the function clone being cloned here.
+ // Additionally, add an entry to the CallMap for the new function clone,
+ // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
+ // to the new call clone.
+ for (auto &Inst : CallsWithMetadataInFunc) {
+ // This map always has the initial version in it.
+ assert(Inst.cloneNo() == 0);
+ if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) {
+ assert(AI->Versions.size() == CloneNo);
+ // We assign the allocation type later (in updateAllocationCall), just add
+ // an entry for it here.
+ AI->Versions.push_back(0);
+ } else {
+ auto *CI = Inst.call().dyn_cast<CallsiteInfo *>();
+ assert(CI && CI->Clones.size() == CloneNo);
+ // We assign the clone number later (in updateCall), just add an entry for
+ // it here.
+ CI->Clones.push_back(0);
+ }
+ CallMap[Inst] = {Inst.call(), CloneNo};
+ }
+ return {Func.func(), CloneNo};
+}
+
+// This method assigns cloned callsites to functions, cloning the functions as
+// needed. The assignment is greedy and proceeds roughly as follows:
+//
+// For each function Func:
+// For each call with graph Node having clones:
+// Initialize ClonesWorklist to Node and its clones
+// Initialize NodeCloneCount to 0
+// While ClonesWorklist is not empty:
+// Clone = pop front ClonesWorklist
+// NodeCloneCount++
+// If Func has been cloned less than NodeCloneCount times:
+// If NodeCloneCount is 1:
+// Assign Clone to original Func
+// Continue
+// Create a new function clone
+// If other callers not assigned to call a function clone yet:
+// Assign them to call new function clone
+// Continue
+// Assign any other caller calling the cloned version to new clone
+//
+// For each caller of Clone:
+// If caller is assigned to call a specific function clone:
+// If we cannot assign Clone to that function clone:
+// Create new callsite Clone NewClone
+// Add NewClone to ClonesWorklist
+// Continue
+// Assign Clone to existing caller's called function clone
+// Else:
+// If Clone not already assigned to a function clone:
+// Assign to first function clone without assignment
+// Assign caller to selected function clone
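+ //
+ // As a concrete walk-through (see the basic.ll test): the node for foo's
+ // call to baz has one clone, so ClonesWorklist holds the original node and
+ // that clone. The original is assigned to the original _Z3foov; the clone
+ // then finds fewer function clones than callsite clones, so
+ // _Z3foov.memprof.1 is created, and the cold call to foo in main is
+ // recorded as calling that new function clone.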
+template <typename DerivedCCG, typename FuncTy, typename CallTy>
+bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
+ bool Changed = false;
+
+ // Keep track of the assignment of nodes (callsites) to function clones they
+ // call.
+ DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
+
+ // Update caller node to call function version CalleeFunc, by recording the
+ // assignment in CallsiteToCalleeFuncCloneMap.
+ auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
+ const FuncInfo &CalleeFunc) {
+ assert(Caller->hasCall());
+ CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
+ };
+
+ // Walk each function for which we saw calls with memprof metadata, and
+ // handle cloning for each of its calls.
+ for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
+ FuncInfo OrigFunc(Func);
+ // Map from each clone of OrigFunc to a map of remappings of each call of
+ // interest (from original uncloned call to the corresponding cloned call in
+ // that function clone).
+ std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
+ for (auto &Call : CallsWithMetadata) {
+ ContextNode *Node = getNodeForInst(Call);
+ // Skip call if we do not have a node for it (all uses of its stack ids
+ // were either on inlined chains or pruned from the MIBs), or if we did
+ // not create any clones for it.
+ if (!Node || Node->Clones.empty())
+ continue;
+ assert(Node->hasCall() &&
+ "Not having a call should have prevented cloning");
+
+ // Track the assignment of function clones to clones of the current
+ // callsite Node being handled.
+ std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
+
+ // Assign callsite version CallsiteClone to function version FuncClone,
+ // and also assign (possibly cloned) Call to CallsiteClone.
+ auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
+ CallInfo &Call,
+ ContextNode *CallsiteClone,
+ bool IsAlloc) {
+ // Record the clone of callsite node assigned to this function clone.
+ FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
+
+ assert(FuncClonesToCallMap.count(FuncClone));
+ std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
+ CallInfo CallClone(Call);
+ if (CallMap.count(Call))
+ CallClone = CallMap[Call];
+ CallsiteClone->setCall(CallClone);
+ };
+
+ // Keep track of the clones of callsite Node that need to be assigned to
+ // function clones. This list may be expanded in the loop body below if we
+ // find additional cloning is required.
+ std::deque<ContextNode *> ClonesWorklist;
+ // Ignore original Node if we moved all of its contexts to clones.
+ if (!Node->ContextIds.empty())
+ ClonesWorklist.push_back(Node);
+ ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
+ Node->Clones.end());
+
+ // Now walk through all of the clones of this callsite Node that we need,
+ // and determine the assignment to a corresponding clone of the current
+ // function (creating new function clones as needed).
+ unsigned NodeCloneCount = 0;
+ while (!ClonesWorklist.empty()) {
+ ContextNode *Clone = ClonesWorklist.front();
+ ClonesWorklist.pop_front();
+ NodeCloneCount++;
+ if (VerifyNodes)
+ checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
+
+ // Need to create a new function clone if we have more callsite clones
+ // than existing function clones, which would have been assigned to an
+ // earlier clone in the list (we assign callsite clones to function
+ // clones greedily).
+ if (FuncClonesToCallMap.size() < NodeCloneCount) {
+ // If this is the first callsite copy, assign to original function.
+ if (NodeCloneCount == 1) {
+ // Since FuncClonesToCallMap is empty in this case, no clones have
+ // been created for this function yet, and no callers should have
+ // been assigned a function clone for this callee node yet.
+ assert(llvm::none_of(
+ Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
+ return CallsiteToCalleeFuncCloneMap.count(E->Caller);
+ }));
+ // Initialize with empty call map, assign Clone to original function
+ // and its callers, and skip to the next clone.
+ FuncClonesToCallMap[OrigFunc] = {};
+ AssignCallsiteCloneToFuncClone(
+ OrigFunc, Call, Clone,
+ AllocationCallToContextNodeMap.count(Call));
+ for (auto &CE : Clone->CallerEdges) {
+ // Ignore any caller that does not have a recorded callsite Call.
+ if (!CE->Caller->hasCall())
+ continue;
+ RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
+ }
+ continue;
+ }
+
+ // First locate which copy of OrigFunc to clone again. If a caller
+ // of this callsite clone was already assigned to call a particular
+ // function clone, we need to redirect all of those callers to the
+ // new function clone, and update their other callees within this
+ // function.
+ FuncInfo PreviousAssignedFuncClone;
+ auto EI = llvm::find_if(
+ Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
+ return CallsiteToCalleeFuncCloneMap.count(E->Caller);
+ });
+ bool CallerAssignedToCloneOfFunc = false;
+ if (EI != Clone->CallerEdges.end()) {
+ const std::shared_ptr<ContextEdge> &Edge = *EI;
+ PreviousAssignedFuncClone =
+ CallsiteToCalleeFuncCloneMap[Edge->Caller];
+ CallerAssignedToCloneOfFunc = true;
+ }
+
+ // Clone function and save it along with the CallInfo map created
+ // during cloning in the FuncClonesToCallMap.
+ std::map<CallInfo, CallInfo> NewCallMap;
+ unsigned CloneNo = FuncClonesToCallMap.size();
+ assert(CloneNo > 0 && "Clone 0 is the original function, which "
+ "should already exist in the map");
+ FuncInfo NewFuncClone = cloneFunctionForCallsite(
+ OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
+ FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
+ FunctionClonesAnalysis++;
+ Changed = true;
+
+ // If no caller callsites were already assigned to a clone of this
+ // function, we can simply assign this clone to the new func clone
+ // and update all callers to it, then skip to the next clone.
+ if (!CallerAssignedToCloneOfFunc) {
+ AssignCallsiteCloneToFuncClone(
+ NewFuncClone, Call, Clone,
+ AllocationCallToContextNodeMap.count(Call));
+ for (auto &CE : Clone->CallerEdges) {
+ // Ignore any caller that does not have a recorded callsite Call.
+ if (!CE->Caller->hasCall())
+ continue;
+ RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
+ }
+ continue;
+ }
+
+ // We may need to do additional node cloning in this case.
+ // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
+ // that were previously assigned to call PreviousAssignedFuncClone,
+ // to record that they now call NewFuncClone.
+ for (auto CE : Clone->CallerEdges) {
+ // Ignore any caller that does not have a recorded callsite Call.
+ if (!CE->Caller->hasCall())
+ continue;
+
+ if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
+ // We subsequently fall through to later handling that
+ // will perform any additional cloning required for
+ // callers that were calling other function clones.
+ CallsiteToCalleeFuncCloneMap[CE->Caller] !=
+ PreviousAssignedFuncClone)
+ continue;
+
+ RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
+
+ // If we are cloning a function that was already assigned to some
+ // callers, then essentially we are creating new callsite clones
+ // of the other callsites in that function that are reached by those
+ // callers. Clone the other callees of the current callsite's caller
+ // that were already assigned to PreviousAssignedFuncClone
+ // accordingly. This is important since we subsequently update the
+ // calls from the nodes in the graph and their assignments to callee
+ // functions recorded in CallsiteToCalleeFuncCloneMap.
+ for (auto CalleeEdge : CE->Caller->CalleeEdges) {
+ // Skip any that have been removed on an earlier iteration when
+ // cleaning up newly None type callee edges.
+ if (!CalleeEdge)
+ continue;
+ ContextNode *Callee = CalleeEdge->Callee;
+ // Skip the current callsite; we are looking for other
+ // callsites Caller calls, as well as any that does not have a
+ // recorded callsite Call.
+ if (Callee == Clone || !Callee->hasCall())
+ continue;
+ ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
+ removeNoneTypeCalleeEdges(NewClone);
+ // Moving the edge may have resulted in some none type
+ // callee edges on the original Callee.
+ removeNoneTypeCalleeEdges(Callee);
+ assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
+ // If the Callee node was already assigned to call a specific
+ // function version, make sure its new clone is assigned to call
+ // that same function clone.
+ if (CallsiteToCalleeFuncCloneMap.count(Callee))
+ RecordCalleeFuncOfCallsite(
+ NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
+ // Update NewClone with the new Call clone of this callsite's Call
+ // created for the new function clone created earlier.
+ // Recall that we have already ensured when building the graph
+ // that each caller can only call callsites within the same
+ // function, so we are guaranteed that Callee Call is in the
+ // current OrigFunc.
+ // CallMap is set up as indexed by original Call at clone 0.
+ CallInfo OrigCall(Callee->getOrigNode()->Call);
+ OrigCall.setCloneNo(0);
+ std::map<CallInfo, CallInfo> &CallMap =
+ FuncClonesToCallMap[NewFuncClone];
+ assert(CallMap.count(OrigCall));
+ CallInfo NewCall(CallMap[OrigCall]);
+ assert(NewCall);
+ NewClone->setCall(NewCall);
+ }
+ }
+ // Fall through to handling below to perform the recording of the
+ // function for this callsite clone. This enables handling of cases
+ // where the callers were assigned to different clones of a function.
+ }
+
+ // See if we can use existing function clone. Walk through
+ // all caller edges to see if any have already been assigned to
+ // a clone of this callsite's function. If we can use it, do so. If not,
+ // because that function clone is already assigned to a different clone
+ // of this callsite, then we need to clone again.
+ // Basically, this checking is needed to handle the case where different
+ // caller functions/callsites may need versions of this function
+ // containing different mixes of callsite clones across the different
+ // callsites within the function. If that happens, we need to create
+ // additional function clones to handle the various combinations.
+ //
+ // Keep track of any new clones of this callsite created by the
+ // following loop, as well as any existing clone that we decided to
+ // assign this clone to.
+ std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
+ FuncInfo FuncCloneAssignedToCurCallsiteClone;
+ // We need to be able to remove Edge from CallerEdges, so need to adjust
+ // iterator in the loop.
+ for (auto EI = Clone->CallerEdges.begin();
+ EI != Clone->CallerEdges.end();) {
+ auto Edge = *EI;
+ // Ignore any caller that does not have a recorded callsite Call.
+ if (!Edge->Caller->hasCall()) {
+ EI++;
+ continue;
+ }
+ // If this caller already assigned to call a version of OrigFunc, need
+ // to ensure we can assign this callsite clone to that function clone.
+ if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
+ FuncInfo FuncCloneCalledByCaller =
+ CallsiteToCalleeFuncCloneMap[Edge->Caller];
+ // First we need to confirm that this function clone is available
+ // for use by this callsite node clone.
+ //
+ // While FuncCloneToCurNodeCloneMap is built only for this Node and
+ // its callsite clones, one of those callsite clones X could have
+ // been assigned to the same function clone called by Edge's caller
+ // - if Edge's caller calls another callsite within Node's original
+ // function, and that callsite has another caller reaching clone X.
+ // We need to clone Node again in this case.
+ if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
+ FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
+ Clone) ||
+ // Detect when we have multiple callers of this callsite that
+ // have already been assigned to specific, and different, clones
+ // of OrigFunc (due to other unrelated callsites in Func they
+ // reach via call contexts). Is this Clone of callsite Node
+ // assigned to a different clone of OrigFunc? If so, clone Node
+ // again.
+ (FuncCloneAssignedToCurCallsiteClone &&
+ FuncCloneAssignedToCurCallsiteClone !=
+ FuncCloneCalledByCaller)) {
+ // We need to use a different newly created callsite clone, in
+ // order to assign it to another new function clone on a
+ // subsequent iteration over the Clones array (adjusted below).
+ // Note we specifically do not reset the
+ // CallsiteToCalleeFuncCloneMap entry for this caller, so that
+ // when this new clone is processed later we know which version of
+ // the function to copy (so that other callsite clones we have
+ // assigned to that function clone are properly cloned over). See
+ // comments in the function cloning handling earlier.
+
+ // Check if we already have cloned this callsite again while
+ // walking through caller edges, for a caller calling the same
+ // function clone. If so, we can move this edge to that new clone
+ // rather than creating yet another new clone.
+ if (FuncCloneToNewCallsiteCloneMap.count(
+ FuncCloneCalledByCaller)) {
+ ContextNode *NewClone =
+ FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
+ moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
+ // Cleanup any none type edges cloned over.
+ removeNoneTypeCalleeEdges(NewClone);
+ } else {
+ // Create a new callsite clone.
+ ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
+ removeNoneTypeCalleeEdges(NewClone);
+ FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
+ NewClone;
+ // Add to list of clones and process later.
+ ClonesWorklist.push_back(NewClone);
+ assert(EI == Clone->CallerEdges.end() ||
+ Clone->AllocTypes != (uint8_t)AllocationType::None);
+ assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
+ }
+ // Moving the caller edge may have resulted in some none type
+ // callee edges.
+ removeNoneTypeCalleeEdges(Clone);
+ // We will handle the newly created callsite clone in a subsequent
+ // iteration over this Node's Clones. Continue here since we
+ // already adjusted iterator EI while moving the edge.
+ continue;
+ }
+
+ // Otherwise, we can use the function clone already assigned to this
+ // caller.
+ if (!FuncCloneAssignedToCurCallsiteClone) {
+ FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
+ // Assign Clone to FuncCloneCalledByCaller
+ AssignCallsiteCloneToFuncClone(
+ FuncCloneCalledByCaller, Call, Clone,
+ AllocationCallToContextNodeMap.count(Call));
+ } else
+ // Don't need to do anything - callsite is already calling this
+ // function clone.
+ assert(FuncCloneAssignedToCurCallsiteClone ==
+ FuncCloneCalledByCaller);
+
+ } else {
+ // We have not already assigned this caller to a version of
+ // OrigFunc. Do the assignment now.
+
+ // First check if we have already assigned this callsite clone to a
+ // clone of OrigFunc for another caller during this iteration over
+ // its caller edges.
+ if (!FuncCloneAssignedToCurCallsiteClone) {
+ // Find first function in FuncClonesToCallMap without an assigned
+ // clone of this callsite Node. We should always have one
+ // available at this point due to the earlier cloning when the
+ // FuncClonesToCallMap size was smaller than the clone number.
+ for (auto &CF : FuncClonesToCallMap) {
+ if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
+ FuncCloneAssignedToCurCallsiteClone = CF.first;
+ break;
+ }
+ }
+ assert(FuncCloneAssignedToCurCallsiteClone);
+ // Assign Clone to FuncCloneAssignedToCurCallsiteClone
+ AssignCallsiteCloneToFuncClone(
+ FuncCloneAssignedToCurCallsiteClone, Call, Clone,
+ AllocationCallToContextNodeMap.count(Call));
+ } else
+ assert(FuncCloneToCurNodeCloneMap
+ [FuncCloneAssignedToCurCallsiteClone] == Clone);
+ // Update callers to record function version called.
+ RecordCalleeFuncOfCallsite(Edge->Caller,
+ FuncCloneAssignedToCurCallsiteClone);
+ }
+
+ EI++;
+ }
+ }
+ if (VerifyCCG) {
+ checkNode<DerivedCCG, FuncTy, CallTy>(Node);
+ for (const auto &PE : Node->CalleeEdges)
+ checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
+ for (const auto &CE : Node->CallerEdges)
+ checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
+ for (auto *Clone : Node->Clones) {
+ checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
+ for (const auto &PE : Clone->CalleeEdges)
+ checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
+ for (const auto &CE : Clone->CallerEdges)
+ checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
+ }
+ }
+ }
+ }
+
+ auto UpdateCalls = [&](ContextNode *Node,
+ DenseSet<const ContextNode *> &Visited,
+ auto &&UpdateCalls) {
+ auto Inserted = Visited.insert(Node);
+ if (!Inserted.second)
+ return;
+
+ for (auto *Clone : Node->Clones)
+ UpdateCalls(Clone, Visited, UpdateCalls);
+
+ for (auto &Edge : Node->CallerEdges)
+ UpdateCalls(Edge->Caller, Visited, UpdateCalls);
+
+ // Skip if either no call to update, or if we ended up with no context ids
+ // (we moved all edges onto other clones).
+ if (!Node->hasCall() || Node->ContextIds.empty())
+ return;
+
+ if (Node->IsAllocation) {
+ updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
+ return;
+ }
+
+ if (!CallsiteToCalleeFuncCloneMap.count(Node))
+ return;
+
+ auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
+ updateCall(Node->Call, CalleeFunc);
+ };
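UpdateCalls above is a generic lambda that recurses by taking itself as its
final parameter, since a lambda cannot name itself. A minimal standalone
sketch of that idiom (factorial is just a stand-in example):

  #include <iostream>

  int main() {
    // The trailing auto&& parameter is the lambda itself, enabling
    // recursion without std::function overhead.
    auto Fact = [](unsigned N, auto &&Self) -> unsigned {
      return N <= 1 ? 1 : N * Self(N - 1, Self);
    };
    std::cout << Fact(5, Fact) << "\n"; // prints 120
    return 0;
  }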
+
+ // Performs DFS traversal starting from allocation nodes to update calls to
+ // reflect cloning decisions recorded earlier. For regular LTO this will
+ // update the actual calls in the IR to call the appropriate function clone
+ // (and add attributes to allocation calls), whereas for ThinLTO the decisions
+ // are recorded in the summary entries.
+ DenseSet<const ContextNode *> Visited;
+ for (auto &Entry : AllocationCallToContextNodeMap)
+ UpdateCalls(Entry.second, Visited, UpdateCalls);
+
+ return Changed;
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
@@ -2165,13 +2805,24 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
if (ExportToDot)
exportToDot("cloned");
- return false;
+ bool Changed = assignFunctions();
+
+ if (DumpCCG) {
+ dbgs() << "CCG after assigning function clones:\n";
+ dbgs() << *this;
+ }
+ if (ExportToDot)
+ exportToDot("clonefuncassign");
+
+ return Changed;
}
-bool MemProfContextDisambiguation::processModule(Module &M) {
+bool MemProfContextDisambiguation::processModule(
+ Module &M,
+ function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
bool Changed = false;
- ModuleCallsiteContextGraph CCG(M);
+ ModuleCallsiteContextGraph CCG(M, OREGetter);
Changed = CCG.process();
return Changed;
@@ -2179,7 +2830,11 @@ bool MemProfContextDisambiguation::processModule(Module &M) {
PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
ModuleAnalysisManager &AM) {
- if (!processModule(M))
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
+ return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
+ };
+ if (!processModule(M, OREGetter))
return PreservedAnalyses::all();
return PreservedAnalyses::none();
}
diff --git a/llvm/test/ThinLTO/X86/memprof-basic.ll b/llvm/test/ThinLTO/X86/memprof-basic.ll
index 4df89cdb12afd..11378cf5bef47 100644
--- a/llvm/test/ThinLTO/X86/memprof-basic.ll
+++ b/llvm/test/ThinLTO/X86/memprof-basic.ll
@@ -42,13 +42,35 @@
; RUN: -r=%t.o,_Znam, \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN: --check-prefix=STATS
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
;; We should have cloned bar, baz, and foo, for the cold memory allocation.
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN: -thinlto-distributed-indexes \
+; RUN: -r=%t.o,main,plx \
+; RUN: -r=%t.o,_ZdaPv, \
+; RUN: -r=%t.o,sleep, \
+; RUN: -r=%t.o,_Znam, \
+; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \
+; RUN: -stats -pass-remarks=memprof-context-disambiguation \
+; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN: --check-prefix=STATS
+
+; RUN: cat %t2.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
+;; We should have cloned bar, baz, and foo, for the cold memory allocation.
+; RUN: cat %t2.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+
+;; Check distributed index
+; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB
+
source_filename = "memprof-basic.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@@ -230,6 +252,11 @@ uselistorder ptr @_Z3foov, { 1, 0 }
; DUMP: Clone of [[BAR]]
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
+
; DOT: digraph "postbuild" {
; DOT: label="postbuild";
; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> alloc}"];
@@ -261,3 +288,9 @@ uselistorder ptr @_Z3foov, { 1, 0 }
; DOTCLONED: Node[[BAZ2]] -> Node[[BAR2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan"];
; DOTCLONED: Node[[BAR2]] [shape=record,tooltip="N[[BAR2]] ContextIds: 2",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z3barv -\> alloc}"];
; DOTCLONED: }
+
+
+; DISTRIB: ^[[BAZ:[0-9]+]] = gv: (guid: 5878270615442837395, {{.*}} callsites: ((callee: ^[[BAR:[0-9]+]], clones: (0, 1)
+; DISTRIB: ^[[FOO:[0-9]+]] = gv: (guid: 6731117468105397038, {{.*}} callsites: ((callee: ^[[BAZ]], clones: (0, 1)
+; DISTRIB: ^[[BAR]] = gv: (guid: 9832687305761716512, {{.*}} allocs: ((versions: (notcold, cold)
+; DISTRIB: ^[[MAIN:[0-9]+]] = gv: (guid: 15822663052811949562, {{.*}} callsites: ((callee: ^[[FOO]], clones: (0), {{.*}} (callee: ^[[FOO]], clones: (1)
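Reading the DISTRIB lines above: "allocs: ((versions: (notcold, cold)" records
the allocation type assigned to each clone of bar (clone 0 notcold, clone 1
cold), filled in by updateAllocationCall, while "callsites: ((callee: ...,
clones: (0, 1)" records which clone number of the callee each callsite clone
should call, filled in by updateCall. As the assignFunctions documentation
above notes, for ThinLTO these decisions are only recorded in the summaries
here and applied later.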
diff --git a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
index 12e2fc39b5f5e..7f7447eaf58e4 100644
--- a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
+++ b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
@@ -1,7 +1,8 @@
;; Test callsite context graph generation for call graph with MIBs
;; that have pruned contexts that partially match multiple inlined
;; callsite contexts, requiring duplication of context ids and nodes
-;; while matching callsite nodes onto the graph.
+;; while matching callsite nodes onto the graph. Also tests graph and IR
+;; cloning.
;;
;; Original code looks like:
;;
@@ -63,7 +64,9 @@
; RUN: -r=%t.o,_Znam, \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN: --check-prefix=STATS
; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST
@@ -71,6 +74,27 @@
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN: -thinlto-distributed-indexes \
+; RUN: -r=%t.o,main,plx \
+; RUN: -r=%t.o,_ZdaPv, \
+; RUN: -r=%t.o,sleep, \
+; RUN: -r=%t.o,_Znam, \
+; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \
+; RUN: -stats -pass-remarks=memprof-context-disambiguation \
+; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN: --check-prefix=STATS
+
+; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE
+; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST
+;; We should clone D once for the cold allocations via C.
+; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+
+;; Check distributed index
+; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB
+
source_filename = "duplicate-context-ids.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@@ -107,7 +131,13 @@ entry:
ret ptr null
}
-declare i32 @main()
+define i32 @main() {
+entry:
+ call ptr @_Z1Bv()
+ call ptr @_Z1Ev()
+ call ptr @_Z1Fv()
+ ret i32 0
+}
declare void @_ZdaPv()
@@ -271,6 +301,11 @@ declare i32 @sleep()
; DUMP: Clone of [[D]]
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
+
; DOTPRE: digraph "prestackupdate" {
; DOTPRE: label="prestackupdate";
; DOTPRE: Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z1Dv -\> alloc}"];
@@ -308,3 +343,9 @@ declare i32 @sleep()
; DOTCLONED: Node[[E]] -> Node[[D2]][tooltip="ContextIds: 1",fillcolor="cyan"];
; DOTCLONED: Node[[D2]] [shape=record,tooltip="N[[D2]] ContextIds: 1 3 4",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z1Dv -\> alloc}"];
; DOTCLONED: }
+
+; DISTRIB: ^[[C:[0-9]+]] = gv: (guid: 1643923691937891493, {{.*}} callsites: ((callee: ^[[D:[0-9]+]], clones: (1)
+; DISTRIB: ^[[D]] = gv: (guid: 4881081444663423788, {{.*}} allocs: ((versions: (notcold, cold)
+; DISTRIB: ^[[B:[0-9]+]] = gv: (guid: 14590037969532473829, {{.*}} callsites: ((callee: ^[[D]], clones: (1)
+; DISTRIB: ^[[F:[0-9]+]] = gv: (guid: 17035303613541779335, {{.*}} callsites: ((callee: ^[[D]], clones: (0)
+; DISTRIB: ^[[E:[0-9]+]] = gv: (guid: 17820708772846654376, {{.*}} callsites: ((callee: ^[[D]], clones: (1)
diff --git a/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll b/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll
index 38bfe9d1e0170..54aad0dc94ac0 100644
--- a/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll
+++ b/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll
@@ -45,6 +45,9 @@
;;
;; The IR was then reduced using llvm-reduce with the expected FileCheck input.
+;; -stats requires asserts
+; REQUIRES: asserts
+
; RUN: opt -thinlto-bc %s >%t.o
; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
@@ -53,7 +56,9 @@
; RUN: -r=%t.o,sleep, \
; RUN: -r=%t.o,_Znam, \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
-; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN: --check-prefix=STATS
;; Try again but with distributed ThinLTO
@@ -64,7 +69,9 @@
; RUN: -r=%t.o,sleep, \
; RUN: -r=%t.o,_Znam, \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
-; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -stats -pass-remarks=memprof-context-disambiguation \
+; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN: --check-prefix=STATS
source_filename = "funcassigncloning.ll"
@@ -221,3 +228,8 @@ uselistorder ptr @_Znam, { 1, 0 }
; DUMP: CallerEdges:
; DUMP: Edge from Callee [[ENEW2CLONE]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5
; DUMP: Clone of [[ENEW2ORIG]]
+
+
+; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
diff --git a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll
index bd9f5e9250592..c311d6243688f 100644
--- a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll
+++ b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll
@@ -1,7 +1,7 @@
;; Tests callsite context graph generation for call graph containing indirect
;; calls. Currently this should result in conservative behavior, such that the
;; indirect call receives a null call in its graph node, to prevent subsequent
-;; cloning.
+;; cloning. Also tests graph and IR cloning.
;;
;; Original code looks like:
;;
@@ -64,7 +64,9 @@
; RUN: -r=%t.o,_ZTVN10__cxxabiv117__class_type_infoE, \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN: --check-prefix=STATS
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
;; We should only create a single clone of foo, for the direct call
@@ -72,6 +74,26 @@
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN: -thinlto-distributed-indexes \
+; RUN: -r=%t.o,main,plx \
+; RUN: -r=%t.o,_ZdaPv, \
+; RUN: -r=%t.o,sleep, \
+; RUN: -r=%t.o,_Znam, \
+; RUN: -r=%t.o,_ZTVN10__cxxabiv120__si_class_type_infoE, \
+; RUN: -r=%t.o,_ZTVN10__cxxabiv117__class_type_infoE, \
+; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \
+; RUN: -stats -pass-remarks=memprof-context-disambiguation \
+; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN: --check-prefix=STATS
+
+; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
+;; We should only create a single clone of foo, for the direct call
+;; from main allocating cold memory.
+; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+
source_filename = "indirectcall.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@@ -362,6 +384,11 @@ uselistorder ptr @_Z3foov, { 3, 2, 1, 0 }
; DUMP: Clone of [[FOO]]
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
+
; DOT: digraph "postbuild" {
; DOT: label="postbuild";
; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 6",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3foov -\> alloc}"];
diff --git a/llvm/test/ThinLTO/X86/memprof-inlined.ll b/llvm/test/ThinLTO/X86/memprof-inlined.ll
index e87168b4e3f92..27eab8a5bcd20 100644
--- a/llvm/test/ThinLTO/X86/memprof-inlined.ll
+++ b/llvm/test/ThinLTO/X86/memprof-inlined.ll
@@ -1,6 +1,7 @@
;; Test callsite context graph generation for call graph with two memprof
;; contexts and partial inlining, requiring generation of a new fused node to
;; represent the inlined sequence while matching callsite nodes onto the graph.
+;; Also tests graph and IR cloning.
;;
;; Original code looks like:
;;
@@ -51,7 +52,9 @@
; RUN: -r=%t.o,_Znam, \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN: --check-prefix=STATS
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
;; We should create clones for foo and bar for the call from main to allocate
@@ -59,6 +62,24 @@
; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN: -thinlto-distributed-indexes \
+; RUN: -r=%t.o,main,plx \
+; RUN: -r=%t.o,_ZdaPv, \
+; RUN: -r=%t.o,sleep, \
+; RUN: -r=%t.o,_Znam, \
+; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \
+; RUN: -stats -pass-remarks=memprof-context-disambiguation \
+; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN: --check-prefix=STATS
+
+; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
+;; We should create clones for foo and bar for the call from main to allocate
+;; cold memory.
+; RUN: cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+
source_filename = "inlined.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@@ -260,6 +281,11 @@ declare i32 @sleep()
; DUMP: Clone of [[BAR]]
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 2 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
+
; DOT: digraph "postbuild" {
; DOT: label="postbuild";
; DOT: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3bazv -\> alloc}"];
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll
index 99a8d68a5b1d2..bd938754ce9d0 100644
--- a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll
@@ -1,5 +1,5 @@
;; Test callsite context graph generation for simple call graph with
-;; two memprof contexts and no inlining.
+;; two memprof contexts and no inlining, as well as graph and IR cloning.
;;
;; Original code looks like:
;;
@@ -37,7 +37,9 @@
; RUN: opt -passes=memprof-context-disambiguation \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -stats -pass-remarks=memprof-context-disambiguation \
+; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \
+; RUN: --check-prefix=STATS --check-prefix=REMARKS
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
;; We should have cloned bar, baz, and foo, for the cold memory allocation.
@@ -225,6 +227,48 @@ attributes #6 = { builtin }
; DUMP: Clone of [[BAR]]
+; REMARKS: created clone _Z3barv.memprof.1
+; REMARKS: created clone _Z3bazv.memprof.1
+; REMARKS: created clone _Z3foov.memprof.1
+; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1
+; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3bazv.memprof.1
+; REMARKS: call in clone _Z3bazv.memprof.1 assigned to call function clone _Z3barv.memprof.1
+; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone main assigned to call function clone _Z3foov
+; REMARKS: call in clone _Z3foov assigned to call function clone _Z3bazv
+; REMARKS: call in clone _Z3bazv assigned to call function clone _Z3barv
+; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold
+
+
+; IR: define {{.*}} @main
+;; The first call to foo does not allocate cold memory. It should call the
+;; original functions, which ultimately call the original allocation decorated
+;; with a "notcold" attribute.
+; IR: call {{.*}} @_Z3foov()
+;; The second call to foo allocates cold memory. It should call cloned functions
+;; which ultimately call a cloned allocation decorated with a "cold" attribute.
+; IR: call {{.*}} @_Z3foov.memprof.1()
+; IR: define internal {{.*}} @_Z3barv()
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3bazv()
+; IR: call {{.*}} @_Z3barv()
+; IR: define internal {{.*}} @_Z3foov()
+; IR: call {{.*}} @_Z3bazv()
+; IR: define internal {{.*}} @_Z3barv.memprof.1()
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3bazv.memprof.1()
+; IR: call {{.*}} @_Z3barv.memprof.1()
+; IR: define internal {{.*}} @_Z3foov.memprof.1()
+; IR: call {{.*}} @_Z3bazv.memprof.1()
+; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { builtin "memprof"="cold" }
+
+
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
+
; DOT: digraph "postbuild" {
; DOT: label="postbuild";
; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"];
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll b/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll
index 143f892c18950..1f23ad3c6a51b 100644
--- a/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll
@@ -1,7 +1,8 @@
;; Test callsite context graph generation for call graph with MIBs
;; that have pruned contexts that partially match multiple inlined
;; callsite contexts, requiring duplication of context ids and nodes
-;; while matching callsite nodes onto the graph.
+;; while matching callsite nodes onto the graph. Also tests graph and IR
+;; cloning.
;;
;; Original code looks like:
;;
@@ -58,7 +59,9 @@
; RUN: opt -passes=memprof-context-disambiguation \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -stats -pass-remarks=memprof-context-disambiguation \
+; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \
+; RUN: --check-prefix=STATS --check-prefix=REMARKS
; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST
@@ -266,6 +269,39 @@ attributes #6 = { builtin }
; DUMP: Edge from Callee [[D2]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4
; DUMP: Clone of [[D]]
+; REMARKS: created clone _Z1Dv.memprof.1
+; REMARKS: call in clone _Z1Ev assigned to call function clone _Z1Dv.memprof.1
+; REMARKS: call in clone _Z1Cv assigned to call function clone _Z1Dv.memprof.1
+; REMARKS: call in clone _Z1Bv assigned to call function clone _Z1Dv.memprof.1
+; REMARKS: call in clone _Z1Dv.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone _Z1Fv assigned to call function clone _Z1Dv
+; REMARKS: call in clone _Z1Dv marked with memprof allocation attribute notcold
+
+
+;; The call through F does not allocate cold memory. It should call the
+;; original D, which ultimately calls the original allocation decorated
+;; with a "notcold" attribute.
+; IR: define internal {{.*}} @_Z1Dv()
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z1Fv()
+; IR: call {{.*}} @_Z1Dv()
+;; The calls through B and E allocate cold memory. They should call the
+;; cloned D, which ultimately calls the cloned allocation decorated with a
+;; "cold" attribute.
+; IR: define internal {{.*}} @_Z1Bv()
+; IR: call {{.*}} @_Z1Dv.memprof.1()
+; IR: define internal {{.*}} @_Z1Ev()
+; IR: call {{.*}} @_Z1Dv.memprof.1()
+; IR: define internal {{.*}} @_Z1Dv.memprof.1()
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
+; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { builtin "memprof"="cold" }
+
+
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis
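A hedged C++ sketch of the caller/callee shape implied by these checks (illustrative only; the real test also involves the inlining and pruned contexts described at the top of the file, which this sketch does not reproduce):

    char *D() { return new char[10]; } // cloned: D (notcold) and D.memprof.1 (cold)
    char *B() { return D(); }          // cold context: redirected to D.memprof.1
    char *C() { return D(); }          // cold context: redirected to D.memprof.1
    char *E() { return D(); }          // cold context: redirected to D.memprof.1
    char *F() { return D(); }          // not-cold context: keeps the original D

Only one function clone is needed here, hence the final STATS count of 1.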
+
; DOTPRE: digraph "prestackupdate" {
; DOTPRE: label="prestackupdate";
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning.ll b/llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning.ll
index 3d9efa5ec13f9..b94e9b855b747 100644
--- a/llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning.ll
@@ -45,9 +45,14 @@
;;
;; The IR was then reduced using llvm-reduce with the expected FileCheck input.
+;; -stats requires asserts
+; REQUIRES: asserts
+
; RUN: opt -passes=memprof-context-disambiguation \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
-; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -stats -pass-remarks=memprof-context-disambiguation \
+; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \
+; RUN: --check-prefix=STATS --check-prefix=REMARKS
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
@@ -187,3 +192,56 @@ attributes #6 = { builtin }
; DUMP: CallerEdges:
; DUMP: Edge from Callee [[ENEW2CLONE]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5
; DUMP: Clone of [[ENEW2ORIG]]
+
+
+;; We greedily create a clone of E that is initially used by the clones of the
+;; first call to new. However, we end up with an incompatible set of callers
+;; given the second call to new, which has clones with a different combination
+;; of callers. Eventually, we create 2 more clones, and the first clone becomes
+;; dead.
+; REMARKS: created clone _Z1EPPcS0_.memprof.1
+; REMARKS: created clone _Z1EPPcS0_.memprof.2
+; REMARKS: created clone _Z1EPPcS0_.memprof.3
+; REMARKS: call in clone _Z1DPPcS0_ assigned to call function clone _Z1EPPcS0_.memprof.2
+; REMARKS: call in clone _Z1EPPcS0_.memprof.2 marked with memprof allocation attribute cold
+; REMARKS: call in clone _Z1CPPcS0_ assigned to call function clone _Z1EPPcS0_.memprof.3
+; REMARKS: call in clone _Z1EPPcS0_.memprof.3 marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z1BPPcS0_ assigned to call function clone _Z1EPPcS0_
+; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z1EPPcS0_.memprof.2 marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z1EPPcS0_.memprof.3 marked with memprof allocation attribute cold
+; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold
+
+
+;; The original version of E is used for the non-cold allocations, both reached from B.
+; IR: define internal {{.*}} @_Z1EPPcS0_(
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]]
+; IR: define internal {{.*}} @_Z1BPPcS0_(
+; IR: call {{.*}} @_Z1EPPcS0_(
+;; C calls a clone of E with the first new allocating non-cold memory and the
+;; second allocating cold memory.
+; IR: define internal {{.*}} @_Z1CPPcS0_(
+; IR: call {{.*}} @_Z1EPPcS0_.memprof.3(
+;; D calls a clone of E with the first new allocating cold memory and the
+;; second allocating non-cold memory.
+; IR: define internal {{.*}} @_Z1DPPcS0_(
+; IR: call {{.*}} @_Z1EPPcS0_.memprof.2(
+;; Transient clone that will get removed as it ends up with no callers.
+;; Its calls to new never get updated with a memprof attribute as a result.
+; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.1(
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[DEFAULT:[0-9]+]]
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[DEFAULT]]
+; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.2(
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]]
+; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.3(
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]]
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD]]
+; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" }
+; IR: attributes #[[DEFAULT]] = { builtin }
+; IR: attributes #[[COLD]] = { builtin "memprof"="cold" }
+
+
+; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
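To see why three clones are created and one goes dead, consider a hedged sketch of the caller matrix the checks imply (signatures simplified from the test's char** parameters; names match the mangled symbols above):

    // Each caller needs a (first new, second new) attribute pair:
    //   B: (notcold, notcold)  C: (notcold, cold)  D: (cold, notcold)
    void E(char **p, char **q) {
      *p = new char[10]; // first new
      *q = new char[10]; // second new
    }
    void B(char **p, char **q) { E(p, q); } // keeps the original E
    void C(char **p, char **q) { E(p, q); } // gets E.memprof.3
    void D(char **p, char **q) { E(p, q); } // gets E.memprof.2

The first new partitions the callers as {B, C} notcold vs {D} cold, while the second partitions them as {B, D} notcold vs {C} cold. No single clone serves both partitions, so the greedy first clone (E.memprof.1) is superseded by .memprof.2 and .memprof.3 and ends up with no callers.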
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll b/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll
index 49ca9407d9250..f3216aa13d88f 100644
--- a/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll
@@ -1,7 +1,7 @@
;; Tests callsite context graph generation for call graph containing indirect
;; calls. Currently this should result in conservative behavior, such that the
;; indirect call receives a null call in its graph node, to prevent subsequent
-;; cloning.
+;; cloning. Also tests graph and IR cloning.
;;
;; Original code looks like:
;;
@@ -57,7 +57,9 @@
; RUN: opt -passes=memprof-context-disambiguation \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -stats -pass-remarks=memprof-context-disambiguation \
+; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \
+; RUN: --check-prefix=STATS --check-prefix=REMARKS
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
;; We should only create a single clone of foo, for the direct call
@@ -343,6 +345,41 @@ attributes #7 = { builtin }
; DUMP: Clone of [[FOO]]
+; REMARKS: created clone _Z3foov.memprof.1
+; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1
+; REMARKS: call in clone _Z3foov.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone _ZN1A1xEv assigned to call function clone _Z3foov
+; REMARKS: call in clone _ZN1B1xEv assigned to call function clone _Z3foov
+; REMARKS: call in clone main assigned to call function clone _Z3foov
+; REMARKS: call in clone _Z3foov marked with memprof allocation attribute notcold
+
+
+; IR: define {{.*}} @main(
+; IR: call {{.*}} @_Z3foov()
+;; Only the second call to foo, which allocates cold memory via direct calls,
+;; is replaced with a call to a clone that calls a cold allocation.
+; IR: call {{.*}} @_Z3foov.memprof.1()
+; IR: call {{.*}} @_Z3barP1A(
+; IR: call {{.*}} @_Z3barP1A(
+; IR: call {{.*}} @_Z3barP1A(
+; IR: call {{.*}} @_Z3barP1A(
+; IR: define internal {{.*}} @_ZN1A1xEv(
+; IR: call {{.*}} @_Z3foov()
+; IR: define internal {{.*}} @_ZN1B1xEv(
+; IR: call {{.*}} @_Z3foov()
+; IR: define internal {{.*}} @_Z3foov()
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3foov.memprof.1()
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
+; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { builtin "memprof"="cold" }
+
+
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis
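A rough C++ sketch of the shape these checks imply (reconstructed, not the test's verbatim source): the virtual call in bar is the indirect call that conservatively blocks cloning, so only main's direct cold call is redirected.

    char *foo() { return new char[10]; }
    struct A { virtual char *x() { return foo(); } };
    struct B : A { char *x() override { return foo(); } };
    char *bar(A *a) { return a->x(); } // indirect: graph node gets a null call

    int main() {
      char *c = foo(); // not-cold direct call: stays on the original foo
      char *d = foo(); // cold direct call: redirected to foo.memprof.1
      A a;
      B b;
      char *e = bar(&a); // indirect paths keep calling the original foo
      char *f = bar(&b);
      delete[] c; delete[] d; delete[] e; delete[] f;
      return 0;
    }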
+
+
; DOT: digraph "postbuild" {
; DOT: label="postbuild";
; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 6",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3foov -\> _Znam}"];
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll b/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll
index 70a6f39980ede..f1b74f13fb148 100644
--- a/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll
@@ -1,6 +1,7 @@
;; Test callsite context graph generation for call graph with two memprof
;; contexts and partial inlining, requiring generation of a new fused node to
;; represent the inlined sequence while matching callsite nodes onto the graph.
+;; Also tests graph and IR cloning.
;;
;; Original code looks like:
;;
@@ -46,7 +47,9 @@
; RUN: opt -passes=memprof-context-disambiguation \
; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -stats -pass-remarks=memprof-context-disambiguation \
+; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \
+; RUN: --check-prefix=STATS --check-prefix=REMARKS
; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
;; We should create clones for foo and bar for the call from main to allocate
@@ -254,6 +257,42 @@ attributes #7 = { builtin }
; DUMP: Clone of [[BAR]]
+; REMARKS: created clone _Z3barv.memprof.1
+; REMARKS: created clone _Z3foov.memprof.1
+; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1
+; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3barv.memprof.1
+; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone main assigned to call function clone _Z3foov
+; REMARKS: call in clone _Z3foov assigned to call function clone _Z3barv
+; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z3bazv marked with memprof allocation attribute notcold
+
+
+; IR: define internal {{.*}} @_Z3barv()
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3foov()
+; IR: call {{.*}} @_Z3barv()
+; IR: define {{.*}} @main()
+;; The first call to foo does not allocate cold memory. It should call the
+;; original functions, which ultimately call the original allocation decorated
+;; with a "notcold" attribute.
+; IR: call {{.*}} @_Z3foov()
+;; The second call to foo allocates cold memory. It should call cloned functions
+;; which ultimately call a cloned allocation decorated with a "cold" attribute.
+; IR: call {{.*}} @_Z3foov.memprof.1()
+; IR: define internal {{.*}} @_Z3barv.memprof.1()
+; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3foov.memprof.1()
+; IR: call {{.*}} @_Z3barv.memprof.1()
+; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { builtin "memprof"="cold" }
+
+
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 2 memprof-context-disambiguation - Number of function clones created during whole program analysis
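Finally, a hedged sketch of what distinguishes this test from basic.ll: assume baz is inlined into foo in the profiled build (the attribute below is illustrative), so the IR's foo calls bar directly through a fused two-frame callsite, while the test's out-of-line baz retains its own inlined allocation, which stays notcold (matching the _Z3bazv remark above).

    char *bar() { return new char[10]; }  // cloned: notcold and cold versions
    inline __attribute__((always_inline)) char *baz() { return bar(); }
    char *foo() { return baz(); }         // after inlining: calls bar directly

    int main() {
      char *x = foo(); // not-cold path: original foo -> bar
      char *y = foo(); // cold path: foo.memprof.1 -> bar.memprof.1
      delete[] x;
      delete[] y;      // in the profiled run this free would be delayed
      return 0;
    }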
+
+
; DOT: digraph "postbuild" {
; DOT: label="postbuild";
; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"];