[llvm] 1117b9a - [InstrProf] Use BalancedPartitioning to order temporal profiling trace data
Ellis Hoag via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 6 12:00:04 PDT 2023
Author: Ellis Hoag
Date: 2023-06-06T11:59:57-07:00
New Revision: 1117b9a284aa6e4b1f3cbde31825605bd07a2384
URL: https://github.com/llvm/llvm-project/commit/1117b9a284aa6e4b1f3cbde31825605bd07a2384
DIFF: https://github.com/llvm/llvm-project/commit/1117b9a284aa6e4b1f3cbde31825605bd07a2384.diff
LOG: [InstrProf] Use BalancedPartitioning to order temporal profiling trace data
In [0] we described an algorithm called //BalancedPartitioning// (bp) to consume function traces [1] and compute a function order that reduces the number of page faults during startup.
This patch adds the `order` command to the `llvm-profdata` tool which uses bp to output a function order that can be passed to the linker via `--symbol-ordering-file=`.
Special thanks to Sergey Pupyrev and Julian Mestre for designing this balanced partitioning algorithm.
[0] https://discourse.llvm.org/t/rfc-temporal-profiling-extension-for-irpgo/68068
[1] https://reviews.llvm.org/D147287
Reviewed By: spupyrev
Differential Revision: https://reviews.llvm.org/D147812
Added:
llvm/include/llvm/Support/BalancedPartitioning.h
llvm/lib/Support/BalancedPartitioning.cpp
llvm/test/tools/llvm-profdata/show-order.proftext
llvm/unittests/Support/BalancedPartitioningTest.cpp
Modified:
llvm/docs/CommandGuide/llvm-profdata.rst
llvm/include/llvm/ProfileData/InstrProf.h
llvm/lib/ProfileData/InstrProf.cpp
llvm/lib/Support/CMakeLists.txt
llvm/tools/llvm-profdata/llvm-profdata.cpp
llvm/unittests/Support/CMakeLists.txt
Removed:
################################################################################
diff --git a/llvm/docs/CommandGuide/llvm-profdata.rst b/llvm/docs/CommandGuide/llvm-profdata.rst
index c0a551fd47924..be42733ca1405 100644
--- a/llvm/docs/CommandGuide/llvm-profdata.rst
+++ b/llvm/docs/CommandGuide/llvm-profdata.rst
@@ -20,6 +20,7 @@ COMMANDS
* :ref:`merge <profdata-merge>`
* :ref:`show <profdata-show>`
* :ref:`overlap <profdata-overlap>`
+* :ref:`order <profdata-order>`
.. program:: llvm-profdata merge
@@ -418,6 +419,40 @@ OPTIONS
Only show overlap for the context sensitive profile counts. The default is to show
non-context sensitive profile counts.
+.. program:: llvm-profdata order
+
+.. _profdata-order:
+
+ORDER
+-------
+
+SYNOPSIS
+^^^^^^^^
+
+:program:`llvm-profdata order` [*options*] [*filename*]
+
+DESCRIPTION
+^^^^^^^^^^^
+
+:program:`llvm-profdata order` uses temporal profiling traces from a profile and
+finds a function order that reduces the number of page faults for those traces.
+This output can be directly passed to ``lld`` via ``--symbol-ordering-file=``
+for ELF or ``-order-file`` for Mach-O. If the traces found in the profile are
+representative of the real world, then this order should improve startup
+performance.
+
+OPTIONS
+^^^^^^^
+
+.. option:: --help
+
+ Print a summary of command line options.
+
+.. option:: --output=<output>, -o
+
+ Specify the output file name. If *output* is ``-`` or it isn't specified,
+ then the output is sent to standard output.
+
EXIT STATUS
-----------
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index fe5f635d19343..ccdcebfd0891d 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -23,6 +23,7 @@
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/ProfileSummary.h"
#include "llvm/ProfileData/InstrProfData.inc"
+#include "llvm/Support/BalancedPartitioning.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Endian.h"
@@ -337,8 +338,17 @@ enum class instrprof_error {
/// An ordered list of functions identified by their NameRef found in
/// INSTR_PROF_DATA
struct TemporalProfTraceTy {
- uint64_t Weight = 1;
std::vector<uint64_t> FunctionNameRefs;
+ uint64_t Weight;
+ TemporalProfTraceTy(std::initializer_list<uint64_t> Trace = {},
+ uint64_t Weight = 1)
+ : FunctionNameRefs(Trace), Weight(Weight) {}
+
+ /// Use a set of temporal profile traces to create a list of balanced
+ /// partitioning function nodes used by BalancedPartitioning to generate a
+ /// function order that reduces page faults during startup
+ static std::vector<BPFunctionNode>
+ createBPFunctionNodes(ArrayRef<TemporalProfTraceTy> Traces);
};
inline std::error_code make_error_code(instrprof_error E) {
diff --git a/llvm/include/llvm/Support/BalancedPartitioning.h b/llvm/include/llvm/Support/BalancedPartitioning.h
new file mode 100644
index 0000000000000..0baf682387445
--- /dev/null
+++ b/llvm/include/llvm/Support/BalancedPartitioning.h
@@ -0,0 +1,197 @@
+//===- BalancedPartitioning.h ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements BalancedPartitioning, a recursive balanced graph
+// partitioning algorithm.
+//
+// The algorithm is used to find an ordering of FunctionNodes while optimizing
+// a specified objective. The algorithm uses recursive bisection; it starts
+// with a collection of unordered FunctionNodes and tries to split them into
+// two sets (buckets) of equal cardinality. Each bisection step is comprised of
+// iterations that greedily swap the FunctionNodes between the two buckets while
+// there is an improvement of the objective. Once the process converges, the
+// problem is divided into two sub-problems of half the size, which are
+// recursively applied for the two buckets. The final ordering of the
+// FunctionNodes is obtained by concatenating the two (recursively computed)
+// orderings.
+//
+// In order to speed up the computation, we limit the depth of the recursive
+// tree by a specified constant (SplitDepth) and apply at most a constant
+// number of greedy iterations per split (IterationsPerSplit). The worst-case
+// time complexity of the implementation is bounded by O(M*log^2 N), where
+// N is the number of FunctionNodes and M is the number of
+// FunctionNode-UtilityNode edges; (assuming that any collection of D
+// FunctionNodes contains O(D) UtilityNodes). Notice that the two
diff erent
+// recursive sub-problems are independent and thus can be efficiently processed
+// in parallel.
+//
+// Reference:
+// * Optimizing Function Layout for Mobile Applications,
+// https://arxiv.org/abs/2211.09285
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_BALANCED_PARTITIONING_H
+#define LLVM_SUPPORT_BALANCED_PARTITIONING_H
+
+#include "raw_ostream.h"
+#include "llvm/Support/ThreadPool.h"
+
+#include <random>
+#include <vector>
+
+namespace llvm {
+
+/// A function with a set of utility nodes where it is beneficial to order two
+/// functions close together if they have similar utility nodes
+class BPFunctionNode {
+ friend class BalancedPartitioning;
+
+public:
+ using IDT = uint64_t;
+ using UtilityNodeT = uint32_t;
+
+ /// \param UtilityNodes the set of utility nodes (must be unique'd)
+ BPFunctionNode(IDT Id, ArrayRef<UtilityNodeT> UtilityNodes)
+ : Id(Id), UtilityNodes(UtilityNodes) {}
+
+ /// The ID of this node
+ IDT Id;
+
+ void dump(raw_ostream &OS) const;
+
+protected:
+ /// The list of utility nodes associated with this node
+ SmallVector<UtilityNodeT, 4> UtilityNodes;
+ /// The bucket assigned by balanced partitioning
+ std::optional<unsigned> Bucket;
+ /// The index of the input order of the FunctionNodes
+ uint64_t InputOrderIndex = 0;
+
+ friend class BPFunctionNodeTest_Basic_Test;
+ friend class BalancedPartitioningTest_Basic_Test;
+ friend class BalancedPartitioningTest_Large_Test;
+};
+
+/// Algorithm parameters; default values are tuned on real-world binaries
+struct BalancedPartitioningConfig {
+ /// The depth of the recursive bisection
+ unsigned SplitDepth = 18;
+ /// The maximum number of bp iterations per split
+ unsigned IterationsPerSplit = 40;
+ /// The probability for a vertex to skip a move from its current bucket to
+ /// another bucket; it often helps to escape from a local optima
+ float SkipProbability = 0.1;
+ /// Recursive subtasks up to the given depth are added to the queue and
+ /// distributed among threads by ThreadPool; all subsequent calls are executed
+ /// on the same thread
+ unsigned TaskSplitDepth = 9;
+};
+
+class BalancedPartitioning {
+public:
+ BalancedPartitioning(const BalancedPartitioningConfig &Config);
+
+ /// Run recursive graph partitioning that optimizes a given objective.
+ void run(std::vector<BPFunctionNode> &Nodes) const;
+
+private:
+ struct UtilitySignature;
+ using SignaturesT = SmallVector<UtilitySignature, 4>;
+ using FunctionNodeRange =
+ iterator_range<std::vector<BPFunctionNode>::iterator>;
+
+ /// A special ThreadPool that allows for spawning new tasks after blocking on
+ /// wait(). BalancedPartitioning recursively spawns new threads inside other
+ /// threads, so we need to track how many active threads that could spawn more
+ /// threads.
+ struct BPThreadPool {
+ ThreadPool TheThreadPool;
+ std::mutex mtx;
+ std::condition_variable cv;
+ /// The number of threads that could spawn more threads
+ std::atomic<int> NumActiveThreads = 0;
+ /// Only true when all threads are down spawning new threads
+ bool IsFinishedSpawning = false;
+ /// Asynchronous submission of the task to the pool
+ template <typename Func> void async(Func &&F);
+ /// Blocking wait for all threads to complete. Unlike ThreadPool, it is
+ /// acceptable for other threads to add more tasks while blocking on this
+ /// call.
+ void wait();
+ };
+
+ /// Run a recursive bisection of a given list of FunctionNodes
+ /// \param RecDepth the current depth of recursion
+ /// \param RootBucket the initial bucket of the dataVertices
+ /// \param Offset the assigned buckets are the range [Offset, Offset +
+ /// Nodes.size()]
+ void bisect(const FunctionNodeRange Nodes, unsigned RecDepth,
+ unsigned RootBucket, unsigned Offset,
+ std::optional<BPThreadPool> &TP) const;
+
+ /// Run bisection iterations
+ void runIterations(const FunctionNodeRange Nodes, unsigned RecDepth,
+ unsigned LeftBucket, unsigned RightBucket,
+ std::mt19937 &RNG) const;
+
+ /// Run a bisection iteration to improve the optimization goal
+ /// \returns the total number of moved FunctionNodes
+ unsigned runIteration(const FunctionNodeRange Nodes, unsigned LeftBucket,
+ unsigned RightBucket, SignaturesT &Signatures,
+ std::mt19937 &RNG) const;
+
+ /// Try to move \p N from one bucket to another
+ /// \returns true iff \p N is moved
+ bool moveFunctionNode(BPFunctionNode &N, unsigned LeftBucket,
+ unsigned RightBucket, SignaturesT &Signatures,
+ std::mt19937 &RNG) const;
+
+ /// Split all the FunctionNodes into 2 buckets, StartBucket and StartBucket +
+ /// 1 The method is used for an initial assignment before a bisection step
+ void split(const FunctionNodeRange Nodes, unsigned StartBucket) const;
+
+ /// The cost of the uniform log-gap cost, assuming a utility node has \p X
+ /// FunctionNodes in the left bucket and \p Y FunctionNodes in the right one.
+ float logCost(unsigned X, unsigned Y) const;
+
+ float log2Cached(unsigned i) const;
+
+ const BalancedPartitioningConfig &Config;
+
+ /// Precomputed values of log2(x). Table size is small enough to fit in cache.
+ static constexpr unsigned LOG_CACHE_SIZE = 16384;
+ float Log2Cache[LOG_CACHE_SIZE];
+
+ /// The signature of a particular utility node used for the bisection step,
+ /// i.e., the number of \p FunctionNodes in each of the two buckets
+ struct UtilitySignature {
+ /// The number of \p FunctionNodes in the left bucket
+ unsigned LeftCount = 0;
+ /// The number of \p FunctionNodes in the right bucket
+ unsigned RightCount = 0;
+ /// The cached gain of moving a \p FunctionNode from the left bucket to the
+ /// right bucket
+ float CachedGainLR;
+ /// The cached gain of moving a \p FunctionNode from the right bucket to the
+ /// left bucket
+ float CachedGainRL;
+ /// Whether \p CachedGainLR and \p CachedGainRL are valid
+ bool CachedGainIsValid = false;
+ };
+
+protected:
+ /// Compute the move gain for uniform log-gap cost
+ static float moveGain(const BPFunctionNode &N, bool FromLeftToRight,
+ const SignaturesT &Signatures);
+ friend class BalancedPartitioningTest_MoveGain_Test;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_SUPPORT_BALANCED_PARTITIONING_H
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 749147a09fa5b..9bb091a493632 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -13,6 +13,7 @@
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
@@ -803,6 +804,48 @@ void InstrProfRecord::addValueData(uint32_t ValueKind, uint32_t Site,
ValueSites.emplace_back(VData, VData + N);
}
+std::vector<BPFunctionNode> TemporalProfTraceTy::createBPFunctionNodes(
+ ArrayRef<TemporalProfTraceTy> Traces) {
+ using IDT = BPFunctionNode::IDT;
+ using UtilityNodeT = BPFunctionNode::UtilityNodeT;
+ // Collect all function IDs ordered by their smallest timestamp. This will be
+ // used as the initial FunctionNode order.
+ SetVector<IDT> FunctionIds;
+ size_t LargestTraceSize = 0;
+ for (auto &Trace : Traces)
+ LargestTraceSize =
+ std::max(LargestTraceSize, Trace.FunctionNameRefs.size());
+ for (size_t Timestamp = 0; Timestamp < LargestTraceSize; Timestamp++)
+ for (auto &Trace : Traces)
+ if (Timestamp < Trace.FunctionNameRefs.size())
+ FunctionIds.insert(Trace.FunctionNameRefs[Timestamp]);
+
+ int N = std::ceil(std::log2(LargestTraceSize));
+
+ // TODO: We need to use the Trace.Weight field to give more weight to more
+ // important utilities
+ DenseMap<IDT, SmallVector<UtilityNodeT, 4>> FuncGroups;
+ for (size_t TraceIdx = 0; TraceIdx < Traces.size(); TraceIdx++) {
+ auto &Trace = Traces[TraceIdx].FunctionNameRefs;
+ for (size_t Timestamp = 0; Timestamp < Trace.size(); Timestamp++) {
+ for (int I = std::floor(std::log2(Timestamp + 1)); I < N; I++) {
+ auto &FunctionId = Trace[Timestamp];
+ UtilityNodeT GroupId = TraceIdx * N + I;
+ FuncGroups[FunctionId].push_back(GroupId);
+ }
+ }
+ }
+
+ std::vector<BPFunctionNode> Nodes;
+ for (auto &Id : FunctionIds) {
+ auto &UNs = FuncGroups[Id];
+ llvm::sort(UNs);
+ UNs.erase(std::unique(UNs.begin(), UNs.end()), UNs.end());
+ Nodes.emplace_back(Id, UNs);
+ }
+ return Nodes;
+}
+
#define INSTR_PROF_COMMON_API_IMPL
#include "llvm/ProfileData/InstrProfData.inc"
diff --git a/llvm/lib/Support/BalancedPartitioning.cpp b/llvm/lib/Support/BalancedPartitioning.cpp
new file mode 100644
index 0000000000000..a982bb157ab82
--- /dev/null
+++ b/llvm/lib/Support/BalancedPartitioning.cpp
@@ -0,0 +1,325 @@
+//===- BalancedPartitioning.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements BalancedPartitioning, a recursive balanced graph
+// partitioning algorithm.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BalancedPartitioning.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadic.h"
+
+using namespace llvm;
+#define DEBUG_TYPE "balanced-partitioning"
+
+void BPFunctionNode::dump(raw_ostream &OS) const {
+ OS << formatv("{{ID={0} Utilities={{{1:$[,]}} Bucket={2}}", Id,
+ make_range(UtilityNodes.begin(), UtilityNodes.end()), Bucket);
+}
+
+template <typename Func>
+void BalancedPartitioning::BPThreadPool::async(Func &&F) {
+ // This new thread could spawn more threads, so mark it as active
+ ++NumActiveThreads;
+ TheThreadPool.async([=]() {
+ // Run the task
+ F();
+
+ // This thread will no longer spawn new threads, so mark it as inactive
+ if (--NumActiveThreads == 0) {
+ // There are no more active threads, so mark as finished and notify
+ {
+ std::unique_lock<std::mutex> lock(mtx);
+ assert(!IsFinishedSpawning);
+ IsFinishedSpawning = true;
+ }
+ cv.notify_one();
+ }
+ });
+}
+
+void BalancedPartitioning::BPThreadPool::wait() {
+ // TODO: We could remove the mutex and condition variable and use
+ // std::atomic::wait() instead, but that isn't available until C++20
+ {
+ std::unique_lock<std::mutex> lock(mtx);
+ cv.wait(lock, [&]() { return IsFinishedSpawning; });
+ assert(IsFinishedSpawning && NumActiveThreads == 0);
+ }
+ // Now we can call ThreadPool::wait() since all tasks have been submitted
+ TheThreadPool.wait();
+}
+
+BalancedPartitioning::BalancedPartitioning(
+ const BalancedPartitioningConfig &Config)
+ : Config(Config) {
+ // Pre-computing log2 values
+ Log2Cache[0] = 0.0;
+ for (unsigned I = 1; I < LOG_CACHE_SIZE; I++)
+ Log2Cache[I] = std::log2(I);
+}
+
+void BalancedPartitioning::run(std::vector<BPFunctionNode> &Nodes) const {
+ LLVM_DEBUG(
+ dbgs() << format(
+ "Partitioning %d nodes using depth %d and %d iterations per split\n",
+ Nodes.size(), Config.SplitDepth, Config.IterationsPerSplit));
+ std::optional<BPThreadPool> TP;
+ if (Config.TaskSplitDepth > 1)
+ TP.emplace();
+
+ // Record the input order
+ for (unsigned I = 0; I < Nodes.size(); I++)
+ Nodes[I].InputOrderIndex = I;
+
+ auto NodesRange = llvm::make_range(Nodes.begin(), Nodes.end());
+ auto BisectTask = [=, &TP]() {
+ bisect(NodesRange, /*RecDepth=*/0, /*RootBucket=*/1, /*Offset=*/0, TP);
+ };
+ if (TP) {
+ TP->async(std::move(BisectTask));
+ TP->wait();
+ } else {
+ BisectTask();
+ }
+
+ llvm::stable_sort(NodesRange, [](const auto &L, const auto &R) {
+ return L.Bucket < R.Bucket;
+ });
+
+ LLVM_DEBUG(dbgs() << "Balanced partitioning completed\n");
+}
+
+void BalancedPartitioning::bisect(const FunctionNodeRange Nodes,
+ unsigned RecDepth, unsigned RootBucket,
+ unsigned Offset,
+ std::optional<BPThreadPool> &TP) const {
+ unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end());
+ if (NumNodes <= 1 || RecDepth >= Config.SplitDepth) {
+ // We've reach the lowest level of the recursion tree. Fall back to the
+ // original order and assign to buckets.
+ llvm::stable_sort(Nodes, [](const auto &L, const auto &R) {
+ return L.InputOrderIndex < R.InputOrderIndex;
+ });
+ for (auto &N : Nodes)
+ N.Bucket = Offset++;
+ return;
+ }
+
+ LLVM_DEBUG(dbgs() << format("Bisect with %d nodes and root bucket %d\n",
+ NumNodes, RootBucket));
+
+ std::mt19937 RNG(RootBucket);
+
+ unsigned LeftBucket = 2 * RootBucket;
+ unsigned RightBucket = 2 * RootBucket + 1;
+
+ // Split into two and assign to the left and right buckets
+ split(Nodes, LeftBucket);
+
+ runIterations(Nodes, RecDepth, LeftBucket, RightBucket, RNG);
+
+ // Split nodes wrt the resulting buckets
+ auto NodesMid =
+ llvm::partition(Nodes, [&](auto &N) { return N.Bucket == LeftBucket; });
+ unsigned MidOffset = Offset + std::distance(Nodes.begin(), NodesMid);
+
+ auto LeftNodes = llvm::make_range(Nodes.begin(), NodesMid);
+ auto RightNodes = llvm::make_range(NodesMid, Nodes.end());
+
+ auto LeftRecTask = [=, &TP]() {
+ bisect(LeftNodes, RecDepth + 1, LeftBucket, Offset, TP);
+ };
+ auto RightRecTask = [=, &TP]() {
+ bisect(RightNodes, RecDepth + 1, RightBucket, MidOffset, TP);
+ };
+
+ if (TP && RecDepth < Config.TaskSplitDepth && NumNodes >= 4) {
+ TP->async(std::move(LeftRecTask));
+ TP->async(std::move(RightRecTask));
+ } else {
+ LeftRecTask();
+ RightRecTask();
+ }
+}
+
+void BalancedPartitioning::runIterations(const FunctionNodeRange Nodes,
+ unsigned RecDepth, unsigned LeftBucket,
+ unsigned RightBucket,
+ std::mt19937 &RNG) const {
+ unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end());
+ DenseMap<BPFunctionNode::UtilityNodeT, unsigned> UtilityNodeDegree;
+ for (auto &N : Nodes)
+ for (auto &UN : N.UtilityNodes)
+ ++UtilityNodeDegree[UN];
+ // Remove utility nodes if they have just one edge or are connected to all
+ // functions
+ for (auto &N : Nodes)
+ llvm::erase_if(N.UtilityNodes, [&](auto &UN) {
+ return UtilityNodeDegree[UN] <= 1 || UtilityNodeDegree[UN] >= NumNodes;
+ });
+
+ // Renumber utility nodes so they can be used to index into Signatures
+ DenseMap<BPFunctionNode::UtilityNodeT, unsigned> UtilityNodeIndex;
+ for (auto &N : Nodes)
+ for (auto &UN : N.UtilityNodes)
+ if (!UtilityNodeIndex.count(UN))
+ UtilityNodeIndex[UN] = UtilityNodeIndex.size();
+ for (auto &N : Nodes)
+ for (auto &UN : N.UtilityNodes)
+ UN = UtilityNodeIndex[UN];
+
+ // Initialize signatures
+ SignaturesT Signatures(/*Size=*/UtilityNodeIndex.size());
+ for (auto &N : Nodes) {
+ for (auto &UN : N.UtilityNodes) {
+ assert(UN < Signatures.size());
+ if (N.Bucket == LeftBucket) {
+ Signatures[UN].LeftCount++;
+ } else {
+ Signatures[UN].RightCount++;
+ }
+ }
+ }
+
+ for (unsigned I = 0; I < Config.IterationsPerSplit; I++) {
+ unsigned NumMovedNodes =
+ runIteration(Nodes, LeftBucket, RightBucket, Signatures, RNG);
+ if (NumMovedNodes == 0)
+ break;
+ }
+}
+
+unsigned BalancedPartitioning::runIteration(const FunctionNodeRange Nodes,
+ unsigned LeftBucket,
+ unsigned RightBucket,
+ SignaturesT &Signatures,
+ std::mt19937 &RNG) const {
+ // Init signature cost caches
+ for (auto &Signature : Signatures) {
+ if (Signature.CachedGainIsValid)
+ continue;
+ unsigned L = Signature.LeftCount;
+ unsigned R = Signature.RightCount;
+ assert((L > 0 || R > 0) && "incorrect signature");
+ float Cost = logCost(L, R);
+ Signature.CachedGainLR = 0;
+ Signature.CachedGainRL = 0;
+ if (L > 0)
+ Signature.CachedGainLR = Cost - logCost(L - 1, R + 1);
+ if (R > 0)
+ Signature.CachedGainRL = Cost - logCost(L + 1, R - 1);
+ Signature.CachedGainIsValid = true;
+ }
+
+ // Compute move gains
+ typedef std::pair<float, BPFunctionNode *> GainPair;
+ std::vector<GainPair> Gains;
+ for (auto &N : Nodes) {
+ bool FromLeftToRight = (N.Bucket == LeftBucket);
+ float Gain = moveGain(N, FromLeftToRight, Signatures);
+ Gains.push_back(std::make_pair(Gain, &N));
+ }
+
+ // Collect left and right gains
+ auto LeftEnd = llvm::partition(
+ Gains, [&](const auto &GP) { return GP.second->Bucket == LeftBucket; });
+ auto LeftRange = llvm::make_range(Gains.begin(), LeftEnd);
+ auto RightRange = llvm::make_range(LeftEnd, Gains.end());
+
+ // Sort gains in descending order
+ auto LargerGain = [](const auto &L, const auto &R) {
+ return L.first > R.first;
+ };
+ llvm::stable_sort(LeftRange, LargerGain);
+ llvm::stable_sort(RightRange, LargerGain);
+
+ unsigned NumMovedDataVertices = 0;
+ for (auto [LeftPair, RightPair] : llvm::zip(LeftRange, RightRange)) {
+ auto &[LeftGain, LeftNode] = LeftPair;
+ auto &[RightGain, RightNode] = RightPair;
+ // Stop when the gain is no longer beneficial
+ if (LeftGain + RightGain <= 0.0)
+ break;
+ // Try to exchange the nodes between buckets
+ if (moveFunctionNode(*LeftNode, LeftBucket, RightBucket, Signatures, RNG))
+ ++NumMovedDataVertices;
+ if (moveFunctionNode(*RightNode, LeftBucket, RightBucket, Signatures, RNG))
+ ++NumMovedDataVertices;
+ }
+ return NumMovedDataVertices;
+}
+
+bool BalancedPartitioning::moveFunctionNode(BPFunctionNode &N,
+ unsigned LeftBucket,
+ unsigned RightBucket,
+ SignaturesT &Signatures,
+ std::mt19937 &RNG) const {
+ // Sometimes we skip the move. This helps to escape local optima
+ if (std::uniform_real_distribution<float>(0.0, 1.0)(RNG) <=
+ Config.SkipProbability)
+ return false;
+
+ bool FromLeftToRight = (N.Bucket == LeftBucket);
+ // Update the current bucket
+ N.Bucket = (FromLeftToRight ? RightBucket : LeftBucket);
+
+ // Update signatures and invalidate gain cache
+ if (FromLeftToRight) {
+ for (auto &UN : N.UtilityNodes) {
+ auto &Signature = Signatures[UN];
+ Signature.LeftCount--;
+ Signature.RightCount++;
+ Signature.CachedGainIsValid = false;
+ }
+ } else {
+ for (auto &UN : N.UtilityNodes) {
+ auto &Signature = Signatures[UN];
+ Signature.LeftCount++;
+ Signature.RightCount--;
+ Signature.CachedGainIsValid = false;
+ }
+ }
+ return true;
+}
+
+void BalancedPartitioning::split(const FunctionNodeRange Nodes,
+ unsigned StartBucket) const {
+ unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end());
+ auto NodesMid = Nodes.begin() + (NumNodes + 1) / 2;
+
+ std::nth_element(Nodes.begin(), NodesMid, Nodes.end(), [](auto &L, auto &R) {
+ return L.InputOrderIndex < R.InputOrderIndex;
+ });
+
+ for (auto &N : llvm::make_range(Nodes.begin(), NodesMid))
+ N.Bucket = StartBucket;
+ for (auto &N : llvm::make_range(NodesMid, Nodes.end()))
+ N.Bucket = StartBucket + 1;
+}
+
+float BalancedPartitioning::moveGain(const BPFunctionNode &N,
+ bool FromLeftToRight,
+ const SignaturesT &Signatures) {
+ float Gain = 0;
+ for (auto &UN : N.UtilityNodes)
+ Gain += (FromLeftToRight ? Signatures[UN].CachedGainLR
+ : Signatures[UN].CachedGainRL);
+ return Gain;
+}
+
+float BalancedPartitioning::logCost(unsigned X, unsigned Y) const {
+ return -(X * log2Cached(X + 1) + Y * log2Cached(Y + 1));
+}
+
+float BalancedPartitioning::log2Cached(unsigned i) const {
+ return (i < LOG_CACHE_SIZE) ? Log2Cache[i] : std::log2(i);
+}
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 714f9ba0214f5..e6bff119b094f 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -129,6 +129,7 @@ add_llvm_component_library(LLVMSupport
Allocator.cpp
AutoConvert.cpp
Base64.cpp
+ BalancedPartitioning.cpp
BinaryStreamError.cpp
BinaryStreamReader.cpp
BinaryStreamRef.cpp
diff --git a/llvm/test/tools/llvm-profdata/show-order.proftext b/llvm/test/tools/llvm-profdata/show-order.proftext
new file mode 100644
index 0000000000000..8ef26847ad77e
--- /dev/null
+++ b/llvm/test/tools/llvm-profdata/show-order.proftext
@@ -0,0 +1,46 @@
+# RUN: llvm-profdata order %s | FileCheck %s
+
+# CHECK: a
+# CHECK: b
+# CHECK: c
+# CHECK: x
+
+# Header
+:ir
+:temporal_prof_traces
+# Num Traces
+3
+# Trace Stream Size:
+3
+# Weight
+1
+a, main.c:b, c
+# Weight
+1
+a, x, main.c:b, c
+# Weight
+1
+a, main.c:b, c
+
+a
+# Func Hash:
+0x1234
+# Num Counters:
+1
+# Counter Values:
+101
+
+main.c:b
+0x5678
+1
+202
+
+c
+0xabcd
+1
+303
+
+x
+0xefff
+1
+404
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
index 9199a69677155..7c282e346747e 100644
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -23,6 +23,7 @@
#include "llvm/ProfileData/RawMemProfReader.h"
#include "llvm/ProfileData/SampleProfReader.h"
#include "llvm/ProfileData/SampleProfWriter.h"
+#include "llvm/Support/BalancedPartitioning.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Discriminator.h"
#include "llvm/Support/Errc.h"
@@ -3038,6 +3039,51 @@ static int show_main(int argc, const char *argv[]) {
return showMemProfProfile(Filename, ProfiledBinary, SFormat, OS);
}
+static int order_main(int argc, const char *argv[]) {
+ cl::opt<std::string> Filename(cl::Positional, cl::desc("<profdata-file>"));
+ cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
+ cl::init("-"), cl::desc("Output file"));
+ cl::alias OutputFilenameA("o", cl::desc("Alias for --output"),
+ cl::aliasopt(OutputFilename));
+ cl::ParseCommandLineOptions(argc, argv, "LLVM profile data order\n");
+
+ std::error_code EC;
+ raw_fd_ostream OS(OutputFilename.data(), EC, sys::fs::OF_TextWithCRLF);
+ if (EC)
+ exitWithErrorCode(EC, OutputFilename);
+ auto FS = vfs::getRealFileSystem();
+ auto ReaderOrErr = InstrProfReader::create(Filename, *FS);
+ if (Error E = ReaderOrErr.takeError())
+ exitWithError(std::move(E), Filename);
+
+ auto Reader = std::move(ReaderOrErr.get());
+ for (auto &I : *Reader) {
+ // Read all entries
+ (void)I;
+ }
+ auto &Traces = Reader->getTemporalProfTraces();
+ auto Nodes = TemporalProfTraceTy::createBPFunctionNodes(Traces);
+ BalancedPartitioningConfig Config;
+ BalancedPartitioning BP(Config);
+ BP.run(Nodes);
+
+ WithColor::note() << "# Ordered " << Nodes.size() << " functions\n";
+ for (auto &N : Nodes) {
+ auto FuncName = Reader->getSymtab().getFuncName(N.Id);
+ if (FuncName.contains(':')) {
+ // GlobalValue::getGlobalIdentifier() prefixes the filename if the symbol
+ // is local. This logic will break if there is a colon in the filename,
+ // but we cannot use rsplit() because ObjC symbols can have colons.
+ auto [Filename, ParsedFuncName] = FuncName.split(':');
+ // Emit a comment describing where this symbol came from
+ OS << "# " << Filename << "\n";
+ FuncName = ParsedFuncName;
+ }
+ OS << FuncName << "\n";
+ }
+ return 0;
+}
+
int llvm_profdata_main(int argc, char **argvNonConst,
const llvm::ToolContext &) {
const char **argv = const_cast<const char **>(argvNonConst);
@@ -3053,6 +3099,8 @@ int llvm_profdata_main(int argc, char **argvNonConst,
func = show_main;
else if (strcmp(argv[1], "overlap") == 0)
func = overlap_main;
+ else if (strcmp(argv[1], "order") == 0)
+ func = order_main;
if (func) {
std::string Invocation(ProgName.str() + " " + argv[1]);
@@ -3083,6 +3131,6 @@ int llvm_profdata_main(int argc, char **argvNonConst,
else
errs() << ProgName << ": Unknown command!\n";
- errs() << "USAGE: " << ProgName << " <merge|show|overlap> [args...]\n";
+ errs() << "USAGE: " << ProgName << " <merge|show|overlap|order> [args...]\n";
return 1;
}
diff --git a/llvm/unittests/Support/BalancedPartitioningTest.cpp b/llvm/unittests/Support/BalancedPartitioningTest.cpp
new file mode 100644
index 0000000000000..a71e19c3a2392
--- /dev/null
+++ b/llvm/unittests/Support/BalancedPartitioningTest.cpp
@@ -0,0 +1,121 @@
+//===- BalancedPartitioningTest.cpp - BalancedPartitioning tests ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BalancedPartitioning.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using testing::Each;
+using testing::Field;
+using testing::Not;
+using testing::UnorderedElementsAre;
+using testing::UnorderedElementsAreArray;
+
+namespace llvm {
+
+void PrintTo(const BPFunctionNode &Node, std::ostream *OS) {
+ raw_os_ostream ROS(*OS);
+ Node.dump(ROS);
+}
+
+TEST(BPFunctionNodeTest, Basic) {
+ auto Nodes = TemporalProfTraceTy::createBPFunctionNodes({
+ TemporalProfTraceTy({0, 1, 2, 3, 4}),
+ TemporalProfTraceTy({4, 2}),
+ });
+
+ auto NodeIs = [](BPFunctionNode::IDT Id,
+ ArrayRef<BPFunctionNode::UtilityNodeT> UNs) {
+ return AllOf(Field("Id", &BPFunctionNode::Id, Id),
+ Field("UtilityNodes", &BPFunctionNode::UtilityNodes,
+ UnorderedElementsAreArray(UNs)));
+ };
+
+ EXPECT_THAT(Nodes,
+ UnorderedElementsAre(NodeIs(0, {0, 1, 2}), NodeIs(1, {1, 2}),
+ NodeIs(2, {1, 2, 4, 5}), NodeIs(3, {2}),
+ NodeIs(4, {2, 3, 4, 5})));
+}
+
+class BalancedPartitioningTest : public ::testing::Test {
+protected:
+ BalancedPartitioningConfig Config;
+ BalancedPartitioning Bp;
+ BalancedPartitioningTest() : Bp(Config) {}
+
+ static std::vector<BPFunctionNode::IDT>
+ getIds(std::vector<BPFunctionNode> Nodes) {
+ std::vector<BPFunctionNode::IDT> Ids;
+ for (auto &N : Nodes)
+ Ids.push_back(N.Id);
+ return Ids;
+ }
+};
+
+TEST_F(BalancedPartitioningTest, Basic) {
+ std::vector<BPFunctionNode> Nodes = {
+ BPFunctionNode(0, {1, 2}), BPFunctionNode(2, {3, 4}),
+ BPFunctionNode(1, {1, 2}), BPFunctionNode(3, {3, 4}),
+ BPFunctionNode(4, {4}),
+ };
+
+ Bp.run(Nodes);
+
+ auto NodeIs = [](BPFunctionNode::IDT Id, std::optional<uint32_t> Bucket) {
+ return AllOf(Field("Id", &BPFunctionNode::Id, Id),
+ Field("Bucket", &BPFunctionNode::Bucket, Bucket));
+ };
+
+ EXPECT_THAT(Nodes,
+ UnorderedElementsAre(NodeIs(0, 0), NodeIs(1, 1), NodeIs(2, 2),
+ NodeIs(3, 3), NodeIs(4, 4)));
+}
+
+TEST_F(BalancedPartitioningTest, Large) {
+ const int ProblemSize = 1000;
+ std::vector<BPFunctionNode::UtilityNodeT> AllUNs;
+ for (int i = 0; i < ProblemSize; i++)
+ AllUNs.emplace_back(i);
+
+ std::mt19937 RNG;
+ std::vector<BPFunctionNode> Nodes;
+ for (int i = 0; i < ProblemSize; i++) {
+ std::vector<BPFunctionNode::UtilityNodeT> UNs;
+ int SampleSize =
+ std::uniform_int_distribution<int>(0, AllUNs.size() - 1)(RNG);
+ std::sample(AllUNs.begin(), AllUNs.end(), std::back_inserter(UNs),
+ SampleSize, RNG);
+ Nodes.emplace_back(i, UNs);
+ }
+
+ auto OrigIds = getIds(Nodes);
+
+ Bp.run(Nodes);
+
+ EXPECT_THAT(
+ Nodes, Each(Not(Field("Bucket", &BPFunctionNode::Bucket, std::nullopt))));
+ EXPECT_THAT(getIds(Nodes), UnorderedElementsAreArray(OrigIds));
+}
+
+TEST_F(BalancedPartitioningTest, MoveGain) {
+ BalancedPartitioning::SignaturesT Signatures = {
+ {10, 10, 10.f, 0.f, true}, // 0
+ {10, 10, 0.f, 10.f, true}, // 1
+ {10, 10, 0.f, 20.f, true}, // 2
+ };
+ EXPECT_FLOAT_EQ(Bp.moveGain(BPFunctionNode(0, {}), true, Signatures), 0.f);
+ EXPECT_FLOAT_EQ(Bp.moveGain(BPFunctionNode(0, {0, 1}), true, Signatures),
+ 10.f);
+ EXPECT_FLOAT_EQ(Bp.moveGain(BPFunctionNode(0, {1, 2}), false, Signatures),
+ 30.f);
+}
+
+} // end namespace llvm
diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt
index d1185827db202..ad0e658eba3ad 100644
--- a/llvm/unittests/Support/CMakeLists.txt
+++ b/llvm/unittests/Support/CMakeLists.txt
@@ -1,4 +1,5 @@
set(LLVM_LINK_COMPONENTS
+ ProfileData
Support
TargetParser
)
@@ -14,6 +15,7 @@ add_llvm_unittest(SupportTests
BinaryStreamTest.cpp
BLAKE3Test.cpp
BlockFrequencyTest.cpp
+ BalancedPartitioningTest.cpp
BranchProbabilityTest.cpp
CachePruningTest.cpp
CrashRecoveryTest.cpp
More information about the llvm-commits
mailing list