[Mlir-commits] [mlir] abd3c6f - [mlir][Inliner] Use llvm::parallelForEach instead of llvm::parallelTransformReduce

Tue Feb 23 14:37:17 PST 2021

Author: River Riddle
Date: 2021-02-23T14:36:45-08:00
New Revision: abd3c6f24c823be6fb316b501482d8637c4a0724

URL: https://github.com/llvm/llvm-project/commit/abd3c6f24c823be6fb316b501482d8637c4a0724
DIFF: https://github.com/llvm/llvm-project/commit/abd3c6f24c823be6fb316b501482d8637c4a0724.diff

LOG: [mlir][Inliner] Use llvm::parallelForEach instead of llvm::parallelTransformReduce

llvm::parallelTransformReduce does not schedule work on the caller thread, which becomes very costly for
the inliner, where the majority of SCCs are small (often ~1 element). Switching to llvm::parallelForEach solves this
and also aligns the implementation with the PassManager (which realistically should share the same implementation).

This change reduced compile time on an internal benchmark by ~1 second (25%).

Differential Revision: https://reviews.llvm.org/D96086

Added: 
    

Modified: 
    mlir/lib/Transforms/Inliner.cpp

Removed: 
    


################################################################################
diff  --git a/mlir/lib/Transforms/Inliner.cpp b/mlir/lib/Transforms/Inliner.cpp
index d79af21cb4ac..ad89e4771508 100644

--- a/mlir/lib/Transforms/Inliner.cpp
+++ b/mlir/lib/Transforms/Inliner.cpp
@@ -688,8 +688,10 @@ InlinerPass::optimizeSCCAsync(MutableArrayRef<CallGraphNode *> nodesToVisit,
                               MLIRContext *context) {
   // Ensure that there are enough pipeline maps for the optimizer to run in
   // parallel.
-  size_t numThreads = llvm::hardware_concurrency().compute_thread_count();
-  if (opPipelines.size() != numThreads) {
+  size_t numThreads =
+      std::min((size_t)llvm::hardware_concurrency().compute_thread_count(),
+               nodesToVisit.size());
+  if (opPipelines.size() < numThreads) {
     // Reserve before resizing so that we can use a reference to the first
     // element.
     opPipelines.reserve(numThreads);
@@ -706,14 +708,11 @@ InlinerPass::optimizeSCCAsync(MutableArrayRef<CallGraphNode *> nodesToVisit,
 
   // Optimize the nodes of the SCC in parallel.
   ParallelDiagnosticHandler optimizerHandler(context);
-  return llvm::parallelTransformReduce(
-      llvm::seq<size_t>(0, numThreads), success(),
-      [](LogicalResult lhs, LogicalResult rhs) {
-        return success(succeeded(lhs) && succeeded(rhs));
-      },
-      [&](size_t index) {
-        LogicalResult result = success();
-        for (auto e = nodesToVisit.size(); nodeIt < e && succeeded(result);) {
+  std::atomic<bool> passFailed(false);
+  llvm::parallelForEach(
+      opPipelines.begin(), std::next(opPipelines.begin(), numThreads),
+      [&](llvm::StringMap<OpPassManager> &pipelines) {
+        for (auto e = nodesToVisit.size(); !passFailed && nodeIt < e;) {
           // Get the next available operation index.
           unsigned nextID = nodeIt++;
           if (nextID >= e)
@@ -722,11 +721,17 @@ InlinerPass::optimizeSCCAsync(MutableArrayRef<CallGraphNode *> nodesToVisit,
           // Set the order for this thread so that diagnostics will be
           // properly ordered, and reset after optimization has finished.
           optimizerHandler.setOrderIDForThread(nextID);
-          result = optimizeCallable(nodesToVisit[nextID], opPipelines[index]);
+          LogicalResult pipelineResult =
+              optimizeCallable(nodesToVisit[nextID], pipelines);
           optimizerHandler.eraseOrderIDForThread();
+
+          if (failed(pipelineResult)) {
+            passFailed = true;
+            break;
+          }
         }
-        return result;
       });
+  return failure(passFailed);
 }
 
 LogicalResult