[llvm] 8ef460f - [llvm-reduce] Add parallel chunk processing.

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 24 01:24:11 PST 2021


Author: Florian Hahn
Date: 2021-11-24T09:23:52Z
New Revision: 8ef460fc5137816c0bcc9ec471e7d60d3cc2f5ee

URL: https://github.com/llvm/llvm-project/commit/8ef460fc5137816c0bcc9ec471e7d60d3cc2f5ee
DIFF: https://github.com/llvm/llvm-project/commit/8ef460fc5137816c0bcc9ec471e7d60d3cc2f5ee.diff

LOG: [llvm-reduce] Add parallel chunk processing.

This patch adds parallel processing of chunks. When reducing very large
inputs, e.g. functions with 500k basic blocks, processing chunks in
parallel can significantly speed up the reduction.

To allow modifying clones of the original module in parallel, each clone
needs its own LLVMContext object. To achieve this, each job parses the
input module with its own LLVMContext. In case a job successfully
reduces the input, it serializes the result module as bitcode into a
result array.

To ensure parallel reduction produces the same results as serial
reduction, only the first successfully reduced result is used, and
results of other successful jobs are dropped. Processing resumes after
the chunk that was successfully reduced.

The number of threads to use can be configured using the -j option.
It defaults to 1, which means serial processing.

Reviewed By: Meinersbur

Differential Revision: https://reviews.llvm.org/D113857

Added: 
    

Modified: 
    llvm/test/tools/llvm-reduce/operands-skip.ll
    llvm/tools/llvm-reduce/CMakeLists.txt
    llvm/tools/llvm-reduce/deltas/Delta.cpp

Removed: 
    


################################################################################
diff --git a/llvm/test/tools/llvm-reduce/operands-skip.ll b/llvm/test/tools/llvm-reduce/operands-skip.ll
index 5f13d59f939d0..da37f613e8e51 100644
--- a/llvm/test/tools/llvm-reduce/operands-skip.ll
+++ b/llvm/test/tools/llvm-reduce/operands-skip.ll
@@ -1,6 +1,13 @@
 ; RUN: llvm-reduce %s -o %t --delta-passes=operands-skip --test FileCheck --test-arg %s --test-arg --match-full-lines --test-arg --check-prefix=INTERESTING --test-arg --input-file
 ; RUN: FileCheck %s --input-file %t --check-prefixes=REDUCED
 
+; RUN: llvm-reduce -j 2 %s -o %t.1 --delta-passes=operands-skip --test FileCheck --test-arg %s --test-arg --match-full-lines --test-arg --check-prefix=INTERESTING --test-arg --input-file
+; RUN: FileCheck %s --input-file %t.1 --check-prefixes=REDUCED
+
+; RUN: llvm-reduce -j 4 %s -o %t.2 --delta-passes=operands-skip --test FileCheck --test-arg %s --test-arg --match-full-lines --test-arg --check-prefix=INTERESTING --test-arg --input-file
+; RUN: FileCheck %s --input-file %t.2 --check-prefixes=REDUCED
+
+
 ; INTERESTING: store i32 43, i32* {{(%imm|%indirect)}}, align 4
 ; REDUCED:     store i32 43, i32* %imm, align 4
 

diff --git a/llvm/tools/llvm-reduce/CMakeLists.txt b/llvm/tools/llvm-reduce/CMakeLists.txt
index c7933fcfc0cc4..6588fd8c77936 100644
--- a/llvm/tools/llvm-reduce/CMakeLists.txt
+++ b/llvm/tools/llvm-reduce/CMakeLists.txt
@@ -3,6 +3,7 @@ set(LLVM_LINK_COMPONENTS
   AllTargetsCodeGens
   AllTargetsDescs
   AllTargetsInfos
+  BitReader
   BitWriter
   CodeGen
   Core

diff --git a/llvm/tools/llvm-reduce/deltas/Delta.cpp b/llvm/tools/llvm-reduce/deltas/Delta.cpp
index f2807b05e0f81..4e1bd6d4f495c 100644
--- a/llvm/tools/llvm-reduce/deltas/Delta.cpp
+++ b/llvm/tools/llvm-reduce/deltas/Delta.cpp
@@ -15,9 +15,11 @@
 #include "Delta.h"
 #include "ReducerWorkItem.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ThreadPool.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include <fstream>
 #include <set>
@@ -37,6 +39,16 @@ static cl::opt<bool> TmpFilesAsBitcode(
     cl::desc("Write temporary files as bitcode, instead of textual IR"),
     cl::init(false));
 
+#ifdef LLVM_ENABLE_THREADS
+static cl::opt<unsigned> NumJobs(
+    "j",
+    cl::desc("Maximum number of threads to use to process chunks. Set to 1 to "
+             "disable parallelism."),
+    cl::init(1));
+#else
+unsigned NumJobs = 1;
+#endif
+
 void writeOutput(ReducerWorkItem &M, llvm::StringRef Message);
 
 bool isReduced(ReducerWorkItem &M, TestRunner &Test,
@@ -120,7 +132,8 @@ static bool increaseGranularity(std::vector<Chunk> &Chunks) {
 // modified module if the chunk resulted in a reduction.
 template <typename T>
 static std::unique_ptr<ReducerWorkItem>
-CheckChunk(Chunk &ChunkToCheckForUninterestingness, TestRunner &Test,
+CheckChunk(Chunk &ChunkToCheckForUninterestingness,
+           std::unique_ptr<ReducerWorkItem> Clone, TestRunner &Test,
            function_ref<void(Oracle &, T &)> ExtractChunksFromModule,
            std::set<Chunk> &UninterestingChunks,
            std::vector<Chunk> &ChunksStillConsideredInteresting) {
@@ -137,9 +150,6 @@ CheckChunk(Chunk &ChunkToCheckForUninterestingness, TestRunner &Test,
                    C != ChunkToCheckForUninterestingness;
           });
 
-  // Clone module before hacking it up..
-  std::unique_ptr<ReducerWorkItem> Clone =
-      cloneReducerWorkItem(Test.getProgram());
   // Generate Module with only Targets inside Current Chunks
   Oracle O(CurrentChunks);
   ExtractChunksFromModule(O, *Clone);
@@ -169,6 +179,36 @@ CheckChunk(Chunk &ChunkToCheckForUninterestingness, TestRunner &Test,
   return Clone;
 }
 
+template <typename T>
+SmallString<0> ProcessChunkFromSerializedBitcode(
+    Chunk &ChunkToCheckForUninterestingness, TestRunner &Test,
+    function_ref<void(Oracle &, T &)> ExtractChunksFromModule,
+    std::set<Chunk> &UninterestingChunks,
+    std::vector<Chunk> &ChunksStillConsideredInteresting,
+    SmallString<0> &OriginalBC, std::atomic<bool> &AnyReduced) {
+  LLVMContext Ctx;
+  Expected<std::unique_ptr<Module>> MOrErr = parseBitcodeFile(
+      MemoryBufferRef(StringRef(OriginalBC.data(), OriginalBC.size()),
+                      "<llvm-reduce tmp module>"),
+      Ctx);
+  if (!MOrErr)
+    report_fatal_error("Failed to read bitcode");
+  auto CloneMMM = std::make_unique<ReducerWorkItem>();
+  CloneMMM->M = std::move(MOrErr.get());
+
+  SmallString<0> Result;
+  if (std::unique_ptr<ReducerWorkItem> ChunkResult =
+          CheckChunk(ChunkToCheckForUninterestingness, std::move(CloneMMM),
+                     Test, ExtractChunksFromModule, UninterestingChunks,
+                     ChunksStillConsideredInteresting)) {
+    raw_svector_ostream BCOS(Result);
+    WriteBitcodeToFile(*ChunkResult->M, BCOS);
+    // Communicate that the task reduced a chunk.
+    AnyReduced = true;
+  }
+  return Result;
+}
+
 /// Runs the Delta Debugging algorithm, splits the code into chunks and
 /// reduces the amount of chunks that are considered interesting by the
 /// given test.
@@ -207,19 +247,112 @@ void runDeltaPassInt(
     increaseGranularity(ChunksStillConsideredInteresting);
   }
 
+  std::atomic<bool> AnyReduced;
+  std::unique_ptr<ThreadPool> ChunkThreadPoolPtr;
+  if (NumJobs > 1)
+    ChunkThreadPoolPtr =
+        std::make_unique<ThreadPool>(hardware_concurrency(NumJobs));
+
   bool FoundAtLeastOneNewUninterestingChunkWithCurrentGranularity;
   do {
     FoundAtLeastOneNewUninterestingChunkWithCurrentGranularity = false;
 
     std::set<Chunk> UninterestingChunks;
-    for (Chunk &ChunkToCheckForUninterestingness :
-         reverse(ChunksStillConsideredInteresting)) {
-      std::unique_ptr<ReducerWorkItem> Result = CheckChunk(
-          ChunkToCheckForUninterestingness, Test, ExtractChunksFromModule,
-          UninterestingChunks, ChunksStillConsideredInteresting);
+
+    // When running with more than one thread, serialize the original bitcode
+    // to OriginalBC.
+    SmallString<0> OriginalBC;
+    if (NumJobs > 1) {
+      raw_svector_ostream BCOS(OriginalBC);
+      WriteBitcodeToFile(*Test.getProgram().M, BCOS);
+    }
+
+    std::deque<std::future<SmallString<0>>> TaskQueue;
+    for (auto I = ChunksStillConsideredInteresting.rbegin(),
+              E = ChunksStillConsideredInteresting.rend();
+         I != E; ++I) {
+      std::unique_ptr<ReducerWorkItem> Result = nullptr;
+      unsigned WorkLeft = std::distance(I, E);
+
+      // Run in parallel mode, if the user requested more than one thread and
+      // there are at least a few chunks to process.
+      if (NumJobs > 1 && WorkLeft > 1) {
+        unsigned NumInitialTasks = std::min(WorkLeft, unsigned(NumJobs));
+        unsigned NumChunksProcessed = 0;
+
+        ThreadPool &ChunkThreadPool = *ChunkThreadPoolPtr;
+        TaskQueue.clear();
+
+        AnyReduced = false;
+        // Queue jobs to process NumInitialTasks chunks in parallel using
+        // ChunkThreadPool. When the tasks are added to the pool, parse the
+        // original module from OriginalBC with a fresh LLVMContext object. This
+        // ensures that the cloned module of each task uses an independent
+        // LLVMContext object. If a task reduces the input, serialize the result
+        // back in the corresponding Result element.
+        for (unsigned J = 0; J < NumInitialTasks; ++J) {
+          TaskQueue.emplace_back(ChunkThreadPool.async(
+              [J, I, &Test, &ExtractChunksFromModule, &UninterestingChunks,
+               &ChunksStillConsideredInteresting, &OriginalBC, &AnyReduced]() {
+                return ProcessChunkFromSerializedBitcode(
+                    *(I + J), Test, ExtractChunksFromModule,
+                    UninterestingChunks, ChunksStillConsideredInteresting,
+                    OriginalBC, AnyReduced);
+              }));
+        }
+
+        // Start processing results of the queued tasks. We wait for the first
+        // task in the queue to finish. If it reduced a chunk, we parse the
+        // result and exit the loop.
+        // Otherwise we will try to schedule a new task, if
+        //  * no other pending job reduced a chunk and
+        //  * we have not reached the end of the chunks.
+        while (!TaskQueue.empty()) {
+          auto &Future = TaskQueue.front();
+          Future.wait();
+
+          NumChunksProcessed++;
+          SmallString<0> Res = Future.get();
+          TaskQueue.pop_front();
+          if (Res.empty()) {
+            unsigned NumScheduledTasks = NumChunksProcessed + TaskQueue.size();
+            if (!AnyReduced && I + NumScheduledTasks != E) {
+              Chunk &ChunkToCheck = *(I + NumScheduledTasks);
+              TaskQueue.emplace_back(ChunkThreadPool.async(
+                  [&Test, &ExtractChunksFromModule, &UninterestingChunks,
+                   &ChunksStillConsideredInteresting, &OriginalBC,
+                   &ChunkToCheck, &AnyReduced]() {
+                    return ProcessChunkFromSerializedBitcode(
+                        ChunkToCheck, Test, ExtractChunksFromModule,
+                        UninterestingChunks, ChunksStillConsideredInteresting,
+                        OriginalBC, AnyReduced);
+                  }));
+            }
+            continue;
+          }
+
+          Expected<std::unique_ptr<Module>> MOrErr = parseBitcodeFile(
+              MemoryBufferRef(StringRef(Res.data(), Res.size()),
+                              "<llvm-reduce tmp module>"),
+              Test.getProgram().M->getContext());
+          if (!MOrErr)
+            report_fatal_error("Failed to read bitcode");
+          Result = std::make_unique<ReducerWorkItem>();
+          Result->M = std::move(MOrErr.get());
+          break;
+        }
+        // Forward I to the last chunk processed in parallel.
+        I += NumChunksProcessed - 1;
+      } else {
+        Result = CheckChunk(*I, cloneReducerWorkItem(Test.getProgram()), Test,
+                            ExtractChunksFromModule, UninterestingChunks,
+                            ChunksStillConsideredInteresting);
+      }
+
       if (!Result)
         continue;
 
+      Chunk &ChunkToCheckForUninterestingness = *I;
       FoundAtLeastOneNewUninterestingChunkWithCurrentGranularity = true;
       UninterestingChunks.insert(ChunkToCheckForUninterestingness);
       ReducedProgram = std::move(Result);


        


More information about the llvm-commits mailing list