[llvm] 3a6bfcf - [OpenMPOpt] Merge parallel regions

Giorgis Georgakoudis via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 9 09:59:11 PDT 2020


Author: Giorgis Georgakoudis
Date: 2020-10-09T09:59:04-07:00
New Revision: 3a6bfcf2f902341bb618c4d5e06b2b10d9f02c72

URL: https://github.com/llvm/llvm-project/commit/3a6bfcf2f902341bb618c4d5e06b2b10d9f02c72
DIFF: https://github.com/llvm/llvm-project/commit/3a6bfcf2f902341bb618c4d5e06b2b10d9f02c72.diff

LOG: [OpenMPOpt] Merge parallel regions

There are cases where generated OpenMP code consists of multiple,
consecutive OpenMP parallel regions, either because high-level
programming models, such as RAJA or Kokkos, lower to OpenMP code, or
simply because the programmer parallelized the code this way. This
optimization merges consecutive OpenMP parallel regions to: (1) reduce
the runtime overhead of re-activating a team of threads; (2) enlarge the
scope for other OpenMP optimizations, e.g., runtime call deduplication
and synchronization elimination.
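
For illustration, here is a minimal C sketch of the intended effect,
modeled on the merge_all test added below. The "after" function is a
hypothetical, source-level rendering of the result; the actual
transformation works at the IR level, outlining the merged region and
invoking it through a single __kmpc_fork_call:

    // Before: two consecutive parallel regions, i.e., two separate
    // __kmpc_fork_call invocations and two team activations.
    void merge_all(void) {
      int a = 1;
    #pragma omp parallel
      { a = 2; }
    #pragma omp parallel
      { a = 3; }
    }

    // After (conceptually): a single parallel region; the implicit
    // fork-join barrier between the original regions is replaced by an
    // explicit barrier.
    void merge_all_merged(void) {
      int a = 1;
    #pragma omp parallel
      {
        a = 2;
    #pragma omp barrier
        a = 3;
      }
    }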

This implementation merges parallel regions defensively: only when they
are within the same basic block and any in-between instructions are safe
to execute redundantly in parallel.
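
The merge_none test added below exercises this restriction; sketched in
C (mirroring that test), the store between the two regions is not safe
to execute redundantly by every thread of the would-be merged region, so
the regions are left unmerged:

    void merge_none(void) {
      int a = 1;
    #pragma omp parallel
      { a = 2; }
      a = 3; // unsafe in-between instruction; prevents the merge
    #pragma omp parallel
      { a = 4; }
    }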

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D83635

Added: 
    llvm/test/Transforms/OpenMP/parallel_region_merging.ll
    llvm/test/Transforms/OpenMP/parallel_region_merging_legacy_pm.ll

Modified: 
    llvm/lib/Transforms/IPO/OpenMPOpt.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index f1eb88e5ab34..5a2287c14b79 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -19,14 +19,15 @@
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/CallGraphSCCPass.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
 #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/Attributor.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/CallGraphUpdater.h"
-#include "llvm/Analysis/ValueTracking.h"
 
 using namespace llvm;
 using namespace omp;
@@ -38,6 +39,11 @@ static cl::opt<bool> DisableOpenMPOptimizations(
     cl::desc("Disable OpenMP specific optimizations."), cl::Hidden,
     cl::init(false));
 
+static cl::opt<bool> EnableParallelRegionMerging(
+    "openmp-opt-enable-merging", cl::ZeroOrMore,
+    cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden,
+    cl::init(false));
+
 static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
                                     cl::Hidden);
 static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
@@ -63,6 +69,8 @@ STATISTIC(NumOpenMPTargetRegionKernels,
 STATISTIC(
     NumOpenMPParallelRegionsReplacedInGPUStateMachine,
     "Number of OpenMP parallel regions replaced with ID in GPU state machines");
+STATISTIC(NumOpenMPParallelRegionsMerged,
+          "Number of OpenMP parallel regions merged");
 
 #if !defined(NDEBUG)
 static constexpr auto TAG = "[" DEBUG_TYPE "]";
@@ -505,12 +513,18 @@ struct OpenMPOpt {
     // Recollect uses, in case Attributor deleted any.
     OMPInfoCache.recollectUses();
 
-    Changed |= deduplicateRuntimeCalls();
     Changed |= deleteParallelRegions();
     if (HideMemoryTransferLatency)
       Changed |= hideMemTransfersLatency();
     if (remarksEnabled())
       analysisGlobalization();
+    Changed |= deduplicateRuntimeCalls();
+    if (EnableParallelRegionMerging) {
+      if (mergeParallelRegions()) {
+        deduplicateRuntimeCalls();
+        Changed = true;
+      }
+    }
 
     return Changed;
   }
@@ -575,6 +589,244 @@ struct OpenMPOpt {
   }
 
 private:
+  /// Merge parallel regions when it is safe.
+  bool mergeParallelRegions() {
+    const unsigned CallbackCalleeOperand = 2;
+    const unsigned CallbackFirstArgOperand = 3;
+    using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+
+    // Check if there are any __kmpc_fork_call calls to merge.
+    OMPInformationCache::RuntimeFunctionInfo &RFI =
+        OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
+
+    if (!RFI.Declaration)
+      return false;
+
+    // Check if there are any __kmpc_push_proc_bind calls for explicit affinities.
+    OMPInformationCache::RuntimeFunctionInfo &ProcBindRFI =
+        OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind];
+
+    // Defensively abort if explicit affinities are set.
+    // TODO: Track ICV proc_bind to merge when mergable regions have the same
+    // affinity.
+    if (ProcBindRFI.Declaration)
+      return false;
+
+    bool Changed = false;
+    LoopInfo *LI = nullptr;
+    DominatorTree *DT = nullptr;
+
+    SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap;
+
+    BasicBlock *StartBB = nullptr, *EndBB = nullptr;
+    auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+                         BasicBlock &ContinuationIP) {
+      BasicBlock *CGStartBB = CodeGenIP.getBlock();
+      BasicBlock *CGEndBB =
+          SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
+      assert(StartBB != nullptr && "StartBB should not be null");
+      CGStartBB->getTerminator()->setSuccessor(0, StartBB);
+      assert(EndBB != nullptr && "EndBB should not be null");
+      EndBB->getTerminator()->setSuccessor(0, CGEndBB);
+    };
+
+    auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+                      Value &VPtr, Value *&ReplacementValue) -> InsertPointTy {
+      ReplacementValue = &VPtr;
+      return CodeGenIP;
+    };
+
+    auto FiniCB = [&](InsertPointTy CodeGenIP) {};
+
+    // Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all
+    // contained in BB and only separated by instructions that can be
+    // redundantly executed in parallel. The block BB is split before the first
+    // call (in MergableCIs) and after the last so the entire region we merge
+    // into a single parallel region is contained in a single basic block
+    // without any other instructions. We use the OpenMPIRBuilder to outline
+    // that block and call the resulting function via __kmpc_fork_call.
+    auto Merge = [&](SmallVectorImpl<CallInst *> &MergableCIs, BasicBlock *BB) {
+      // TODO: Change the interface to allow single CIs expanded, e.g, to
+      // include an outer loop.
+      assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs");
+
+      auto Remark = [&](OptimizationRemark OR) {
+        OR << "Parallel region at "
+           << ore::NV("OpenMPParallelMergeFront",
+                      MergableCIs.front()->getDebugLoc())
+           << " merged with parallel regions at ";
+        for (auto *CI :
+             llvm::make_range(MergableCIs.begin() + 1, MergableCIs.end())) {
+          OR << ore::NV("OpenMPParallelMerge", CI->getDebugLoc());
+          if (CI != MergableCIs.back())
+            OR << ", ";
+        }
+        return OR;
+      };
+
+      emitRemark<OptimizationRemark>(MergableCIs.front(),
+                                     "OpenMPParallelRegionMerging", Remark);
+
+      Function *OriginalFn = BB->getParent();
+      LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size()
+                        << " parallel regions in " << OriginalFn->getName()
+                        << "\n");
+
+      // Isolate the calls to merge in a separate block.
+      EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI);
+      BasicBlock *AfterBB =
+          SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
+      StartBB = SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr,
+                           "omp.par.merged");
+
+      assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG");
+      const DebugLoc DL = BB->getTerminator()->getDebugLoc();
+      BB->getTerminator()->eraseFromParent();
+
+      OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
+                                               DL);
+      IRBuilder<>::InsertPoint AllocaIP(
+          &OriginalFn->getEntryBlock(),
+          OriginalFn->getEntryBlock().getFirstInsertionPt());
+      // Create the merged parallel region with default proc binding, to
+      // avoid overriding binding settings, and without explicit cancellation.
+      InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.CreateParallel(
+          Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr,
+          OMP_PROC_BIND_default, /* IsCancellable */ false);
+      BranchInst::Create(AfterBB, AfterIP.getBlock());
+
+      // Perform the actual outlining.
+      OMPInfoCache.OMPBuilder.finalize();
+
+      Function *OutlinedFn = MergableCIs.front()->getCaller();
+
+      // Replace the __kmpc_fork_call calls with direct calls to the outlined
+      // callbacks.
+      SmallVector<Value *, 8> Args;
+      for (auto *CI : MergableCIs) {
+        Value *Callee =
+            CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts();
+        FunctionType *FT =
+            cast<FunctionType>(Callee->getType()->getPointerElementType());
+        Args.clear();
+        Args.push_back(OutlinedFn->getArg(0));
+        Args.push_back(OutlinedFn->getArg(1));
+        for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands();
+             U < E; ++U)
+          Args.push_back(CI->getArgOperand(U));
+
+        CallInst *NewCI = CallInst::Create(FT, Callee, Args, "", CI);
+        if (CI->getDebugLoc())
+          NewCI->setDebugLoc(CI->getDebugLoc());
+
+        // Forward parameter attributes from the callback to the callee.
+        for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands();
+             U < E; ++U)
+          for (const Attribute &A : CI->getAttributes().getParamAttributes(U))
+            NewCI->addParamAttr(
+                U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);
+
+        // Emit an explicit barrier to replace the implicit fork-join barrier.
+        if (CI != MergableCIs.back()) {
+          // TODO: Remove barrier if the merged parallel region includes the
+          // 'nowait' clause.
+          OMPInfoCache.OMPBuilder.CreateBarrier(
+              InsertPointTy(NewCI->getParent(),
+                            NewCI->getNextNode()->getIterator()),
+              OMPD_parallel);
+        }
+
+        auto Remark = [&](OptimizationRemark OR) {
+          return OR << "Parallel region at "
+                    << ore::NV("OpenMPParallelMerge", CI->getDebugLoc())
+                    << " merged with "
+                    << ore::NV("OpenMPParallelMergeFront",
+                               MergableCIs.front()->getDebugLoc());
+        };
+        if (CI != MergableCIs.front())
+          emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionMerging",
+                                         Remark);
+
+        CI->eraseFromParent();
+      }
+
+      assert(OutlinedFn != OriginalFn && "Outlining failed");
+      CGUpdater.registerOutlinedFunction(*OutlinedFn);
+      CGUpdater.reanalyzeFunction(*OriginalFn);
+
+      NumOpenMPParallelRegionsMerged += MergableCIs.size();
+
+      return true;
+    };
+
+    // Helper function that identifies sequences of
+    // __kmpc_fork_call uses in a basic block.
+    auto DetectPRsCB = [&](Use &U, Function &F) {
+      CallInst *CI = getCallIfRegularCall(U, &RFI);
+      BB2PRMap[CI->getParent()].insert(CI);
+
+      return false;
+    };
+
+    BB2PRMap.clear();
+    RFI.foreachUse(SCC, DetectPRsCB);
+    SmallVector<SmallVector<CallInst *, 4>, 4> MergableCIsVector;
+    // Find mergable parallel regions within a basic block that are
+    // safe to merge, that is any in-between instructions can safely
+    // execute in parallel after merging.
+    // TODO: support merging across basic-blocks.
+    for (auto &It : BB2PRMap) {
+      auto &CIs = It.getSecond();
+      if (CIs.size() < 2)
+        continue;
+
+      BasicBlock *BB = It.getFirst();
+      SmallVector<CallInst *, 4> MergableCIs;
+
+      // Find maximal number of parallel region CIs that are safe to merge.
+      for (Instruction &I : *BB) {
+        if (CIs.count(&I)) {
+          MergableCIs.push_back(cast<CallInst>(&I));
+          continue;
+        }
+
+        if (isSafeToSpeculativelyExecute(&I, &I, DT))
+          continue;
+
+        if (MergableCIs.size() > 1) {
+          MergableCIsVector.push_back(MergableCIs);
+          LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size()
+                            << " parallel regions in block " << BB->getName()
+                            << " of function " << BB->getParent()->getName()
+                            << "\n";);
+        }
+
+        MergableCIs.clear();
+      }
+
+      if (!MergableCIsVector.empty()) {
+        Changed = true;
+
+        for (auto &MergableCIs : MergableCIsVector)
+          Merge(MergableCIs, BB);
+      }
+    }
+
+    if (Changed) {
+      // Update RFI info to set it up for later passes.
+      RFI.clearUsesMap();
+      OMPInfoCache.collectUses(RFI, /* CollectStats */ false);
+
+      // Collect uses for the emitted barrier call.
+      OMPInformationCache::RuntimeFunctionInfo &BarrierRFI =
+          OMPInfoCache.RFIs[OMPRTL___kmpc_barrier];
+      BarrierRFI.clearUsesMap();
+      OMPInfoCache.collectUses(BarrierRFI, /* CollectStats */ false);
+    }
+
+    return Changed;
+  }
+
   /// Try to delete parallel regions if possible.
   bool deleteParallelRegions() {
     const unsigned CallbackCalleeOperand = 2;

diff --git a/llvm/test/Transforms/OpenMP/parallel_region_merging.ll b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll
new file mode 100644
index 000000000000..708629996285
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll
@@ -0,0 +1,412 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
+; RUN: opt -S -passes='attributor,cgscc(openmpopt)' -openmp-opt-enable-merging  < %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+
+ at 0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+ at 1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
+
+;   void merge_all() {
+;       int a = 1;
+;   #pragma omp parallel
+;       {
+;           a = 2;
+;       }
+;   #pragma omp parallel
+;       {
+;           a = 3;
+;       }
+;   }
+;
+; Merge all parallel regions.
+define dso_local void @merge_all() local_unnamed_addr  {
+  %1 = alloca i32, align 4
+  %2 = bitcast i32* %1 to i8*
+  store i32 1, i32* %1, align 4
+  %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
+  %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par.1 to void (i32*, i32*, ...)*), i32* nonnull %1)
+  ret void
+}
+
+define internal void @merge_all..omp_par.1(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
+  store i32 3, i32* %2, align 4
+  ret void
+}
+
+define internal void @merge_all..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
+  store i32 2, i32* %2, align 4
+  ret void
+}
+
+
+declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr
+
+declare !callback !1 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) local_unnamed_addr
+
+;   void merge_none() {
+;       int a = 1;
+;   #pragma omp parallel
+;       {
+;           a = 2;
+;       }
+;       a = 3;
+;   #pragma omp parallel
+;       {
+;           a = 4;
+;       }
+;   }
+;
+; Does not merge parallel regions, in-between store
+; instruction is unsafe to execute in parallel.
+define dso_local void @merge_none() local_unnamed_addr  {
+  %1 = alloca i32, align 4
+  %2 = bitcast i32* %1 to i8*
+  store i32 1, i32* %1, align 4
+  %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
+  store i32 3, i32* %1, align 4
+  %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par.2 to void (i32*, i32*, ...)*), i32* nonnull %1)
+  ret void
+}
+
+define internal void @merge_none..omp_par.2(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
+  store i32 4, i32* %2, align 4
+  ret void
+}
+
+define internal void @merge_none..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
+  store i32 2, i32* %2, align 4
+  ret void
+}
+
+;   void merge_some() {
+;       int a = 1;
+;   #pragma omp parallel
+;       {
+;           a = 2;
+;       }
+;       a = 3;
+;   #pragma omp parallel
+;       {
+;           a = 4;
+;       }
+;   #pragma omp parallel
+;       {
+;           a = 5;
+;       }
+;   }
+;
+; Do not merge first parallel region, due to the
+; unsafe store, but merge the two next parallel
+; regions.
+define dso_local void @merge_some() local_unnamed_addr  {
+  %1 = alloca i32, align 4
+  %2 = bitcast i32* %1 to i8*
+  store i32 1, i32* %1, align 4
+  %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
+  store i32 3, i32* %1, align 4
+  %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.3 to void (i32*, i32*, ...)*), i32* nonnull %1)
+  %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.4 to void (i32*, i32*, ...)*), i32* nonnull %1)
+  ret void
+}
+
+define internal void @merge_some..omp_par.4(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
+  store i32 5, i32* %2, align 4
+  ret void
+}
+
+define internal void @merge_some..omp_par.3(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
+  store i32 4, i32* %2, align 4
+  ret void
+}
+
+define internal void @merge_some..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
+  store i32 2, i32* %2, align 4
+  ret void
+}
+
+;   void merge_cancellable_regions(int cancel1, int cancel2)
+;   {
+;   #pragma omp parallel
+;       {
+;           if(cancel1) {
+;   #pragma omp cancel parallel
+;           }
+;       }
+;   
+;   #pragma omp parallel
+;       {
+;           if (cancel2) {
+;   #pragma omp cancel parallel
+;           }
+;       }
+;   }
+;
+; Merge correctly cancellable regions.
+define dso_local void @merge_cancellable_regions(i32 %0, i32 %1) local_unnamed_addr  {
+  %3 = alloca i32, align 4
+  %4 = alloca i32, align 4
+  store i32 %0, i32* %3, align 4
+  store i32 %1, i32* %4, align 4
+  %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_cancellable_regions..omp_par to void (i32*, i32*, ...)*), i32* nonnull %3)
+  %6 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_cancellable_regions..omp_par.5 to void (i32*, i32*, ...)*), i32* nonnull %4)
+  ret void
+}
+
+define internal void @merge_cancellable_regions..omp_par.5(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture readonly %2)  {
+  %4 = load i32, i32* %2, align 4
+  %5 = icmp eq i32 %4, 0
+  br i1 %5, label %6, label %7
+
+6:                                                ; preds = %3
+  ret void
+
+7:                                                ; preds = %3
+  %8 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  %9 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %8, i32 1)
+  ret void
+}
+
+define internal void @merge_cancellable_regions..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture readonly %2)  {
+  %4 = load i32, i32* %2, align 4
+  %5 = icmp eq i32 %4, 0
+  br i1 %5, label %6, label %7
+
+6:                                                ; preds = %3
+  ret void
+
+7:                                                ; preds = %3
+  %8 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  %9 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %8, i32 1)
+  ret void
+}
+
+declare i32 @__kmpc_cancel(%struct.ident_t*, i32, i32) local_unnamed_addr
+
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!2}
+!2 = !{i64 2, i64 -1, i64 -1, i1 true}
+; CHECK-LABEL: define {{[^@]+}}@merge_all() local_unnamed_addr {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1:@.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 1, i32* [[TMP2]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par.3 to void (i32*, i32*, ...)*), i32* [[TMP2]])
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK:       omp.par.outlined.exit:
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[DOTSPLIT_SPLIT:%.*]]
+; CHECK:       .split.split:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par.3
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]]) [[ATTR0:#.*]] {
+; CHECK-NEXT:  omp.par.entry:
+; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
+; CHECK:       omp.par.outlined.exit.exitStub:
+; CHECK-NEXT:    ret void
+; CHECK:       omp.par.region:
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
+; CHECK:       omp.par.merged:
+; CHECK-NEXT:    call void @merge_all..omp_par(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    call void @merge_all..omp_par.1(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
+; CHECK:       .split:
+; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK:       omp.par.region.split:
+; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK:       omp.par.pre_finalize:
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par.1
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1:#.*]] {
+; CHECK-NEXT:    store i32 3, i32* [[TMP2]], align 4
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT:    store i32 2, i32* [[TMP2]], align 4
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_none() local_unnamed_addr {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 1, i32* [[TMP2]], align 4
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
+; CHECK-NEXT:    store i32 3, i32* [[TMP2]], align 4
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par.2 to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_none..omp_par.2
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT:    store i32 4, i32* [[TMP2]], align 4
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_none..omp_par
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT:    store i32 2, i32* [[TMP2]], align 4
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some() local_unnamed_addr {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 1, i32* [[TMP2]], align 4
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
+; CHECK-NEXT:    store i32 3, i32* [[TMP2]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.2 to void (i32*, i32*, ...)*), i32* [[TMP2]])
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK:       omp.par.outlined.exit:
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[DOTSPLIT_SPLIT:%.*]]
+; CHECK:       .split.split:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.2
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]]) [[ATTR0]] {
+; CHECK-NEXT:  omp.par.entry:
+; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
+; CHECK:       omp.par.outlined.exit.exitStub:
+; CHECK-NEXT:    ret void
+; CHECK:       omp.par.region:
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
+; CHECK:       omp.par.merged:
+; CHECK-NEXT:    call void @merge_some..omp_par.3(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    call void @merge_some..omp_par.4(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
+; CHECK:       .split:
+; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK:       omp.par.region.split:
+; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK:       omp.par.pre_finalize:
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.4
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT:    store i32 5, i32* [[TMP2]], align 4
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.3
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT:    store i32 4, i32* [[TMP2]], align 4
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT:    store i32 2, i32* [[TMP2]], align 4
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions
+; CHECK-SAME: (i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TMP4]], align 4
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[TMP5]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @merge_cancellable_regions..omp_par.1 to void (i32*, i32*, ...)*), i32* [[TMP4]], i32* [[TMP5]])
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK:       omp.par.outlined.exit:
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[DOTSPLIT_SPLIT:%.*]]
+; CHECK:       .split.split:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par.1
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]], i32* [[TMP1:%.*]]) [[ATTR0]] {
+; CHECK-NEXT:  omp.par.entry:
+; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
+; CHECK:       omp.par.outlined.exit.exitStub:
+; CHECK-NEXT:    ret void
+; CHECK:       omp.par.region:
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
+; CHECK:       omp.par.merged:
+; CHECK-NEXT:    call void @merge_cancellable_regions..omp_par(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    call void @merge_cancellable_regions..omp_par.5(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP1]])
+; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
+; CHECK:       .split:
+; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK:       omp.par.region.split:
+; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK:       omp.par.pre_finalize:
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par.5
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP2:%.*]]) {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; CHECK:       6:
+; CHECK-NEXT:    ret void
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull [[GLOB1]])
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP8]], i32 noundef 1)
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP2:%.*]]) {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; CHECK:       6:
+; CHECK-NEXT:    ret void
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull [[GLOB1]])
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP8]], i32 noundef 1)
+; CHECK-NEXT:    ret void
+;

diff --git a/llvm/test/Transforms/OpenMP/parallel_region_merging_legacy_pm.ll b/llvm/test/Transforms/OpenMP/parallel_region_merging_legacy_pm.ll
new file mode 100644
index 000000000000..5da416b8c61a
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/parallel_region_merging_legacy_pm.ll
@@ -0,0 +1,412 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
+; RUN: opt -S -attributor -openmpopt -openmp-opt-enable-merging  < %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+
+ at 0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+ at 1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
+
+;   void merge_all() {
+;       int a = 1;
+;   #pragma omp parallel
+;       {
+;           a = 2;
+;       }
+;   #pragma omp parallel
+;       {
+;           a = 3;
+;       }
+;   }
+;
+; Merge all parallel regions.
+define dso_local void @merge_all() local_unnamed_addr  {
+  %1 = alloca i32, align 4
+  %2 = bitcast i32* %1 to i8*
+  store i32 1, i32* %1, align 4
+  %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
+  %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par.1 to void (i32*, i32*, ...)*), i32* nonnull %1)
+  ret void
+}
+
+define internal void @merge_all..omp_par.1(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
+  store i32 3, i32* %2, align 4
+  ret void
+}
+
+define internal void @merge_all..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
+  store i32 2, i32* %2, align 4
+  ret void
+}
+
+
+declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr
+
+declare !callback !1 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) local_unnamed_addr
+
+;   void merge_none() {
+;       int a = 1;
+;   #pragma omp parallel
+;       {
+;           a = 2;
+;       }
+;       a = 3;
+;   #pragma omp parallel
+;       {
+;           a = 4;
+;       }
+;   }
+;
+; Does not merge parallel regions, in-between store
+; instruction is unsafe to execute in parallel.
+define dso_local void @merge_none() local_unnamed_addr  {
+  %1 = alloca i32, align 4
+  %2 = bitcast i32* %1 to i8*
+  store i32 1, i32* %1, align 4
+  %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
+  store i32 3, i32* %1, align 4
+  %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par.2 to void (i32*, i32*, ...)*), i32* nonnull %1)
+  ret void
+}
+
+define internal void @merge_none..omp_par.2(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
+  store i32 4, i32* %2, align 4
+  ret void
+}
+
+define internal void @merge_none..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
+  store i32 2, i32* %2, align 4
+  ret void
+}
+
+;   void merge_some() {
+;       int a = 1;
+;   #pragma omp parallel
+;       {
+;           a = 2;
+;       }
+;       a = 3;
+;   #pragma omp parallel
+;       {
+;           a = 4;
+;       }
+;   #pragma omp parallel
+;       {
+;           a = 5;
+;       }
+;   }
+;
+; Do not merge first parallel region, due to the
+; unsafe store, but merge the two next parallel
+; regions.
+define dso_local void @merge_some() local_unnamed_addr  {
+  %1 = alloca i32, align 4
+  %2 = bitcast i32* %1 to i8*
+  store i32 1, i32* %1, align 4
+  %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
+  store i32 3, i32* %1, align 4
+  %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.3 to void (i32*, i32*, ...)*), i32* nonnull %1)
+  %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.4 to void (i32*, i32*, ...)*), i32* nonnull %1)
+  ret void
+}
+
+define internal void @merge_some..omp_par.4(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
+  store i32 5, i32* %2, align 4
+  ret void
+}
+
+define internal void @merge_some..omp_par.3(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
+  store i32 4, i32* %2, align 4
+  ret void
+}
+
+define internal void @merge_some..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
+  store i32 2, i32* %2, align 4
+  ret void
+}
+
+;   void merge_cancellable_regions(int cancel1, int cancel2)
+;   {
+;   #pragma omp parallel
+;       {
+;           if(cancel1) {
+;   #pragma omp cancel parallel
+;           }
+;       }
+;   
+;   #pragma omp parallel
+;       {
+;           if (cancel2) {
+;   #pragma omp cancel parallel
+;           }
+;       }
+;   }
+;
+; Merge correctly cancellable regions.
+define dso_local void @merge_cancellable_regions(i32 %0, i32 %1) local_unnamed_addr  {
+  %3 = alloca i32, align 4
+  %4 = alloca i32, align 4
+  store i32 %0, i32* %3, align 4
+  store i32 %1, i32* %4, align 4
+  %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_cancellable_regions..omp_par to void (i32*, i32*, ...)*), i32* nonnull %3)
+  %6 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_cancellable_regions..omp_par.5 to void (i32*, i32*, ...)*), i32* nonnull %4)
+  ret void
+}
+
+define internal void @merge_cancellable_regions..omp_par.5(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture readonly %2)  {
+  %4 = load i32, i32* %2, align 4
+  %5 = icmp eq i32 %4, 0
+  br i1 %5, label %6, label %7
+
+6:                                                ; preds = %3
+  ret void
+
+7:                                                ; preds = %3
+  %8 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  %9 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %8, i32 1)
+  ret void
+}
+
+define internal void @merge_cancellable_regions..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture readonly %2)  {
+  %4 = load i32, i32* %2, align 4
+  %5 = icmp eq i32 %4, 0
+  br i1 %5, label %6, label %7
+
+6:                                                ; preds = %3
+  ret void
+
+7:                                                ; preds = %3
+  %8 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  %9 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %8, i32 1)
+  ret void
+}
+
+declare i32 @__kmpc_cancel(%struct.ident_t*, i32, i32) local_unnamed_addr
+
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!2}
+!2 = !{i64 2, i64 -1, i64 -1, i1 true}
+; CHECK-LABEL: define {{[^@]+}}@merge_all() local_unnamed_addr {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1:@.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 1, i32* [[TMP2]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par.2 to void (i32*, i32*, ...)*), i32* [[TMP2]])
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK:       omp.par.outlined.exit:
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[DOTSPLIT_SPLIT:%.*]]
+; CHECK:       .split.split:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par.2
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]]) [[ATTR0:#.*]] {
+; CHECK-NEXT:  omp.par.entry:
+; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
+; CHECK:       omp.par.outlined.exit.exitStub:
+; CHECK-NEXT:    ret void
+; CHECK:       omp.par.region:
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
+; CHECK:       omp.par.merged:
+; CHECK-NEXT:    call void @merge_all..omp_par(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    call void @merge_all..omp_par.1(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
+; CHECK:       .split:
+; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK:       omp.par.region.split:
+; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK:       omp.par.pre_finalize:
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par.1
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1:#.*]] {
+; CHECK-NEXT:    store i32 3, i32* [[TMP2]], align 4
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT:    store i32 2, i32* [[TMP2]], align 4
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_none() local_unnamed_addr {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 1, i32* [[TMP2]], align 4
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
+; CHECK-NEXT:    store i32 3, i32* [[TMP2]], align 4
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par.2 to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_none..omp_par.2
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT:    store i32 4, i32* [[TMP2]], align 4
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_none..omp_par
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT:    store i32 2, i32* [[TMP2]], align 4
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some() local_unnamed_addr {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 1, i32* [[TMP2]], align 4
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
+; CHECK-NEXT:    store i32 3, i32* [[TMP2]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.5 to void (i32*, i32*, ...)*), i32* [[TMP2]])
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK:       omp.par.outlined.exit:
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[DOTSPLIT_SPLIT:%.*]]
+; CHECK:       .split.split:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.5
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]]) [[ATTR0]] {
+; CHECK-NEXT:  omp.par.entry:
+; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
+; CHECK:       omp.par.outlined.exit.exitStub:
+; CHECK-NEXT:    ret void
+; CHECK:       omp.par.region:
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
+; CHECK:       omp.par.merged:
+; CHECK-NEXT:    call void @merge_some..omp_par.3(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    call void @merge_some..omp_par.4(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
+; CHECK:       .split:
+; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK:       omp.par.region.split:
+; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK:       omp.par.pre_finalize:
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.4
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT:    store i32 5, i32* [[TMP2]], align 4
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.3
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT:    store i32 4, i32* [[TMP2]], align 4
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
+; CHECK-NEXT:    store i32 2, i32* [[TMP2]], align 4
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions
+; CHECK-SAME: (i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TMP4]], align 4
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[TMP5]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @merge_cancellable_regions..omp_par.6 to void (i32*, i32*, ...)*), i32* [[TMP4]], i32* [[TMP5]])
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK:       omp.par.outlined.exit:
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[DOTSPLIT_SPLIT:%.*]]
+; CHECK:       .split.split:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par.6
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]], i32* [[TMP1:%.*]]) [[ATTR0]] {
+; CHECK-NEXT:  omp.par.entry:
+; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
+; CHECK:       omp.par.outlined.exit.exitStub:
+; CHECK-NEXT:    ret void
+; CHECK:       omp.par.region:
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
+; CHECK:       omp.par.merged:
+; CHECK-NEXT:    call void @merge_cancellable_regions..omp_par(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    call void @merge_cancellable_regions..omp_par.5(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP1]])
+; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
+; CHECK:       .split:
+; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK:       omp.par.region.split:
+; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK:       omp.par.pre_finalize:
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par.5
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP2:%.*]]) {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; CHECK:       6:
+; CHECK-NEXT:    ret void
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull [[GLOB1]])
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP8]], i32 noundef 1)
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP2:%.*]]) {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; CHECK:       6:
+; CHECK-NEXT:    ret void
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull [[GLOB1]])
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP8]], i32 noundef 1)
+; CHECK-NEXT:    ret void
+;


        

