[llvm-branch-commits] [llvm] 9751705 - [OpenMPOpt][WIP] Expand parallel region merging

Mon Jan 11 08:11:16 PST 2021

Author: Giorgis Georgakoudis
Date: 2021-01-11T08:06:23-08:00
New Revision: 975170551283559ebe5052b6f83b2cc9e50132db

URL: https://github.com/llvm/llvm-project/commit/975170551283559ebe5052b6f83b2cc9e50132db
DIFF: https://github.com/llvm/llvm-project/commit/975170551283559ebe5052b6f83b2cc9e50132db.diff

LOG: [OpenMPOpt][WIP] Expand parallel region merging

The existing implementation of parallel region merging applies only to
consecutive parallel regions that have speculatable sequential
instructions in-between. This patch lifts this limitation to expand
merging with any sequential instructions in-between, except calls to
unmergable OpenMP runtime functions. In-between sequential instructions
in the merged region are sequentialized in a "master" region and any
output values are broadcasted to the following parallel regions and the
sequential region continuation of the merged region.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D90909

Added: 
    

Modified: 
    llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
    llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
    llvm/lib/Transforms/IPO/OpenMPOpt.cpp
    llvm/test/Transforms/OpenMP/parallel_region_merging.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 68df3f2271d98..8e95226d3895e 100644

--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -38,7 +38,10 @@ class OpenMPIRBuilder {
   void initialize();
 
   /// Finalize the underlying module, e.g., by outlining regions.
-  void finalize();
+  /// \param AllowExtractorSinking Flag to include sinking instructions,
+  ///                              emitted by CodeExtractor, in the
+  ///                              outlined region. Default is false.
+  void finalize(bool AllowExtractorSinking = false);
 
   /// Add attributes known for \p FnID to \p Fn.
   void addAttributes(omp::RuntimeFunction FnID, Function &Fn);

diff  --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index d120eaa17fb77..fdedb6beac359 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -127,7 +127,7 @@ Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
 
 void OpenMPIRBuilder::initialize() { initializeTypes(M); }
 
-void OpenMPIRBuilder::finalize() {
+void OpenMPIRBuilder::finalize(bool AllowExtractorSinking) {
   SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
   SmallVector<BasicBlock *, 32> Blocks;
   for (OutlineInfo &OI : OutlineInfos) {
@@ -170,6 +170,25 @@ void OpenMPIRBuilder::finalize() {
       BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
       assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
       assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
+      if (AllowExtractorSinking) {
+        // Move instructions from the to-be-deleted ArtificialEntry to the entry
+        // basic block of the parallel region. CodeExtractor may have sunk
+        // allocas/bitcasts for values that are solely used in the outlined
+        // region and do not escape.
+        assert(!ArtificialEntry.empty() &&
+               "Expected instructions to sink in the outlined region");
+        for (BasicBlock::iterator It = ArtificialEntry.begin(),
+                                  End = ArtificialEntry.end();
+             It != End;) {
+          Instruction &I = *It;
+          It++;
+
+          if (I.isTerminator())
+            continue;
+
+          I.moveBefore(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
+        }
+      }
       OI.EntryBB->moveBefore(&ArtificialEntry);
       ArtificialEntry.eraseFromParent();
     }

diff  --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 389ed1ebad6d5..fc3cc90c6f808 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -28,6 +28,7 @@
 #include "llvm/Transforms/IPO/Attributor.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/CallGraphUpdater.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
 
 using namespace llvm;
 using namespace omp;
@@ -317,13 +318,17 @@ struct OMPInformationCache : public InformationCache {
     return NumUses;
   }
 
+  // Helper function to recollect uses of a runtime function.
+  void recollectUsesForFunction(RuntimeFunction RTF) {
+    auto &RFI = RFIs[RTF];
+    RFI.clearUsesMap();
+    collectUses(RFI, /*CollectStats*/ false);
+  }
+
   // Helper function to recollect uses of all runtime functions.
   void recollectUses() {
-    for (int Idx = 0; Idx < RFIs.size(); ++Idx) {
-      auto &RFI = RFIs[static_cast<RuntimeFunction>(Idx)];
-      RFI.clearUsesMap();
-      collectUses(RFI, /*CollectStats*/ false);
-    }
+    for (int Idx = 0; Idx < RFIs.size(); ++Idx)
+      recollectUsesForFunction(static_cast<RuntimeFunction>(Idx));
   }
 
   /// Helper to initialize all runtime function information for those defined
@@ -601,15 +606,11 @@ struct OpenMPOpt {
     if (!RFI.Declaration)
       return false;
 
-    // Check if there any __kmpc_push_proc_bind calls for explicit affinities.
-    OMPInformationCache::RuntimeFunctionInfo &ProcBindRFI =
-        OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind];
-
-    // Defensively abort if explicit affinities are set.
-    // TODO: Track ICV proc_bind to merge when mergable regions have the same
-    // affinity.
-    if (ProcBindRFI.Declaration)
-      return false;
+    // Unmergable calls that prevent merging a parallel region.
+    OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
+        OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],
+        OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],
+    };
 
     bool Changed = false;
     LoopInfo *LI = nullptr;
@@ -637,6 +638,90 @@ struct OpenMPOpt {
 
     auto FiniCB = [&](InsertPointTy CodeGenIP) {};
 
+    /// Create a sequential execution region within a merged parallel region,
+    /// encapsulated in a master construct with a barrier for synchronization.
+    auto CreateSequentialRegion = [&](Function *OuterFn,
+                                      BasicBlock *OuterPredBB,
+                                      Instruction *SeqStartI,
+                                      Instruction *SeqEndI) {
+      // Isolate the instructions of the sequential region to a separate
+      // block.
+      BasicBlock *ParentBB = SeqStartI->getParent();
+      BasicBlock *SeqEndBB =
+          SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);
+      BasicBlock *SeqAfterBB =
+          SplitBlock(SeqEndBB, &*SeqEndBB->getFirstInsertionPt(), DT, LI);
+      BasicBlock *SeqStartBB =
+          SplitBlock(ParentBB, SeqStartI, DT, LI, nullptr, "seq.par.merged");
+
+      assert(ParentBB->getUniqueSuccessor() == SeqStartBB &&
+             "Expected a 
diff erent CFG");
+      const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();
+      ParentBB->getTerminator()->eraseFromParent();
+
+      auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+                           BasicBlock &ContinuationIP) {
+        BasicBlock *CGStartBB = CodeGenIP.getBlock();
+        BasicBlock *CGEndBB =
+            SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
+        assert(SeqStartBB != nullptr && "SeqStartBB should not be null");
+        CGStartBB->getTerminator()->setSuccessor(0, SeqStartBB);
+        assert(SeqEndBB != nullptr && "SeqEndBB should not be null");
+        SeqEndBB->getTerminator()->setSuccessor(0, CGEndBB);
+      };
+      auto FiniCB = [&](InsertPointTy CodeGenIP) {};
+
+      // Find outputs from the sequential region to outside users and
+      // broadcast their values to them.
+      for (Instruction &I : *SeqStartBB) {
+        SmallPtrSet<Instruction *, 4> OutsideUsers;
+        for (User *Usr : I.users()) {
+          Instruction &UsrI = *cast<Instruction>(Usr);
+          // Ignore outputs to LT intrinsics, code extraction for the merged
+          // parallel region will fix them.
+          if (UsrI.isLifetimeStartOrEnd())
+            continue;
+
+          if (UsrI.getParent() != SeqStartBB)
+            OutsideUsers.insert(&UsrI);
+        }
+
+        if (OutsideUsers.empty())
+          continue;
+
+        // Emit an alloca in the outer region to store the broadcasted
+        // value.
+        const DataLayout &DL = M.getDataLayout();
+        AllocaInst *AllocaI = new AllocaInst(
+            I.getType(), DL.getAllocaAddrSpace(), nullptr,
+            I.getName() + ".seq.output.alloc", &OuterFn->front().front());
+
+        // Emit a store instruction in the sequential BB to update the
+        // value.
+        new StoreInst(&I, AllocaI, SeqStartBB->getTerminator());
+
+        // Emit a load instruction and replace the use of the output value
+        // with it.
+        for (Instruction *UsrI : OutsideUsers) {
+          LoadInst *LoadI = new LoadInst(I.getType(), AllocaI,
+                                         I.getName() + ".seq.output.load", UsrI);
+          UsrI->replaceUsesOfWith(&I, LoadI);
+        }
+      }
+
+      OpenMPIRBuilder::LocationDescription Loc(
+          InsertPointTy(ParentBB, ParentBB->end()), DL);
+      InsertPointTy SeqAfterIP =
+          OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB);
+
+      OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel);
+
+      BranchInst::Create(SeqAfterBB, SeqAfterIP.getBlock());
+
+      LLVM_DEBUG(dbgs() << TAG << "After sequential inlining " << *OuterFn
+                        << "\n");
+    };
+
     // Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all
     // contained in BB and only separated by instructions that can be
     // redundantly executed in parallel. The block BB is split before the first
@@ -682,6 +767,21 @@ struct OpenMPOpt {
       const DebugLoc DL = BB->getTerminator()->getDebugLoc();
       BB->getTerminator()->eraseFromParent();
 
+      // Create sequential regions for sequential instructions that are
+      // in-between mergable parallel regions.
+      for (auto *It = MergableCIs.begin(), *End = MergableCIs.end() - 1;
+           It != End; ++It) {
+        Instruction *ForkCI = *It;
+        Instruction *NextForkCI = *(It + 1);
+
+        // Continue if there are not in-between instructions.
+        if (ForkCI->getNextNode() == NextForkCI)
+          continue;
+
+        CreateSequentialRegion(OriginalFn, BB, ForkCI->getNextNode(),
+                               NextForkCI->getPrevNode());
+      }
+
       OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
                                                DL);
       IRBuilder<>::InsertPoint AllocaIP(
@@ -695,7 +795,7 @@ struct OpenMPOpt {
       BranchInst::Create(AfterBB, AfterIP.getBlock());
 
       // Perform the actual outlining.
-      OMPInfoCache.OMPBuilder.finalize();
+      OMPInfoCache.OMPBuilder.finalize(/* AllowExtractorSinking */ true);
 
       Function *OutlinedFn = MergableCIs.front()->getCaller();
 
@@ -782,16 +882,75 @@ struct OpenMPOpt {
       BasicBlock *BB = It.getFirst();
       SmallVector<CallInst *, 4> MergableCIs;
 
+      /// Returns true if the instruction is mergable, false otherwise.
+      /// A terminator instruction is unmergable by definition since merging
+      /// works within a BB. Instructions before the mergable region are
+      /// mergable if they are not calls to OpenMP runtime functions that may
+      /// set 
diff erent execution parameters for subsequent parallel regions.
+      /// Instructions in-between parallel regions are mergable if they are not
+      /// calls to any non-intrinsic function since that may call a non-mergable
+      /// OpenMP runtime function.
+      auto IsMergable = [&](Instruction &I, bool IsBeforeMergableRegion) {
+        // We do not merge across BBs, hence return false (unmergable) if the
+        // instruction is a terminator.
+        if (I.isTerminator())
+          return false;
+
+        if (!isa<CallInst>(&I))
+          return true;
+
+        CallInst *CI = cast<CallInst>(&I);
+        if (IsBeforeMergableRegion) {
+          Function *CalledFunction = CI->getCalledFunction();
+          if (!CalledFunction)
+            return false;
+          // Return false (unmergable) if the call before the parallel
+          // region calls an explicit affinity (proc_bind) or number of
+          // threads (num_threads) compiler-generated function. Those settings
+          // may be incompatible with following parallel regions.
+          // TODO: ICV tracking to detect compatibility.
+          for (const auto &RFI : UnmergableCallsInfo) {
+            if (CalledFunction == RFI.Declaration)
+              return false;
+          }
+        } else {
+          // Return false (unmergable) if there is a call instruction
+          // in-between parallel regions when it is not an intrinsic. It
+          // may call an unmergable OpenMP runtime function in its callpath.
+          // TODO: Keep track of possible OpenMP calls in the callpath.
+          if (!isa<IntrinsicInst>(CI))
+            return false;
+        }
+
+        return true;
+      };
       // Find maximal number of parallel region CIs that are safe to merge.
-      for (Instruction &I : *BB) {
+      for (auto It = BB->begin(), End = BB->end(); It != End;) {
+        Instruction &I = *It;
+        ++It;
+
         if (CIs.count(&I)) {
           MergableCIs.push_back(cast<CallInst>(&I));
           continue;
         }
 
-        if (isSafeToSpeculativelyExecute(&I, &I, DT))
+        // Continue expanding if the instruction is mergable.
+        if (IsMergable(I, MergableCIs.empty()))
           continue;
 
+        // Forward the instruction iterator to skip the next parallel region
+        // since there is an unmergable instruction which can affect it.
+        for (; It != End; ++It) {
+          Instruction &SkipI = *It;
+          if (CIs.count(&SkipI)) {
+            LLVM_DEBUG(dbgs() << TAG << "Skip parallel region " << SkipI
+                              << " due to " << I << "\n");
+            ++It;
+            break;
+          }
+        }
+
+        // Store mergable regions found.
         if (MergableCIs.size() > 1) {
           MergableCIsVector.push_back(MergableCIs);
           LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size()
@@ -812,15 +971,12 @@ struct OpenMPOpt {
     }
 
     if (Changed) {
-      // Update RFI info to set it up for later passes.
-      RFI.clearUsesMap();
-      OMPInfoCache.collectUses(RFI, /* CollectStats */ false);
-
-      // Collect uses for the emitted barrier call.
-      OMPInformationCache::RuntimeFunctionInfo &BarrierRFI =
-          OMPInfoCache.RFIs[OMPRTL___kmpc_barrier];
-      BarrierRFI.clearUsesMap();
-      OMPInfoCache.collectUses(BarrierRFI, /* CollectStats */ false);
+      /// Re-collect use for fork calls, emitted barrier calls, and
+      /// any emitted master/end_master calls.
+      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call);
+      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier);
+      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master);
+      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master);
     }
 
     return Changed;

diff  --git a/llvm/test/Transforms/OpenMP/parallel_region_merging.ll b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll
index 56e6edf9883e0..c7d267f6e355b 100644
--- a/llvm/test/Transforms/OpenMP/parallel_region_merging.ll
+++ b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll
@@ -1,7 +1,232 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
-; RUN: opt -S -attributor -openmpopt -openmp-opt-enable-merging  < %s -enable-new-pm=0 | FileCheck %s
-; RUN: opt -S -passes='attributor,cgscc(openmpopt)' -openmp-opt-enable-merging < %s | FileCheck %s
-
+; RUN: opt -S -attributor -openmpopt -openmp-opt-enable-merging -enable-new-pm=0 < %s | FileCheck %s
+; RUN: opt -S -passes='attributor,cgscc(openmpopt)' -openmp-opt-enable-merging  < %s | FileCheck %s
+; #include <omp.h>
+; void foo();
+; void use(int);
+; void usef(float);
+; void merge(int a) {
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; }
+; void unmergable_proc_bind(int a) {
+; #pragma omp parallel proc_bind(close)
+;     {
+;         use(a);
+;     }
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; }
+; void unmergable_num_threads(int a) {
+; #pragma omp parallel num_threads(a)
+;     {
+;         use(a);
+;     }
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; }
+; void unmergable_seq_call(int a) {
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+;     foo();
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; }
+; void merge_seq(int a) {
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+;     a = a + 1;
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+;     use(a);
+; }
+; void merge_seq_float(float f, float *p) {
+; #pragma omp parallel
+;     {
+;         use(f);
+;     }
+;     *p = f + 3.14f;
+; #pragma omp parallel
+;     {
+;         use(f);
+;     }
+; }
+; void merge_seq_firstprivate(int a) {
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+;     a = a + 1;
+; #pragma omp parallel firstprivate(a)
+;     {
+;         use(a);
+;     }
+;     use(a);
+; }
+; void merge_seq_sink_lt(int a) {
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+;     {
+;         int b = (int)&b;
+;     }
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; }
+; void merge_seq_par_use(int a) {
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+;     int b = a + 1;
+; #pragma omp parallel
+;     {
+;         use(a);
+;         use(b);
+;     }
+; }
+; void merge_cancellable_regions(int cancel1, int cancel2)
+; {
+; #pragma omp parallel
+;     {
+;         if(cancel1) {
+; #pragma omp cancel parallel
+;         }
+;     }
+; #pragma omp parallel
+;     {
+;         if (cancel2) {
+; #pragma omp cancel parallel
+;         }
+;     }
+; }
+; void merge_cancellable_regions_seq(int cancel1, int cancel2)
+; {
+; #pragma omp parallel
+;     {
+;         if(cancel1) {
+; #pragma omp cancel parallel
+;         }
+;     }
+;     cancel2 = !cancel1;
+; #pragma omp parallel
+;     {
+;         if (cancel2) {
+; #pragma omp cancel parallel
+;         }
+;     }
+; }
+; void merge_3(int a) {
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; }
+; void merge_3_seq(int a, int b) {
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+;     b = a + 1;
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+;     b = b + a;
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+;     use(b);
+; }
+; void unmergable_3_seq_call(int a) {
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+;     foo();
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+;     foo();
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; }
+; void unmergable_3_proc_bind(int a) {
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; #pragma omp parallel proc_bind(close)
+;     {
+;         use(a);
+;     }
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; }
+; void unmergable_3_num_threads(int a) {
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; #pragma omp parallel num_threads(a)
+;     {
+;         use(a);
+;     }
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; }
+; void merge_2_unmergable_1(int a) {
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+;     foo();
+; #pragma omp parallel
+;     {
+;         use(a);
+;     }
+; }
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 
 %struct.ident_t = type { i32, i32, i32, i32, i8* }
@@ -9,219 +234,583 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
 @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
 @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
 
-;   void merge_all() {
-;       int a = 1;
-;   #pragma omp parallel
-;       {
-;           a = 2;
-;       }
-;   #pragma omp parallel
-;       {
-;           a = 3;
-;       }
-;   }
-;
-; Merge all parallel regions.
-define dso_local void @merge_all() local_unnamed_addr  {
-  %1 = alloca i32, align 4
-  %2 = bitcast i32* %1 to i8*
-  store i32 1, i32* %1, align 4
-  %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
-  %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par.1 to void (i32*, i32*, ...)*), i32* nonnull %1)
+define dso_local void @merge(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
   ret void
 }
 
-define internal void @merge_all..omp_par.1(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
-  store i32 3, i32* %2, align 4
+define internal void @.omp_outlined.(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
   ret void
 }
 
-define internal void @merge_all..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
-  store i32 2, i32* %2, align 4
+declare dso_local void @use(i32) local_unnamed_addr
+
+declare !callback !1 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) local_unnamed_addr
+
+define internal void @.omp_outlined..1(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
   ret void
 }
 
+define dso_local void @unmergable_proc_bind(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  store i32 %a, i32* %a.addr, align 4
+  call void @__kmpc_push_proc_bind(%struct.ident_t* nonnull @1, i32 %0, i32 3)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  ret void
+}
+
+define internal void @.omp_outlined..2(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
 
 declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr
 
-declare !callback !1 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) local_unnamed_addr
+declare void @__kmpc_push_proc_bind(%struct.ident_t*, i32, i32) local_unnamed_addr
 
-;   void merge_none() {
-;       int a = 1;
-;   #pragma omp parallel
-;       {
-;           a = 2;
-;       }
-;       a = 3;
-;   #pragma omp parallel
-;       {
-;           a = 4;
-;       }
-;   }
-;
-; Does not merge parallel regions, in-between store
-; instruction is unsafe to execute in parallel.
-define dso_local void @merge_none() local_unnamed_addr  {
-  %1 = alloca i32, align 4
-  %2 = bitcast i32* %1 to i8*
-  store i32 1, i32* %1, align 4
-  %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
-  store i32 3, i32* %1, align 4
-  %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par.2 to void (i32*, i32*, ...)*), i32* nonnull %1)
+define internal void @.omp_outlined..3(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
   ret void
 }
 
-define internal void @merge_none..omp_par.2(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
-  store i32 4, i32* %2, align 4
+define dso_local void @unmergable_num_threads(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  store i32 %a, i32* %a.addr, align 4
+  call void @__kmpc_push_num_threads(%struct.ident_t* nonnull @1, i32 %0, i32 %a)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..4 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..5 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
   ret void
 }
 
-define internal void @merge_none..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
-  store i32 2, i32* %2, align 4
+define internal void @.omp_outlined..4(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
   ret void
 }
 
-;   void merge_some() {
-;       int a = 1;
-;   #pragma omp parallel
-;       {
-;           a = 2;
-;       }
-;       a = 3;
-;   #pragma omp parallel
-;       {
-;           a = 4;
-;       }
-;   #pragma omp parallel
-;       {
-;           a = 5;
-;       }
-;   }
-;
-; Do not merge first parallel region, due to the
-; unsafe store, but merge the two next parallel
-; regions.
-define dso_local void @merge_some() local_unnamed_addr  {
-  %1 = alloca i32, align 4
-  %2 = bitcast i32* %1 to i8*
-  store i32 1, i32* %1, align 4
-  %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
-  store i32 3, i32* %1, align 4
-  %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.3 to void (i32*, i32*, ...)*), i32* nonnull %1)
-  %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.4 to void (i32*, i32*, ...)*), i32* nonnull %1)
+declare void @__kmpc_push_num_threads(%struct.ident_t*, i32, i32) local_unnamed_addr
+
+define internal void @.omp_outlined..5(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
   ret void
 }
 
-define internal void @merge_some..omp_par.4(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
-  store i32 5, i32* %2, align 4
+define dso_local void @unmergable_seq_call(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..6 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void (...) @foo()
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..7 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
   ret void
 }
 
-define internal void @merge_some..omp_par.3(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
-  store i32 4, i32* %2, align 4
+define internal void @.omp_outlined..6(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
   ret void
 }
 
-define internal void @merge_some..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2)  {
-  store i32 2, i32* %2, align 4
+declare dso_local void @foo(...) local_unnamed_addr
+
+define internal void @.omp_outlined..7(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
   ret void
 }
 
-;   void merge_cancellable_regions(int cancel1, int cancel2)
-;   {
-;   #pragma omp parallel
-;       {
-;           if(cancel1) {
-;   #pragma omp cancel parallel
-;           }
-;       }
-;   #pragma omp parallel
-;       {
-;           if (cancel2) {
-;   #pragma omp cancel parallel
-;           }
-;       }
-;   }
-;
-; Merge correctly cancellable regions.
-define dso_local void @merge_cancellable_regions(i32 %0, i32 %1) local_unnamed_addr  {
-  %3 = alloca i32, align 4
-  %4 = alloca i32, align 4
-  store i32 %0, i32* %3, align 4
-  store i32 %1, i32* %4, align 4
-  %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_cancellable_regions..omp_par to void (i32*, i32*, ...)*), i32* nonnull %3)
-  %6 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_cancellable_regions..omp_par.5 to void (i32*, i32*, ...)*), i32* nonnull %4)
+define dso_local void @merge_seq(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..8 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  %0 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %0, 1
+  store i32 %add, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..9 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  %1 = load i32, i32* %a.addr, align 4
+  call void @use(i32 %1)
+  ret void
+}
+
+define internal void @.omp_outlined..8(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..9(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define dso_local void @merge_seq_float(float %f, float* nocapture %p) local_unnamed_addr  {
+entry:
+  %f.addr = alloca float, align 4
+  store float %f, float* %f.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, float*)* @.omp_outlined..10 to void (i32*, i32*, ...)*), float* nonnull %f.addr)
+  %0 = load float, float* %f.addr, align 4
+  %add = fadd float %0, 0x40091EB860000000
+  store float %add, float* %p, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, float*)* @.omp_outlined..11 to void (i32*, i32*, ...)*), float* nonnull %f.addr)
+  ret void
+}
+
+define internal void @.omp_outlined..10(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., float* nocapture nonnull readonly align 4 dereferenceable(4) %f)  {
+entry:
+  %0 = load float, float* %f, align 4
+  %conv = fptosi float %0 to i32
+  call void @use(i32 %conv)
+  ret void
+}
+
+define internal void @.omp_outlined..11(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., float* nocapture nonnull readonly align 4 dereferenceable(4) %f)  {
+entry:
+  %0 = load float, float* %f, align 4
+  %conv = fptosi float %0 to i32
+  call void @use(i32 %conv)
   ret void
 }
 
-define internal void @merge_cancellable_regions..omp_par.5(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture readonly %2)  {
-  %4 = load i32, i32* %2, align 4
-  %5 = icmp eq i32 %4, 0
-  br i1 %5, label %6, label %7
+define dso_local void @merge_seq_firstprivate(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..12 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  %0 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %0, 1
+  store i32 %add, i32* %a.addr, align 4
+  %a.casted.sroa.0.0.insert.ext = zext i32 %add to i64
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64)* @.omp_outlined..13 to void (i32*, i32*, ...)*), i64 %a.casted.sroa.0.0.insert.ext)
+  %1 = load i32, i32* %a.addr, align 4
+  call void @use(i32 %1)
+  ret void
+}
+
+define internal void @.omp_outlined..12(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..13(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i64 %a)  {
+entry:
+  %a.addr.sroa.0.0.extract.trunc = trunc i64 %a to i32
+  call void @use(i32 %a.addr.sroa.0.0.extract.trunc)
+  ret void
+}
+
+define dso_local void @merge_seq_sink_lt(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  %b = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..14 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  %0 = bitcast i32* %b to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0)
+  %1 = ptrtoint i32* %b to i64
+  %2 = trunc i64 %1 to i32
+  store i32 %2, i32* %b, align 4
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..15 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  ret void
+}
+
+define internal void @.omp_outlined..14(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
 
-6:                                                ; preds = %3
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture)
+
+define internal void @.omp_outlined..15(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
   ret void
+}
 
-7:                                                ; preds = %3
-  %8 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  %9 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %8, i32 1)
+define dso_local void @merge_seq_par_use(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  %b = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..16 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  %0 = bitcast i32* %b to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0)
+  %1 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %1, 1
+  store i32 %add, i32* %b, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @.omp_outlined..17 to void (i32*, i32*, ...)*), i32* nonnull %a.addr, i32* nonnull %b)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0)
   ret void
 }
 
-define internal void @merge_cancellable_regions..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture readonly %2)  {
-  %4 = load i32, i32* %2, align 4
-  %5 = icmp eq i32 %4, 0
-  br i1 %5, label %6, label %7
+define internal void @.omp_outlined..16(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
 
-6:                                                ; preds = %3
+define internal void @.omp_outlined..17(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a, i32* nocapture nonnull readonly align 4 dereferenceable(4) %b)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  %1 = load i32, i32* %b, align 4
+  call void @use(i32 %1)
   ret void
+}
 
-7:                                                ; preds = %3
-  %8 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  %9 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %8, i32 1)
+define dso_local void @merge_cancellable_regions(i32 %cancel1, i32 %cancel2) local_unnamed_addr  {
+entry:
+  %cancel1.addr = alloca i32, align 4
+  %cancel2.addr = alloca i32, align 4
+  store i32 %cancel1, i32* %cancel1.addr, align 4
+  store i32 %cancel2, i32* %cancel2.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..18 to void (i32*, i32*, ...)*), i32* nonnull %cancel1.addr)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..19 to void (i32*, i32*, ...)*), i32* nonnull %cancel2.addr)
+  ret void
+}
+
+define internal void @.omp_outlined..18(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %cancel1)  {
+entry:
+  %0 = load i32, i32* %cancel1, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %1 = load i32, i32* %.global_tid., align 4
+  %2 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %1, i32 1)
+  ret void
+
+if.end:                                           ; preds = %entry
   ret void
 }
 
 declare i32 @__kmpc_cancel(%struct.ident_t*, i32, i32) local_unnamed_addr
 
+define internal void @.omp_outlined..19(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %cancel2)  {
+entry:
+  %0 = load i32, i32* %cancel2, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %1 = load i32, i32* %.global_tid., align 4
+  %2 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %1, i32 1)
+  ret void
+
+if.end:                                           ; preds = %entry
+  ret void
+}
+
+define dso_local void @merge_cancellable_regions_seq(i32 %cancel1, i32 %cancel2) local_unnamed_addr  {
+entry:
+  %cancel1.addr = alloca i32, align 4
+  %cancel2.addr = alloca i32, align 4
+  store i32 %cancel1, i32* %cancel1.addr, align 4
+  store i32 %cancel2, i32* %cancel2.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..20 to void (i32*, i32*, ...)*), i32* nonnull %cancel1.addr)
+  %0 = load i32, i32* %cancel1.addr, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  %lnot.ext = zext i1 %tobool.not to i32
+  store i32 %lnot.ext, i32* %cancel2.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..21 to void (i32*, i32*, ...)*), i32* nonnull %cancel2.addr)
+  ret void
+}
+
+define internal void @.omp_outlined..20(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %cancel1)  {
+entry:
+  %0 = load i32, i32* %cancel1, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %1 = load i32, i32* %.global_tid., align 4
+  %2 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %1, i32 1)
+  ret void
+
+if.end:                                           ; preds = %entry
+  ret void
+}
+
+define internal void @.omp_outlined..21(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %cancel2)  {
+entry:
+  %0 = load i32, i32* %cancel2, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %1 = load i32, i32* %.global_tid., align 4
+  %2 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %1, i32 1)
+  ret void
+
+if.end:                                           ; preds = %entry
+  ret void
+}
+
+define dso_local void @merge_3(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..22 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..23 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..24 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  ret void
+}
+
+define internal void @.omp_outlined..22(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..23(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..24(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define dso_local void @merge_3_seq(i32 %a, i32 %b) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..25 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  %0 = load i32, i32* %a.addr, align 4
+  %add = add nsw i32 %0, 1
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..26 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  %1 = load i32, i32* %a.addr, align 4
+  %add1 = add nsw i32 %add, %1
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..27 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void @use(i32 %add1)
+  ret void
+}
+
+define internal void @.omp_outlined..25(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..26(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..27(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define dso_local void @unmergable_3_seq_call(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..28 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void (...) @foo()
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..29 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void (...) @foo()
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..30 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  ret void
+}
+
+define internal void @.omp_outlined..28(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..29(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..30(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define dso_local void @unmergable_3_proc_bind(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  store i32 %a, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..31 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void @__kmpc_push_proc_bind(%struct.ident_t* nonnull @1, i32 %0, i32 3)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..32 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..33 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  ret void
+}
+
+define internal void @.omp_outlined..31(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..32(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..33(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define dso_local void @unmergable_3_num_threads(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+  store i32 %a, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..34 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  %1 = load i32, i32* %a.addr, align 4
+  call void @__kmpc_push_num_threads(%struct.ident_t* nonnull @1, i32 %0, i32 %1)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..35 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..36 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  ret void
+}
+
+define internal void @.omp_outlined..34(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..35(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..36(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define dso_local void @merge_2_unmergable_1(i32 %a) local_unnamed_addr  {
+entry:
+  %a.addr = alloca i32, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..37 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..38 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  call void (...) @foo()
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..39 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+  ret void
+}
+
+define internal void @.omp_outlined..37(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..38(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
+define internal void @.omp_outlined..39(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a)  {
+entry:
+  %0 = load i32, i32* %a, align 4
+  call void @use(i32 %0)
+  ret void
+}
+
 
 !llvm.module.flags = !{!0}
 
 !0 = !{i32 1, !"wchar_size", i32 4}
 !1 = !{!2}
 !2 = !{i64 2, i64 -1, i64 -1, i1 true}
-; CHECK-LABEL: define {{[^@]+}}@merge_all() local_unnamed_addr {
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1:@.*]])
-; CHECK-NEXT:    [[TMP2:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 1, i32* [[TMP2]], align 4
-; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-LABEL: define {{[^@]+}}@merge
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1:@.*]])
 ; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
 ; CHECK:       omp_parallel:
-; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par.2 to void (i32*, i32*, ...)*), i32* [[TMP2]])
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]])
 ; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
 ; CHECK:       omp.par.outlined.exit:
 ; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
 ; CHECK:       omp.par.exit.split:
-; CHECK-NEXT:    br label [[DOTSPLIT_SPLIT:%.*]]
-; CHECK:       .split.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par.2
-; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]]) [[ATTR0:#.*]] {
+; CHECK-LABEL: define {{[^@]+}}@merge..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[A_ADDR:%.*]]) [[ATTR0:#.*]] {
 ; CHECK-NEXT:  omp.par.entry:
 ; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TID_ADDR]], align 4
-; CHECK-NEXT:    store i32 [[TMP1]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
 ; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
 ; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
 ; CHECK:       omp.par.outlined.exit.exitStub:
@@ -229,12 +818,12 @@ declare i32 @__kmpc_cancel(%struct.ident_t*, i32, i32) local_unnamed_addr
 ; CHECK:       omp.par.region:
 ; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
 ; CHECK:       omp.par.merged:
-; CHECK-NEXT:    call void @merge_all..omp_par(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT:    call void @.omp_outlined.(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
 ; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
 ; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM]])
-; CHECK-NEXT:    call void @merge_all..omp_par.1(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
-; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
-; CHECK:       .split:
+; CHECK-NEXT:    call void @.omp_outlined..1(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    br label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split:
 ; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
 ; CHECK:       omp.par.region.split:
 ; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
@@ -242,65 +831,212 @@ declare i32 @__kmpc_cancel(%struct.ident_t*, i32, i32) local_unnamed_addr
 ; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par.1
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1:#.*]] {
-; CHECK-NEXT:    store i32 3, i32* [[TMP2]], align 4
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined.
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..1
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@unmergable_proc_bind
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    call void @__kmpc_push_proc_bind(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 [[TMP0]], i32 noundef 3)
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..2
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..3
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@unmergable_num_threads
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    call void @__kmpc_push_num_threads(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 [[TMP0]], i32 [[A]])
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..4 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..5 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..4
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
-; CHECK-NEXT:    store i32 2, i32* [[TMP2]], align 4
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..5
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_none() local_unnamed_addr {
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
-; CHECK-NEXT:    [[TMP2:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 1, i32* [[TMP2]], align 4
-; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
-; CHECK-NEXT:    store i32 3, i32* [[TMP2]], align 4
-; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par.2 to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
+; CHECK-LABEL: define {{[^@]+}}@unmergable_seq_call
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..6 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    call void (...) @foo()
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..7 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_none..omp_par.2
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
-; CHECK-NEXT:    store i32 4, i32* [[TMP2]], align 4
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..6
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_none..omp_par
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
-; CHECK-NEXT:    store i32 2, i32* [[TMP2]], align 4
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..7
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_some() local_unnamed_addr {
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
-; CHECK-NEXT:    [[TMP2:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 1, i32* [[TMP2]], align 4
-; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
-; CHECK-NEXT:    store i32 3, i32* [[TMP2]], align 4
+; CHECK-LABEL: define {{[^@]+}}@merge_seq
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_seq..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]])
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK:       omp.par.outlined.exit:
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_seq..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[A_ADDR:%.*]]) [[ATTR0]] {
+; CHECK-NEXT:  omp.par.entry:
+; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
+; CHECK:       omp.par.outlined.exit.exitStub:
+; CHECK-NEXT:    ret void
+; CHECK:       omp.par.region:
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
+; CHECK:       omp.par.merged:
+; CHECK-NEXT:    call void @.omp_outlined..8(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
 ; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]]
+; CHECK:       omp_region.end:
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]])
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split.split:
+; CHECK-NEXT:    call void @.omp_outlined..9(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    br label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK:       omp.par.region.split:
+; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK:       omp.par.pre_finalize:
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+; CHECK:       omp_region.body:
+; CHECK-NEXT:    br label [[SEQ_PAR_MERGED:%.*]]
+; CHECK:       seq.par.merged:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP3]], 1
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split:
+; CHECK-NEXT:    br label [[OMP_REGION_BODY_SPLIT:%.*]]
+; CHECK:       omp_region.body.split:
+; CHECK-NEXT:    call void @__kmpc_end_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    br label [[OMP_REGION_END]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..8
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..9
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_seq_float
+; CHECK-SAME: (float [[F:%.*]], float* nocapture writeonly [[P:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[F_ADDR:%.*]] = alloca float, align 4
+; CHECK-NEXT:    store float [[F]], float* [[F_ADDR]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
 ; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
 ; CHECK:       omp_parallel:
-; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.5 to void (i32*, i32*, ...)*), i32* [[TMP2]])
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, float*, float*)* @merge_seq_float..omp_par to void (i32*, i32*, ...)*), float* [[F_ADDR]], float* [[P]])
 ; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
 ; CHECK:       omp.par.outlined.exit:
 ; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
 ; CHECK:       omp.par.exit.split:
-; CHECK-NEXT:    br label [[DOTSPLIT_SPLIT:%.*]]
-; CHECK:       .split.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.5
-; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]]) [[ATTR0]] {
+; CHECK-LABEL: define {{[^@]+}}@merge_seq_float..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], float* [[F_ADDR:%.*]], float* [[P:%.*]]) [[ATTR0]] {
 ; CHECK-NEXT:  omp.par.entry:
 ; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TID_ADDR]], align 4
-; CHECK-NEXT:    store i32 [[TMP1]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
 ; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
 ; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
 ; CHECK:       omp.par.outlined.exit.exitStub:
@@ -308,63 +1044,345 @@ declare i32 @__kmpc_cancel(%struct.ident_t*, i32, i32) local_unnamed_addr
 ; CHECK:       omp.par.region:
 ; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
 ; CHECK:       omp.par.merged:
-; CHECK-NEXT:    call void @merge_some..omp_par.3(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT:    call void @.omp_outlined..10(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], float* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[F_ADDR]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
 ; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
-; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
-; CHECK-NEXT:    call void @merge_some..omp_par.4(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
-; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
-; CHECK:       .split:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]]
+; CHECK:       omp_region.end:
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]])
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split.split:
+; CHECK-NEXT:    call void @.omp_outlined..11(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], float* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[F_ADDR]])
+; CHECK-NEXT:    br label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split:
 ; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
 ; CHECK:       omp.par.region.split:
 ; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
 ; CHECK:       omp.par.pre_finalize:
 ; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+; CHECK:       omp_region.body:
+; CHECK-NEXT:    br label [[SEQ_PAR_MERGED:%.*]]
+; CHECK:       seq.par.merged:
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[F_ADDR]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP3]], 0x40091EB860000000
+; CHECK-NEXT:    store float [[ADD]], float* [[P]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split:
+; CHECK-NEXT:    br label [[OMP_REGION_BODY_SPLIT:%.*]]
+; CHECK:       omp_region.body.split:
+; CHECK-NEXT:    call void @__kmpc_end_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    br label [[OMP_REGION_END]]
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.4
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
-; CHECK-NEXT:    store i32 5, i32* [[TMP2]], align 4
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..10
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], float* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[F:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[F]], align 4
+; CHECK-NEXT:    [[CONV:%.*]] = fptosi float [[TMP0]] to i32
+; CHECK-NEXT:    call void @use(i32 [[CONV]])
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.3
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
-; CHECK-NEXT:    store i32 4, i32* [[TMP2]], align 4
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..11
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], float* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[F:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[F]], align 4
+; CHECK-NEXT:    [[CONV:%.*]] = fptosi float [[TMP0]] to i32
+; CHECK-NEXT:    call void @use(i32 [[CONV]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_seq_firstprivate
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_ALLOC:%.*]] = alloca i64, align 8
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i64*)* @merge_seq_firstprivate..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]], i64* [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_ALLOC]])
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK:       omp.par.outlined.exit:
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
-; CHECK-NEXT:    store i32 2, i32* [[TMP2]], align 4
+; CHECK-LABEL: define {{[^@]+}}@merge_seq_firstprivate..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[A_ADDR:%.*]], i64* [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_ALLOC:%.*]]) [[ATTR0]] {
+; CHECK-NEXT:  omp.par.entry:
+; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
+; CHECK:       omp.par.outlined.exit.exitStub:
+; CHECK-NEXT:    ret void
+; CHECK:       omp.par.region:
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
+; CHECK:       omp.par.merged:
+; CHECK-NEXT:    call void @.omp_outlined..12(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]]
+; CHECK:       omp_region.end:
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]])
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split.split:
+; CHECK-NEXT:    [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_LOAD:%.*]] = load i64, i64* [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_ALLOC]], align 8
+; CHECK-NEXT:    call void @.omp_outlined..13(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i64 [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_LOAD]])
+; CHECK-NEXT:    br label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK:       omp.par.region.split:
+; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK:       omp.par.pre_finalize:
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+; CHECK:       omp_region.body:
+; CHECK-NEXT:    br label [[SEQ_PAR_MERGED:%.*]]
+; CHECK:       seq.par.merged:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP3]], 1
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[A_CASTED_SROA_0_0_INSERT_EXT:%.*]] = zext i32 [[ADD]] to i64
+; CHECK-NEXT:    store i64 [[A_CASTED_SROA_0_0_INSERT_EXT]], i64* [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_ALLOC]], align 8
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split:
+; CHECK-NEXT:    br label [[OMP_REGION_BODY_SPLIT:%.*]]
+; CHECK:       omp_region.body.split:
+; CHECK-NEXT:    call void @__kmpc_end_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    br label [[OMP_REGION_END]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..12
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..13
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i64 [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A]] to i32
+; CHECK-NEXT:    call void @use(i32 [[A_ADDR_SROA_0_0_EXTRACT_TRUNC]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_seq_sink_lt
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_seq_sink_lt..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]])
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK:       omp.par.outlined.exit:
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_seq_sink_lt..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[A_ADDR:%.*]]) [[ATTR0]] {
+; CHECK-NEXT:  omp.par.entry:
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
+; CHECK:       omp.par.outlined.exit.exitStub:
+; CHECK-NEXT:    ret void
+; CHECK:       omp.par.region:
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
+; CHECK:       omp.par.merged:
+; CHECK-NEXT:    call void @.omp_outlined..14(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]]
+; CHECK:       omp_region.end:
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]])
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split.split:
+; CHECK-NEXT:    call void @.omp_outlined..15(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    br label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK:       omp.par.region.split:
+; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK:       omp.par.pre_finalize:
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+; CHECK:       omp_region.body:
+; CHECK-NEXT:    br label [[SEQ_PAR_MERGED:%.*]]
+; CHECK:       seq.par.merged:
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[B]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* noundef nonnull [[TMP3]])
+; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint i32* [[B]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32
+; CHECK-NEXT:    store i32 [[TMP5]], i32* [[B]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 noundef 4, i8* noundef nonnull [[TMP3]])
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split:
+; CHECK-NEXT:    br label [[OMP_REGION_BODY_SPLIT:%.*]]
+; CHECK:       omp_region.body.split:
+; CHECK-NEXT:    call void @__kmpc_end_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    br label [[OMP_REGION_END]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..14
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..15
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_seq_par_use
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    [[LT_CAST3:%.*]] = bitcast i32* [[B]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST3]])
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @merge_seq_par_use..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]], i32* [[B]])
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK:       omp.par.outlined.exit:
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[B]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 noundef 4, i8* noundef nonnull [[LT_CAST]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_seq_par_use..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[A_ADDR:%.*]], i32* [[B:%.*]]) [[ATTR0]] {
+; CHECK-NEXT:  omp.par.entry:
+; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
+; CHECK:       omp.par.outlined.exit.exitStub:
+; CHECK-NEXT:    ret void
+; CHECK:       omp.par.region:
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
+; CHECK:       omp.par.merged:
+; CHECK-NEXT:    call void @.omp_outlined..16(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]]
+; CHECK:       omp_region.end:
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]])
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split.split:
+; CHECK-NEXT:    call void @.omp_outlined..17(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[B]])
+; CHECK-NEXT:    br label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK:       omp.par.region.split:
+; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK:       omp.par.pre_finalize:
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+; CHECK:       omp_region.body:
+; CHECK-NEXT:    br label [[SEQ_PAR_MERGED:%.*]]
+; CHECK:       seq.par.merged:
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[B]] to i8*
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP4]], 1
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[B]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split:
+; CHECK-NEXT:    br label [[OMP_REGION_BODY_SPLIT:%.*]]
+; CHECK:       omp_region.body.split:
+; CHECK-NEXT:    call void @__kmpc_end_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    br label [[OMP_REGION_END]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..16
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..17
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[B:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[B]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP1]])
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions
-; CHECK-SAME: (i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) local_unnamed_addr {
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
-; CHECK-NEXT:    [[TMP4:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TMP4]], align 4
-; CHECK-NEXT:    store i32 [[TMP1]], i32* [[TMP5]], align 4
+; CHECK-SAME: (i32 [[CANCEL1:%.*]], i32 [[CANCEL2:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CANCEL1_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[CANCEL2_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[CANCEL1]], i32* [[CANCEL1_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[CANCEL2]], i32* [[CANCEL2_ADDR]], align 4
 ; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
 ; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
 ; CHECK:       omp_parallel:
-; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @merge_cancellable_regions..omp_par.6 to void (i32*, i32*, ...)*), i32* [[TMP4]], i32* [[TMP5]])
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @merge_cancellable_regions..omp_par to void (i32*, i32*, ...)*), i32* [[CANCEL1_ADDR]], i32* [[CANCEL2_ADDR]])
 ; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
 ; CHECK:       omp.par.outlined.exit:
 ; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
 ; CHECK:       omp.par.exit.split:
-; CHECK-NEXT:    br label [[DOTSPLIT_SPLIT:%.*]]
-; CHECK:       .split.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par.6
-; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]], i32* [[TMP1:%.*]]) [[ATTR0]] {
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[CANCEL1_ADDR:%.*]], i32* [[CANCEL2_ADDR:%.*]]) [[ATTR0]] {
 ; CHECK-NEXT:  omp.par.entry:
 ; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TID_ADDR]], align 4
-; CHECK-NEXT:    store i32 [[TMP2]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
 ; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
 ; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
 ; CHECK:       omp.par.outlined.exit.exitStub:
@@ -372,12 +1390,12 @@ declare i32 @__kmpc_cancel(%struct.ident_t*, i32, i32) local_unnamed_addr
 ; CHECK:       omp.par.region:
 ; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
 ; CHECK:       omp.par.merged:
-; CHECK-NEXT:    call void @merge_cancellable_regions..omp_par(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT:    call void @.omp_outlined..18(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL1_ADDR]])
 ; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
 ; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
-; CHECK-NEXT:    call void @merge_cancellable_regions..omp_par.5(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP1]])
-; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
-; CHECK:       .split:
+; CHECK-NEXT:    call void @.omp_outlined..19(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL2_ADDR]])
+; CHECK-NEXT:    br label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split:
 ; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
 ; CHECK:       omp.par.region.split:
 ; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
@@ -385,28 +1403,497 @@ declare i32 @__kmpc_cancel(%struct.ident_t*, i32, i32) local_unnamed_addr
 ; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par.5
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP2:%.*]]) {
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
-; CHECK-NEXT:    br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
-; CHECK:       6:
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..18
+; CHECK-SAME: (i32* noalias nocapture readonly [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL1:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[CANCEL1]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP1]], i32 noundef 1)
 ; CHECK-NEXT:    ret void
-; CHECK:       7:
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull [[GLOB1]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP8]], i32 noundef 1)
+; CHECK:       if.end:
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP2:%.*]]) {
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
-; CHECK-NEXT:    br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
-; CHECK:       6:
-; CHECK-NEXT:    ret void
-; CHECK:       7:
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull [[GLOB1]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP8]], i32 noundef 1)
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..19
+; CHECK-SAME: (i32* noalias nocapture readonly [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL2:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[CANCEL2]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP1]], i32 noundef 1)
+; CHECK-NEXT:    ret void
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions_seq
+; CHECK-SAME: (i32 [[CANCEL1:%.*]], i32 [[CANCEL2:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CANCEL1_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[CANCEL2_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[CANCEL1]], i32* [[CANCEL1_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[CANCEL2]], i32* [[CANCEL2_ADDR]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @merge_cancellable_regions_seq..omp_par to void (i32*, i32*, ...)*), i32* [[CANCEL1_ADDR]], i32* [[CANCEL2_ADDR]])
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK:       omp.par.outlined.exit:
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions_seq..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[CANCEL1_ADDR:%.*]], i32* [[CANCEL2_ADDR:%.*]]) [[ATTR0]] {
+; CHECK-NEXT:  omp.par.entry:
+; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
+; CHECK:       omp.par.outlined.exit.exitStub:
+; CHECK-NEXT:    ret void
+; CHECK:       omp.par.region:
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
+; CHECK:       omp.par.merged:
+; CHECK-NEXT:    call void @.omp_outlined..20(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL1_ADDR]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]]
+; CHECK:       omp_region.end:
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]])
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split.split:
+; CHECK-NEXT:    call void @.omp_outlined..21(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL2_ADDR]])
+; CHECK-NEXT:    br label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK:       omp.par.region.split:
+; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK:       omp.par.pre_finalize:
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+; CHECK:       omp_region.body:
+; CHECK-NEXT:    br label [[SEQ_PAR_MERGED:%.*]]
+; CHECK:       seq.par.merged:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[CANCEL1_ADDR]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP3]], 0
+; CHECK-NEXT:    [[LNOT_EXT:%.*]] = zext i1 [[TOBOOL_NOT]] to i32
+; CHECK-NEXT:    store i32 [[LNOT_EXT]], i32* [[CANCEL2_ADDR]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split:
+; CHECK-NEXT:    br label [[OMP_REGION_BODY_SPLIT:%.*]]
+; CHECK:       omp_region.body.split:
+; CHECK-NEXT:    call void @__kmpc_end_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    br label [[OMP_REGION_END]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..20
+; CHECK-SAME: (i32* noalias nocapture readonly [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL1:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[CANCEL1]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP1]], i32 noundef 1)
+; CHECK-NEXT:    ret void
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..21
+; CHECK-SAME: (i32* noalias nocapture readonly [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL2:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[CANCEL2]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP1]], i32 noundef 1)
+; CHECK-NEXT:    ret void
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_3
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_3..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]])
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK:       omp.par.outlined.exit:
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_3..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[A_ADDR:%.*]]) [[ATTR0]] {
+; CHECK-NEXT:  omp.par.entry:
+; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
+; CHECK:       omp.par.outlined.exit.exitStub:
+; CHECK-NEXT:    ret void
+; CHECK:       omp.par.region:
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
+; CHECK:       omp.par.merged:
+; CHECK-NEXT:    call void @.omp_outlined..22(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    call void @.omp_outlined..23(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]])
+; CHECK-NEXT:    call void @.omp_outlined..24(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    br label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK:       omp.par.region.split:
+; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK:       omp.par.pre_finalize:
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..22
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..23
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..24
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_3_seq
+; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADD1_SEQ_OUTPUT_ALLOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[ADD_SEQ_OUTPUT_ALLOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM7:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*, i32*)* @merge_3_seq..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]], i32* [[ADD_SEQ_OUTPUT_ALLOC]], i32* [[ADD1_SEQ_OUTPUT_ALLOC]])
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK:       omp.par.outlined.exit:
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
+; CHECK-NEXT:    [[ADD1_SEQ_OUTPUT_LOAD:%.*]] = load i32, i32* [[ADD1_SEQ_OUTPUT_ALLOC]], align 4
+; CHECK-NEXT:    call void @use(i32 [[ADD1_SEQ_OUTPUT_LOAD]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_3_seq..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[A_ADDR:%.*]], i32* [[ADD_SEQ_OUTPUT_ALLOC:%.*]], i32* [[ADD1_SEQ_OUTPUT_ALLOC:%.*]]) [[ATTR0]] {
+; CHECK-NEXT:  omp.par.entry:
+; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
+; CHECK:       omp.par.outlined.exit.exitStub:
+; CHECK-NEXT:    ret void
+; CHECK:       omp.par.region:
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
+; CHECK:       omp.par.merged:
+; CHECK-NEXT:    call void @.omp_outlined..25(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]]
+; CHECK:       omp_region.end:
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]])
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split.split:
+; CHECK-NEXT:    call void @.omp_outlined..26(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM4]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]])
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
+; CHECK-NEXT:    br i1 [[TMP4]], label [[OMP_REGION_BODY5:%.*]], label [[OMP_REGION_END4:%.*]]
+; CHECK:       omp_region.end4:
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM6]])
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT_SPLIT_SPLIT_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split.split.split.split:
+; CHECK-NEXT:    call void @.omp_outlined..27(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    br label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK:       omp.par.region.split:
+; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK:       omp.par.pre_finalize:
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+; CHECK:       omp_region.body5:
+; CHECK-NEXT:    br label [[SEQ_PAR_MERGED2:%.*]]
+; CHECK:       seq.par.merged2:
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[ADD_SEQ_OUTPUT_LOAD:%.*]] = load i32, i32* [[ADD_SEQ_OUTPUT_ALLOC]], align 4
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[ADD_SEQ_OUTPUT_LOAD]], [[TMP5]]
+; CHECK-NEXT:    store i32 [[ADD1]], i32* [[ADD1_SEQ_OUTPUT_ALLOC]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT_SPLIT_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split.split.split:
+; CHECK-NEXT:    br label [[OMP_REGION_BODY5_SPLIT:%.*]]
+; CHECK:       omp_region.body5.split:
+; CHECK-NEXT:    call void @__kmpc_end_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]])
+; CHECK-NEXT:    br label [[OMP_REGION_END4]]
+; CHECK:       omp_region.body:
+; CHECK-NEXT:    br label [[SEQ_PAR_MERGED:%.*]]
+; CHECK:       seq.par.merged:
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[ADD_SEQ_OUTPUT_ALLOC]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED_SPLIT:%.*]]
+; CHECK:       omp.par.merged.split:
+; CHECK-NEXT:    br label [[OMP_REGION_BODY_SPLIT:%.*]]
+; CHECK:       omp_region.body.split:
+; CHECK-NEXT:    call void @__kmpc_end_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    br label [[OMP_REGION_END]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..25
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..26
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..27
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@unmergable_3_seq_call
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..28 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    call void (...) @foo()
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..29 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    call void (...) @foo()
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..30 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..28
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..29
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..30
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@unmergable_3_proc_bind
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..31 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    call void @__kmpc_push_proc_bind(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 [[TMP0]], i32 noundef 3)
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..32 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..33 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..31
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..32
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..33
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@unmergable_3_num_threads
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..34 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    call void @__kmpc_push_num_threads(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 [[TMP0]], i32 [[TMP1]])
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..35 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..36 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..34
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..35
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..36
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_2_unmergable_1
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    br label [[OMP_PARALLEL:%.*]]
+; CHECK:       omp_parallel:
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_2_unmergable_1..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]])
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK:       omp.par.outlined.exit:
+; CHECK-NEXT:    br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK:       omp.par.exit.split:
+; CHECK-NEXT:    br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
+; CHECK-NEXT:    call void (...) @foo()
+; CHECK-NEXT:    call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..39 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_2_unmergable_1..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[A_ADDR:%.*]]) [[ATTR0]] {
+; CHECK-NEXT:  omp.par.entry:
+; CHECK-NEXT:    [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT:    br label [[OMP_PAR_REGION:%.*]]
+; CHECK:       omp.par.outlined.exit.exitStub:
+; CHECK-NEXT:    ret void
+; CHECK:       omp.par.region:
+; CHECK-NEXT:    br label [[OMP_PAR_MERGED:%.*]]
+; CHECK:       omp.par.merged:
+; CHECK-NEXT:    call void @.omp_outlined..37(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT:    call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT:    call void @.omp_outlined..38(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT:    br label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK:       omp.par.region.split:
+; CHECK-NEXT:    br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK:       omp.par.pre_finalize:
+; CHECK-NEXT:    br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..37
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..38
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..39
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    call void @use(i32 [[TMP0]])
 ; CHECK-NEXT:    ret void
 ;