[llvm-branch-commits] [llvm] 9751705 - [OpenMPOpt][WIP] Expand parallel region merging
Giorgis Georgakoudis via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Jan 11 08:11:16 PST 2021
Author: Giorgis Georgakoudis
Date: 2021-01-11T08:06:23-08:00
New Revision: 975170551283559ebe5052b6f83b2cc9e50132db
URL: https://github.com/llvm/llvm-project/commit/975170551283559ebe5052b6f83b2cc9e50132db
DIFF: https://github.com/llvm/llvm-project/commit/975170551283559ebe5052b6f83b2cc9e50132db.diff
LOG: [OpenMPOpt][WIP] Expand parallel region merging
The existing implementation of parallel region merging applies only to
consecutive parallel regions whose in-between sequential instructions
are speculatable. This patch lifts that limitation: regions separated
by arbitrary sequential instructions can now be merged, except when
those instructions include calls to unmergable OpenMP runtime
functions. The in-between sequential instructions are sequentialized
inside the merged region under a "master" construct, and any values
they produce are broadcast to the subsequent parallel regions and to
the sequential continuation of the merged region.
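For illustration, a conceptual sketch of the transformation (mirroring
the merge_seq test case added below rather than the exact generated
IR): two parallel regions separated by a sequential update of a shared
variable,

  #pragma omp parallel
  {
    use(a);
  }
  a = a + 1;   /* in-between sequential code */
  #pragma omp parallel
  {
    use(a);
  }

are merged into a single parallel region in which the update of "a"
executes under a master construct followed by a barrier, and the
updated value is broadcast to the other threads through a shared stack
slot (stored in the master region, reloaded at its uses).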
Reviewed By: jdoerfert
Differential Revision: https://reviews.llvm.org/D90909
Added:
Modified:
llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
llvm/lib/Transforms/IPO/OpenMPOpt.cpp
llvm/test/Transforms/OpenMP/parallel_region_merging.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 68df3f2271d98..8e95226d3895e 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -38,7 +38,10 @@ class OpenMPIRBuilder {
void initialize();
/// Finalize the underlying module, e.g., by outlining regions.
- void finalize();
+ /// \param AllowExtractorSinking Flag to include sinking instructions,
+ /// emitted by CodeExtractor, in the
+ /// outlined region. Default is false.
+ void finalize(bool AllowExtractorSinking = false);
/// Add attributes known for \p FnID to \p Fn.
void addAttributes(omp::RuntimeFunction FnID, Function &Fn);
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index d120eaa17fb77..fdedb6beac359 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -127,7 +127,7 @@ Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
void OpenMPIRBuilder::initialize() { initializeTypes(M); }
-void OpenMPIRBuilder::finalize() {
+void OpenMPIRBuilder::finalize(bool AllowExtractorSinking) {
SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
SmallVector<BasicBlock *, 32> Blocks;
for (OutlineInfo &OI : OutlineInfos) {
@@ -170,6 +170,25 @@ void OpenMPIRBuilder::finalize() {
BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
+ if (AllowExtractorSinking) {
+ // Move instructions from the to-be-deleted ArtificialEntry to the entry
+ // basic block of the parallel region. CodeExtractor may have sunk
+ // allocas/bitcasts for values that are solely used in the outlined
+ // region and do not escape.
+ assert(!ArtificialEntry.empty() &&
+ "Expected instructions to sink in the outlined region");
+ for (BasicBlock::iterator It = ArtificialEntry.begin(),
+ End = ArtificialEntry.end();
+ It != End;) {
+ Instruction &I = *It;
+ It++;
+
+ if (I.isTerminator())
+ continue;
+
+ I.moveBefore(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
+ }
+ }
OI.EntryBB->moveBefore(&ArtificialEntry);
ArtificialEntry.eraseFromParent();
}
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 389ed1ebad6d5..fc3cc90c6f808 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -28,6 +28,7 @@
#include "llvm/Transforms/IPO/Attributor.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
using namespace llvm;
using namespace omp;
@@ -317,13 +318,17 @@ struct OMPInformationCache : public InformationCache {
return NumUses;
}
+ // Helper function to recollect uses of a runtime function.
+ void recollectUsesForFunction(RuntimeFunction RTF) {
+ auto &RFI = RFIs[RTF];
+ RFI.clearUsesMap();
+ collectUses(RFI, /*CollectStats*/ false);
+ }
+
// Helper function to recollect uses of all runtime functions.
void recollectUses() {
- for (int Idx = 0; Idx < RFIs.size(); ++Idx) {
- auto &RFI = RFIs[static_cast<RuntimeFunction>(Idx)];
- RFI.clearUsesMap();
- collectUses(RFI, /*CollectStats*/ false);
- }
+ for (int Idx = 0; Idx < RFIs.size(); ++Idx)
+ recollectUsesForFunction(static_cast<RuntimeFunction>(Idx));
}
/// Helper to initialize all runtime function information for those defined
@@ -601,15 +606,11 @@ struct OpenMPOpt {
if (!RFI.Declaration)
return false;
- // Check if there any __kmpc_push_proc_bind calls for explicit affinities.
- OMPInformationCache::RuntimeFunctionInfo &ProcBindRFI =
- OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind];
-
- // Defensively abort if explicit affinities are set.
- // TODO: Track ICV proc_bind to merge when mergable regions have the same
- // affinity.
- if (ProcBindRFI.Declaration)
- return false;
+ // Unmergable calls that prevent merging a parallel region.
+ OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
+ OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],
+ OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],
+ };
bool Changed = false;
LoopInfo *LI = nullptr;
@@ -637,6 +638,90 @@ struct OpenMPOpt {
auto FiniCB = [&](InsertPointTy CodeGenIP) {};
+ /// Create a sequential execution region within a merged parallel region,
+ /// encapsulated in a master construct with a barrier for synchronization.
+ auto CreateSequentialRegion = [&](Function *OuterFn,
+ BasicBlock *OuterPredBB,
+ Instruction *SeqStartI,
+ Instruction *SeqEndI) {
+ // Isolate the instructions of the sequential region to a separate
+ // block.
+ BasicBlock *ParentBB = SeqStartI->getParent();
+ BasicBlock *SeqEndBB =
+ SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);
+ BasicBlock *SeqAfterBB =
+ SplitBlock(SeqEndBB, &*SeqEndBB->getFirstInsertionPt(), DT, LI);
+ BasicBlock *SeqStartBB =
+ SplitBlock(ParentBB, SeqStartI, DT, LI, nullptr, "seq.par.merged");
+
+ assert(ParentBB->getUniqueSuccessor() == SeqStartBB &&
+                 "Expected a different CFG");
+ const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();
+ ParentBB->getTerminator()->eraseFromParent();
+
+ auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
+ BasicBlock &ContinuationIP) {
+ BasicBlock *CGStartBB = CodeGenIP.getBlock();
+ BasicBlock *CGEndBB =
+ SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
+ assert(SeqStartBB != nullptr && "SeqStartBB should not be null");
+ CGStartBB->getTerminator()->setSuccessor(0, SeqStartBB);
+ assert(SeqEndBB != nullptr && "SeqEndBB should not be null");
+ SeqEndBB->getTerminator()->setSuccessor(0, CGEndBB);
+ };
+ auto FiniCB = [&](InsertPointTy CodeGenIP) {};
+
+ // Find outputs from the sequential region to outside users and
+ // broadcast their values to them.
+ for (Instruction &I : *SeqStartBB) {
+ SmallPtrSet<Instruction *, 4> OutsideUsers;
+ for (User *Usr : I.users()) {
+ Instruction &UsrI = *cast<Instruction>(Usr);
+          // Ignore uses in lifetime intrinsics; code extraction for the
+          // merged parallel region will fix them.
+ if (UsrI.isLifetimeStartOrEnd())
+ continue;
+
+ if (UsrI.getParent() != SeqStartBB)
+ OutsideUsers.insert(&UsrI);
+ }
+
+ if (OutsideUsers.empty())
+ continue;
+
+ // Emit an alloca in the outer region to store the broadcasted
+ // value.
+ const DataLayout &DL = M.getDataLayout();
+ AllocaInst *AllocaI = new AllocaInst(
+ I.getType(), DL.getAllocaAddrSpace(), nullptr,
+ I.getName() + ".seq.output.alloc", &OuterFn->front().front());
+
+ // Emit a store instruction in the sequential BB to update the
+ // value.
+ new StoreInst(&I, AllocaI, SeqStartBB->getTerminator());
+
+ // Emit a load instruction and replace the use of the output value
+ // with it.
+ for (Instruction *UsrI : OutsideUsers) {
+ LoadInst *LoadI = new LoadInst(I.getType(), AllocaI,
+ I.getName() + ".seq.output.load", UsrI);
+ UsrI->replaceUsesOfWith(&I, LoadI);
+ }
+ }
+
+ OpenMPIRBuilder::LocationDescription Loc(
+ InsertPointTy(ParentBB, ParentBB->end()), DL);
+ InsertPointTy SeqAfterIP =
+ OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB);
+
+ OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel);
+
+ BranchInst::Create(SeqAfterBB, SeqAfterIP.getBlock());
+
+ LLVM_DEBUG(dbgs() << TAG << "After sequential inlining " << *OuterFn
+ << "\n");
+ };
+
// Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all
// contained in BB and only separated by instructions that can be
// redundantly executed in parallel. The block BB is split before the first
@@ -682,6 +767,21 @@ struct OpenMPOpt {
const DebugLoc DL = BB->getTerminator()->getDebugLoc();
BB->getTerminator()->eraseFromParent();
+ // Create sequential regions for sequential instructions that are
+ // in-between mergable parallel regions.
+ for (auto *It = MergableCIs.begin(), *End = MergableCIs.end() - 1;
+ It != End; ++It) {
+ Instruction *ForkCI = *It;
+ Instruction *NextForkCI = *(It + 1);
+
+      // Continue if there are no in-between instructions.
+ if (ForkCI->getNextNode() == NextForkCI)
+ continue;
+
+ CreateSequentialRegion(OriginalFn, BB, ForkCI->getNextNode(),
+ NextForkCI->getPrevNode());
+ }
+
OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
DL);
IRBuilder<>::InsertPoint AllocaIP(
@@ -695,7 +795,7 @@ struct OpenMPOpt {
BranchInst::Create(AfterBB, AfterIP.getBlock());
// Perform the actual outlining.
- OMPInfoCache.OMPBuilder.finalize();
+ OMPInfoCache.OMPBuilder.finalize(/* AllowExtractorSinking */ true);
Function *OutlinedFn = MergableCIs.front()->getCaller();
@@ -782,16 +882,75 @@ struct OpenMPOpt {
BasicBlock *BB = It.getFirst();
SmallVector<CallInst *, 4> MergableCIs;
+ /// Returns true if the instruction is mergable, false otherwise.
+ /// A terminator instruction is unmergable by definition since merging
+ /// works within a BB. Instructions before the mergable region are
+ /// mergable if they are not calls to OpenMP runtime functions that may
+  /// set different execution parameters for subsequent parallel regions.
+ /// Instructions in-between parallel regions are mergable if they are not
+ /// calls to any non-intrinsic function since that may call a non-mergable
+ /// OpenMP runtime function.
+ auto IsMergable = [&](Instruction &I, bool IsBeforeMergableRegion) {
+ // We do not merge across BBs, hence return false (unmergable) if the
+ // instruction is a terminator.
+ if (I.isTerminator())
+ return false;
+
+ if (!isa<CallInst>(&I))
+ return true;
+
+ CallInst *CI = cast<CallInst>(&I);
+ if (IsBeforeMergableRegion) {
+ Function *CalledFunction = CI->getCalledFunction();
+ if (!CalledFunction)
+ return false;
+ // Return false (unmergable) if the call before the parallel
+ // region calls an explicit affinity (proc_bind) or number of
+ // threads (num_threads) compiler-generated function. Those settings
+ // may be incompatible with following parallel regions.
+ // TODO: ICV tracking to detect compatibility.
+ for (const auto &RFI : UnmergableCallsInfo) {
+ if (CalledFunction == RFI.Declaration)
+ return false;
+ }
+ } else {
+ // Return false (unmergable) if there is a call instruction
+ // in-between parallel regions when it is not an intrinsic. It
+ // may call an unmergable OpenMP runtime function in its callpath.
+ // TODO: Keep track of possible OpenMP calls in the callpath.
+ if (!isa<IntrinsicInst>(CI))
+ return false;
+ }
+
+ return true;
+ };
// Find maximal number of parallel region CIs that are safe to merge.
- for (Instruction &I : *BB) {
+ for (auto It = BB->begin(), End = BB->end(); It != End;) {
+ Instruction &I = *It;
+ ++It;
+
if (CIs.count(&I)) {
MergableCIs.push_back(cast<CallInst>(&I));
continue;
}
- if (isSafeToSpeculativelyExecute(&I, &I, DT))
+ // Continue expanding if the instruction is mergable.
+ if (IsMergable(I, MergableCIs.empty()))
continue;
+ // Forward the instruction iterator to skip the next parallel region
+ // since there is an unmergable instruction which can affect it.
+ for (; It != End; ++It) {
+ Instruction &SkipI = *It;
+ if (CIs.count(&SkipI)) {
+ LLVM_DEBUG(dbgs() << TAG << "Skip parallel region " << SkipI
+ << " due to " << I << "\n");
+ ++It;
+ break;
+ }
+ }
+
+ // Store mergable regions found.
if (MergableCIs.size() > 1) {
MergableCIsVector.push_back(MergableCIs);
LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size()
@@ -812,15 +971,12 @@ struct OpenMPOpt {
}
if (Changed) {
- // Update RFI info to set it up for later passes.
- RFI.clearUsesMap();
- OMPInfoCache.collectUses(RFI, /* CollectStats */ false);
-
- // Collect uses for the emitted barrier call.
- OMPInformationCache::RuntimeFunctionInfo &BarrierRFI =
- OMPInfoCache.RFIs[OMPRTL___kmpc_barrier];
- BarrierRFI.clearUsesMap();
- OMPInfoCache.collectUses(BarrierRFI, /* CollectStats */ false);
+      /// Re-collect uses for fork calls, emitted barrier calls, and
+ /// any emitted master/end_master calls.
+ OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call);
+ OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier);
+ OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master);
+ OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master);
}
return Changed;
diff --git a/llvm/test/Transforms/OpenMP/parallel_region_merging.ll b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll
index 56e6edf9883e0..c7d267f6e355b 100644
--- a/llvm/test/Transforms/OpenMP/parallel_region_merging.ll
+++ b/llvm/test/Transforms/OpenMP/parallel_region_merging.ll
@@ -1,7 +1,232 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
-; RUN: opt -S -attributor -openmpopt -openmp-opt-enable-merging < %s -enable-new-pm=0 | FileCheck %s
-; RUN: opt -S -passes='attributor,cgscc(openmpopt)' -openmp-opt-enable-merging < %s | FileCheck %s
-
+; RUN: opt -S -attributor -openmpopt -openmp-opt-enable-merging -enable-new-pm=0 < %s | FileCheck %s
+; RUN: opt -S -passes='attributor,cgscc(openmpopt)' -openmp-opt-enable-merging < %s | FileCheck %s
+; #include <omp.h>
+; void foo();
+; void use(int);
+; void usef(float);
+; void merge(int a) {
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; }
+; void unmergable_proc_bind(int a) {
+; #pragma omp parallel proc_bind(close)
+; {
+; use(a);
+; }
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; }
+; void unmergable_num_threads(int a) {
+; #pragma omp parallel num_threads(a)
+; {
+; use(a);
+; }
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; }
+; void unmergable_seq_call(int a) {
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; foo();
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; }
+; void merge_seq(int a) {
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; a = a + 1;
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; use(a);
+; }
+; void merge_seq_float(float f, float *p) {
+; #pragma omp parallel
+; {
+; use(f);
+; }
+; *p = f + 3.14f;
+; #pragma omp parallel
+; {
+; use(f);
+; }
+; }
+; void merge_seq_firstprivate(int a) {
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; a = a + 1;
+; #pragma omp parallel firstprivate(a)
+; {
+; use(a);
+; }
+; use(a);
+; }
+; void merge_seq_sink_lt(int a) {
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; {
+; int b = (int)&b;
+; }
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; }
+; void merge_seq_par_use(int a) {
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; int b = a + 1;
+; #pragma omp parallel
+; {
+; use(a);
+; use(b);
+; }
+; }
+; void merge_cancellable_regions(int cancel1, int cancel2)
+; {
+; #pragma omp parallel
+; {
+; if(cancel1) {
+; #pragma omp cancel parallel
+; }
+; }
+; #pragma omp parallel
+; {
+; if (cancel2) {
+; #pragma omp cancel parallel
+; }
+; }
+; }
+; void merge_cancellable_regions_seq(int cancel1, int cancel2)
+; {
+; #pragma omp parallel
+; {
+; if(cancel1) {
+; #pragma omp cancel parallel
+; }
+; }
+; cancel2 = !cancel1;
+; #pragma omp parallel
+; {
+; if (cancel2) {
+; #pragma omp cancel parallel
+; }
+; }
+; }
+; void merge_3(int a) {
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; }
+; void merge_3_seq(int a, int b) {
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; b = a + 1;
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; b = b + a;
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; use(b);
+; }
+; void unmergable_3_seq_call(int a) {
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; foo();
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; foo();
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; }
+; void unmergable_3_proc_bind(int a) {
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; #pragma omp parallel proc_bind(close)
+; {
+; use(a);
+; }
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; }
+; void unmergable_3_num_threads(int a) {
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; #pragma omp parallel num_threads(a)
+; {
+; use(a);
+; }
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; }
+; void merge_2_unmergable_1(int a) {
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; foo();
+; #pragma omp parallel
+; {
+; use(a);
+; }
+; }
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
%struct.ident_t = type { i32, i32, i32, i32, i8* }
@@ -9,219 +234,583 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
-; void merge_all() {
-; int a = 1;
-; #pragma omp parallel
-; {
-; a = 2;
-; }
-; #pragma omp parallel
-; {
-; a = 3;
-; }
-; }
-;
-; Merge all parallel regions.
-define dso_local void @merge_all() local_unnamed_addr {
- %1 = alloca i32, align 4
- %2 = bitcast i32* %1 to i8*
- store i32 1, i32* %1, align 4
- %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
- call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
- %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
- call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par.1 to void (i32*, i32*, ...)*), i32* nonnull %1)
+define dso_local void @merge(i32 %a) local_unnamed_addr {
+entry:
+ %a.addr = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
ret void
}
-define internal void @merge_all..omp_par.1(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
- store i32 3, i32* %2, align 4
+define internal void @.omp_outlined.(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
ret void
}
-define internal void @merge_all..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
- store i32 2, i32* %2, align 4
+declare dso_local void @use(i32) local_unnamed_addr
+
+declare !callback !1 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) local_unnamed_addr
+
+define internal void @.omp_outlined..1(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
ret void
}
+define dso_local void @unmergable_proc_bind(i32 %a) local_unnamed_addr {
+entry:
+ %a.addr = alloca i32, align 4
+ %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ store i32 %a, i32* %a.addr, align 4
+ call void @__kmpc_push_proc_bind(%struct.ident_t* nonnull @1, i32 %0, i32 3)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ ret void
+}
+
+define internal void @.omp_outlined..2(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
+ ret void
+}
declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr
-declare !callback !1 void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) local_unnamed_addr
+declare void @__kmpc_push_proc_bind(%struct.ident_t*, i32, i32) local_unnamed_addr
-; void merge_none() {
-; int a = 1;
-; #pragma omp parallel
-; {
-; a = 2;
-; }
-; a = 3;
-; #pragma omp parallel
-; {
-; a = 4;
-; }
-; }
-;
-; Does not merge parallel regions, in-between store
-; instruction is unsafe to execute in parallel.
-define dso_local void @merge_none() local_unnamed_addr {
- %1 = alloca i32, align 4
- %2 = bitcast i32* %1 to i8*
- store i32 1, i32* %1, align 4
- %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
- call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
- store i32 3, i32* %1, align 4
- %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
- call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par.2 to void (i32*, i32*, ...)*), i32* nonnull %1)
+define internal void @.omp_outlined..3(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
ret void
}
-define internal void @merge_none..omp_par.2(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
- store i32 4, i32* %2, align 4
+define dso_local void @unmergable_num_threads(i32 %a) local_unnamed_addr {
+entry:
+ %a.addr = alloca i32, align 4
+ %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ store i32 %a, i32* %a.addr, align 4
+ call void @__kmpc_push_num_threads(%struct.ident_t* nonnull @1, i32 %0, i32 %a)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..4 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..5 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
ret void
}
-define internal void @merge_none..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
- store i32 2, i32* %2, align 4
+define internal void @.omp_outlined..4(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
ret void
}
-; void merge_some() {
-; int a = 1;
-; #pragma omp parallel
-; {
-; a = 2;
-; }
-; a = 3;
-; #pragma omp parallel
-; {
-; a = 4;
-; }
-; #pragma omp parallel
-; {
-; a = 5;
-; }
-; }
-;
-; Do not merge first parallel region, due to the
-; unsafe store, but merge the two next parallel
-; regions.
-define dso_local void @merge_some() local_unnamed_addr {
- %1 = alloca i32, align 4
- %2 = bitcast i32* %1 to i8*
- store i32 1, i32* %1, align 4
- %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
- call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* nonnull %1)
- store i32 3, i32* %1, align 4
- %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
- call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.3 to void (i32*, i32*, ...)*), i32* nonnull %1)
- %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
- call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.4 to void (i32*, i32*, ...)*), i32* nonnull %1)
+declare void @__kmpc_push_num_threads(%struct.ident_t*, i32, i32) local_unnamed_addr
+
+define internal void @.omp_outlined..5(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
ret void
}
-define internal void @merge_some..omp_par.4(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
- store i32 5, i32* %2, align 4
+define dso_local void @unmergable_seq_call(i32 %a) local_unnamed_addr {
+entry:
+ %a.addr = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..6 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ call void (...) @foo()
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..7 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
ret void
}
-define internal void @merge_some..omp_par.3(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
- store i32 4, i32* %2, align 4
+define internal void @.omp_outlined..6(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
ret void
}
-define internal void @merge_some..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture %2) {
- store i32 2, i32* %2, align 4
+declare dso_local void @foo(...) local_unnamed_addr
+
+define internal void @.omp_outlined..7(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
ret void
}
-; void merge_cancellable_regions(int cancel1, int cancel2)
-; {
-; #pragma omp parallel
-; {
-; if(cancel1) {
-; #pragma omp cancel parallel
-; }
-; }
-; #pragma omp parallel
-; {
-; if (cancel2) {
-; #pragma omp cancel parallel
-; }
-; }
-; }
-;
-; Merge correctly cancellable regions.
-define dso_local void @merge_cancellable_regions(i32 %0, i32 %1) local_unnamed_addr {
- %3 = alloca i32, align 4
- %4 = alloca i32, align 4
- store i32 %0, i32* %3, align 4
- store i32 %1, i32* %4, align 4
- %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
- call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_cancellable_regions..omp_par to void (i32*, i32*, ...)*), i32* nonnull %3)
- %6 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
- call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_cancellable_regions..omp_par.5 to void (i32*, i32*, ...)*), i32* nonnull %4)
+define dso_local void @merge_seq(i32 %a) local_unnamed_addr {
+entry:
+ %a.addr = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..8 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ %0 = load i32, i32* %a.addr, align 4
+ %add = add nsw i32 %0, 1
+ store i32 %add, i32* %a.addr, align 4
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..9 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ %1 = load i32, i32* %a.addr, align 4
+ call void @use(i32 %1)
+ ret void
+}
+
+define internal void @.omp_outlined..8(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
+ ret void
+}
+
+define internal void @.omp_outlined..9(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
+ ret void
+}
+
+define dso_local void @merge_seq_float(float %f, float* nocapture %p) local_unnamed_addr {
+entry:
+ %f.addr = alloca float, align 4
+ store float %f, float* %f.addr, align 4
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, float*)* @.omp_outlined..10 to void (i32*, i32*, ...)*), float* nonnull %f.addr)
+ %0 = load float, float* %f.addr, align 4
+ %add = fadd float %0, 0x40091EB860000000
+ store float %add, float* %p, align 4
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, float*)* @.omp_outlined..11 to void (i32*, i32*, ...)*), float* nonnull %f.addr)
+ ret void
+}
+
+define internal void @.omp_outlined..10(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., float* nocapture nonnull readonly align 4 dereferenceable(4) %f) {
+entry:
+ %0 = load float, float* %f, align 4
+ %conv = fptosi float %0 to i32
+ call void @use(i32 %conv)
+ ret void
+}
+
+define internal void @.omp_outlined..11(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., float* nocapture nonnull readonly align 4 dereferenceable(4) %f) {
+entry:
+ %0 = load float, float* %f, align 4
+ %conv = fptosi float %0 to i32
+ call void @use(i32 %conv)
ret void
}
-define internal void @merge_cancellable_regions..omp_par.5(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture readonly %2) {
- %4 = load i32, i32* %2, align 4
- %5 = icmp eq i32 %4, 0
- br i1 %5, label %6, label %7
+define dso_local void @merge_seq_firstprivate(i32 %a) local_unnamed_addr {
+entry:
+ %a.addr = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..12 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ %0 = load i32, i32* %a.addr, align 4
+ %add = add nsw i32 %0, 1
+ store i32 %add, i32* %a.addr, align 4
+ %a.casted.sroa.0.0.insert.ext = zext i32 %add to i64
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64)* @.omp_outlined..13 to void (i32*, i32*, ...)*), i64 %a.casted.sroa.0.0.insert.ext)
+ %1 = load i32, i32* %a.addr, align 4
+ call void @use(i32 %1)
+ ret void
+}
+
+define internal void @.omp_outlined..12(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
+ ret void
+}
+
+define internal void @.omp_outlined..13(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i64 %a) {
+entry:
+ %a.addr.sroa.0.0.extract.trunc = trunc i64 %a to i32
+ call void @use(i32 %a.addr.sroa.0.0.extract.trunc)
+ ret void
+}
+
+define dso_local void @merge_seq_sink_lt(i32 %a) local_unnamed_addr {
+entry:
+ %a.addr = alloca i32, align 4
+ %b = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..14 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ %0 = bitcast i32* %b to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0)
+ %1 = ptrtoint i32* %b to i64
+ %2 = trunc i64 %1 to i32
+ store i32 %2, i32* %b, align 4
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..15 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ ret void
+}
+
+define internal void @.omp_outlined..14(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
+ ret void
+}
+
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
-6: ; preds = %3
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture)
+
+define internal void @.omp_outlined..15(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
ret void
+}
-7: ; preds = %3
- %8 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
- %9 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %8, i32 1)
+define dso_local void @merge_seq_par_use(i32 %a) local_unnamed_addr {
+entry:
+ %a.addr = alloca i32, align 4
+ %b = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..16 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ %0 = bitcast i32* %b to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0)
+ %1 = load i32, i32* %a.addr, align 4
+ %add = add nsw i32 %1, 1
+ store i32 %add, i32* %b, align 4
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @.omp_outlined..17 to void (i32*, i32*, ...)*), i32* nonnull %a.addr, i32* nonnull %b)
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0)
ret void
}
-define internal void @merge_cancellable_regions..omp_par(i32* noalias nocapture readnone %0, i32* noalias nocapture readnone %1, i32* nocapture readonly %2) {
- %4 = load i32, i32* %2, align 4
- %5 = icmp eq i32 %4, 0
- br i1 %5, label %6, label %7
+define internal void @.omp_outlined..16(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
+ ret void
+}
-6: ; preds = %3
+define internal void @.omp_outlined..17(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a, i32* nocapture nonnull readonly align 4 dereferenceable(4) %b) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
+ %1 = load i32, i32* %b, align 4
+ call void @use(i32 %1)
ret void
+}
-7: ; preds = %3
- %8 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
- %9 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %8, i32 1)
+define dso_local void @merge_cancellable_regions(i32 %cancel1, i32 %cancel2) local_unnamed_addr {
+entry:
+ %cancel1.addr = alloca i32, align 4
+ %cancel2.addr = alloca i32, align 4
+ store i32 %cancel1, i32* %cancel1.addr, align 4
+ store i32 %cancel2, i32* %cancel2.addr, align 4
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..18 to void (i32*, i32*, ...)*), i32* nonnull %cancel1.addr)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..19 to void (i32*, i32*, ...)*), i32* nonnull %cancel2.addr)
+ ret void
+}
+
+define internal void @.omp_outlined..18(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %cancel1) {
+entry:
+ %0 = load i32, i32* %cancel1, align 4
+ %tobool.not = icmp eq i32 %0, 0
+ br i1 %tobool.not, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %1 = load i32, i32* %.global_tid., align 4
+ %2 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %1, i32 1)
+ ret void
+
+if.end: ; preds = %entry
ret void
}
declare i32 @__kmpc_cancel(%struct.ident_t*, i32, i32) local_unnamed_addr
+define internal void @.omp_outlined..19(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %cancel2) {
+entry:
+ %0 = load i32, i32* %cancel2, align 4
+ %tobool.not = icmp eq i32 %0, 0
+ br i1 %tobool.not, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %1 = load i32, i32* %.global_tid., align 4
+ %2 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %1, i32 1)
+ ret void
+
+if.end: ; preds = %entry
+ ret void
+}
+
+define dso_local void @merge_cancellable_regions_seq(i32 %cancel1, i32 %cancel2) local_unnamed_addr {
+entry:
+ %cancel1.addr = alloca i32, align 4
+ %cancel2.addr = alloca i32, align 4
+ store i32 %cancel1, i32* %cancel1.addr, align 4
+ store i32 %cancel2, i32* %cancel2.addr, align 4
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..20 to void (i32*, i32*, ...)*), i32* nonnull %cancel1.addr)
+ %0 = load i32, i32* %cancel1.addr, align 4
+ %tobool.not = icmp eq i32 %0, 0
+ %lnot.ext = zext i1 %tobool.not to i32
+ store i32 %lnot.ext, i32* %cancel2.addr, align 4
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..21 to void (i32*, i32*, ...)*), i32* nonnull %cancel2.addr)
+ ret void
+}
+
+define internal void @.omp_outlined..20(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %cancel1) {
+entry:
+ %0 = load i32, i32* %cancel1, align 4
+ %tobool.not = icmp eq i32 %0, 0
+ br i1 %tobool.not, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %1 = load i32, i32* %.global_tid., align 4
+ %2 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %1, i32 1)
+ ret void
+
+if.end: ; preds = %entry
+ ret void
+}
+
+define internal void @.omp_outlined..21(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %cancel2) {
+entry:
+ %0 = load i32, i32* %cancel2, align 4
+ %tobool.not = icmp eq i32 %0, 0
+ br i1 %tobool.not, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %1 = load i32, i32* %.global_tid., align 4
+ %2 = call i32 @__kmpc_cancel(%struct.ident_t* nonnull @1, i32 %1, i32 1)
+ ret void
+
+if.end: ; preds = %entry
+ ret void
+}
+
+define dso_local void @merge_3(i32 %a) local_unnamed_addr {
+entry:
+ %a.addr = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..22 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..23 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..24 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ ret void
+}
+
+define internal void @.omp_outlined..22(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
+ ret void
+}
+
+define internal void @.omp_outlined..23(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
+ ret void
+}
+
+define internal void @.omp_outlined..24(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
+ ret void
+}
+
+define dso_local void @merge_3_seq(i32 %a, i32 %b) local_unnamed_addr {
+entry:
+ %a.addr = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..25 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ %0 = load i32, i32* %a.addr, align 4
+ %add = add nsw i32 %0, 1
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..26 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ %1 = load i32, i32* %a.addr, align 4
+ %add1 = add nsw i32 %add, %1
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..27 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ call void @use(i32 %add1)
+ ret void
+}
+
+define internal void @.omp_outlined..25(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
+ ret void
+}
+
+define internal void @.omp_outlined..26(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
+ ret void
+}
+
+define internal void @.omp_outlined..27(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
+ ret void
+}
+
+define dso_local void @unmergable_3_seq_call(i32 %a) local_unnamed_addr {
+entry:
+ %a.addr = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..28 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ call void (...) @foo()
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..29 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ call void (...) @foo()
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..30 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ ret void
+}
+
+define internal void @.omp_outlined..28(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
+ ret void
+}
+
+define internal void @.omp_outlined..29(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
+ ret void
+}
+
+define internal void @.omp_outlined..30(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
+ ret void
+}
+
+define dso_local void @unmergable_3_proc_bind(i32 %a) local_unnamed_addr {
+entry:
+ %a.addr = alloca i32, align 4
+ %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ store i32 %a, i32* %a.addr, align 4
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..31 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ call void @__kmpc_push_proc_bind(%struct.ident_t* nonnull @1, i32 %0, i32 3)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..32 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..33 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ ret void
+}
+
+define internal void @.omp_outlined..31(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
+ ret void
+}
+
+define internal void @.omp_outlined..32(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
+ ret void
+}
+
+define internal void @.omp_outlined..33(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
+ ret void
+}
+
+define dso_local void @unmergable_3_num_threads(i32 %a) local_unnamed_addr {
+entry:
+ %a.addr = alloca i32, align 4
+ %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
+ store i32 %a, i32* %a.addr, align 4
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..34 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ %1 = load i32, i32* %a.addr, align 4
+ call void @__kmpc_push_num_threads(%struct.ident_t* nonnull @1, i32 %0, i32 %1)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..35 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..36 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ ret void
+}
+
+define internal void @.omp_outlined..34(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
+ ret void
+}
+
+define internal void @.omp_outlined..35(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
+ ret void
+}
+
+define internal void @.omp_outlined..36(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
+ ret void
+}
+
+define dso_local void @merge_2_unmergable_1(i32 %a) local_unnamed_addr {
+entry:
+ %a.addr = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..37 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..38 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ call void (...) @foo()
+ call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @1, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined..39 to void (i32*, i32*, ...)*), i32* nonnull %a.addr)
+ ret void
+}
+
+define internal void @.omp_outlined..37(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
+ ret void
+}
+
+define internal void @.omp_outlined..38(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
+ ret void
+}
+
+define internal void @.omp_outlined..39(i32* noalias nocapture readnone %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) {
+entry:
+ %0 = load i32, i32* %a, align 4
+ call void @use(i32 %0)
+ ret void
+}
+
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!2}
!2 = !{i64 2, i64 -1, i64 -1, i1 true}
-; CHECK-LABEL: define {{[^@]+}}@merge_all() local_unnamed_addr {
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1:@.*]])
-; CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4
-; CHECK-NEXT: store i32 1, i32* [[TMP2]], align 4
-; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-LABEL: define {{[^@]+}}@merge
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1:@.*]])
; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]]
; CHECK: omp_parallel:
-; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_all..omp_par.2 to void (i32*, i32*, ...)*), i32* [[TMP2]])
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]])
; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
; CHECK: omp.par.outlined.exit:
; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
; CHECK: omp.par.exit.split:
-; CHECK-NEXT: br label [[DOTSPLIT_SPLIT:%.*]]
-; CHECK: .split.split:
+; CHECK-NEXT: br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK: entry.split.split:
; CHECK-NEXT: ret void
;
;
-; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par.2
-; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]]) [[ATTR0:#.*]] {
+; CHECK-LABEL: define {{[^@]+}}@merge..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[A_ADDR:%.*]]) [[ATTR0:#.*]] {
; CHECK-NEXT: omp.par.entry:
; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TID_ADDR]], align 4
-; CHECK-NEXT: store i32 [[TMP1]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]]
; CHECK: omp.par.outlined.exit.exitStub:
@@ -229,12 +818,12 @@ declare i32 @__kmpc_cancel(%struct.ident_t*, i32, i32) local_unnamed_addr
; CHECK: omp.par.region:
; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]]
; CHECK: omp.par.merged:
-; CHECK-NEXT: call void @merge_all..omp_par(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT: call void @.omp_outlined.(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM]])
-; CHECK-NEXT: call void @merge_all..omp_par.1(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
-; CHECK-NEXT: br label [[DOTSPLIT:%.*]]
-; CHECK: .split:
+; CHECK-NEXT: call void @.omp_outlined..1(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: br label [[ENTRY_SPLIT:%.*]]
+; CHECK: entry.split:
; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]]
; CHECK: omp.par.region.split:
; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
@@ -242,65 +831,212 @@ declare i32 @__kmpc_cancel(%struct.ident_t*, i32, i32) local_unnamed_addr
; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
;
;
-; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par.1
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1:#.*]] {
-; CHECK-NEXT: store i32 3, i32* [[TMP2]], align 4
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined.
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..1
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@unmergable_proc_bind
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT: call void @__kmpc_push_proc_bind(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 [[TMP0]], i32 noundef 3)
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..2
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..3
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@unmergable_num_threads
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 [[TMP0]], i32 [[A]])
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..4 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..5 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..4
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
; CHECK-NEXT: ret void
;
;
-; CHECK-LABEL: define {{[^@]+}}@merge_all..omp_par
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
-; CHECK-NEXT: store i32 2, i32* [[TMP2]], align 4
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..5
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
; CHECK-NEXT: ret void
;
;
-; CHECK-LABEL: define {{[^@]+}}@merge_none() local_unnamed_addr {
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
-; CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4
-; CHECK-NEXT: store i32 1, i32* [[TMP2]], align 4
-; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
-; CHECK-NEXT: store i32 3, i32* [[TMP2]], align 4
-; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_none..omp_par.2 to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
+; CHECK-LABEL: define {{[^@]+}}@unmergable_seq_call
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..6 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: call void (...) @foo()
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..7 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
; CHECK-NEXT: ret void
;
;
-; CHECK-LABEL: define {{[^@]+}}@merge_none..omp_par.2
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
-; CHECK-NEXT: store i32 4, i32* [[TMP2]], align 4
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..6
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
; CHECK-NEXT: ret void
;
;
-; CHECK-LABEL: define {{[^@]+}}@merge_none..omp_par
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
-; CHECK-NEXT: store i32 2, i32* [[TMP2]], align 4
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..7
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
; CHECK-NEXT: ret void
;
;
-; CHECK-LABEL: define {{[^@]+}}@merge_some() local_unnamed_addr {
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
-; CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4
-; CHECK-NEXT: store i32 1, i32* [[TMP2]], align 4
-; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2]])
-; CHECK-NEXT: store i32 3, i32* [[TMP2]], align 4
+; CHECK-LABEL: define {{[^@]+}}@merge_seq
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]]
+; CHECK: omp_parallel:
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_seq..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]])
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK: omp.par.outlined.exit:
+; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK: omp.par.exit.split:
+; CHECK-NEXT: br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK: entry.split.split:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_seq..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[A_ADDR:%.*]]) [[ATTR0]] {
+; CHECK-NEXT: omp.par.entry:
+; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]]
+; CHECK: omp.par.outlined.exit.exitStub:
+; CHECK-NEXT: ret void
+; CHECK: omp.par.region:
+; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]]
+; CHECK: omp.par.merged:
+; CHECK-NEXT: call void @.omp_outlined..8(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]]
+; CHECK: omp_region.end:
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]])
+; CHECK-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]]
+; CHECK: omp.par.merged.split.split:
+; CHECK-NEXT: call void @.omp_outlined..9(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: br label [[ENTRY_SPLIT:%.*]]
+; CHECK: entry.split:
+; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK: omp.par.region.split:
+; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK: omp.par.pre_finalize:
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+; CHECK: omp_region.body:
+; CHECK-NEXT: br label [[SEQ_PAR_MERGED:%.*]]
+; CHECK: seq.par.merged:
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1
+; CHECK-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_MERGED_SPLIT:%.*]]
+; CHECK: omp.par.merged.split:
+; CHECK-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]]
+; CHECK: omp_region.body.split:
+; CHECK-NEXT: call void @__kmpc_end_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: br label [[OMP_REGION_END]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..8
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..9
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_seq_float
+; CHECK-SAME: (float [[F:%.*]], float* nocapture writeonly [[P:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[F_ADDR:%.*]] = alloca float, align 4
+; CHECK-NEXT: store float [[F]], float* [[F_ADDR]], align 4
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]]
; CHECK: omp_parallel:
-; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_some..omp_par.5 to void (i32*, i32*, ...)*), i32* [[TMP2]])
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, float*, float*)* @merge_seq_float..omp_par to void (i32*, i32*, ...)*), float* [[F_ADDR]], float* [[P]])
; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
; CHECK: omp.par.outlined.exit:
; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
; CHECK: omp.par.exit.split:
-; CHECK-NEXT: br label [[DOTSPLIT_SPLIT:%.*]]
-; CHECK: .split.split:
+; CHECK-NEXT: br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK: entry.split.split:
; CHECK-NEXT: ret void
;
;
-; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.5
-; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]]) [[ATTR0]] {
+; CHECK-LABEL: define {{[^@]+}}@merge_seq_float..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], float* [[F_ADDR:%.*]], float* [[P:%.*]]) [[ATTR0]] {
; CHECK-NEXT: omp.par.entry:
; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TID_ADDR]], align 4
-; CHECK-NEXT: store i32 [[TMP1]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]]
; CHECK: omp.par.outlined.exit.exitStub:
@@ -308,63 +1044,345 @@ declare i32 @__kmpc_cancel(%struct.ident_t*, i32, i32) local_unnamed_addr
; CHECK: omp.par.region:
; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]]
; CHECK: omp.par.merged:
-; CHECK-NEXT: call void @merge_some..omp_par.3(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT: call void @.omp_outlined..10(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], float* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[F_ADDR]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
-; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
-; CHECK-NEXT: call void @merge_some..omp_par.4(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0]])
-; CHECK-NEXT: br label [[DOTSPLIT:%.*]]
-; CHECK: .split:
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]]
+; CHECK: omp_region.end:
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]])
+; CHECK-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]]
+; CHECK: omp.par.merged.split.split:
+; CHECK-NEXT: call void @.omp_outlined..11(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], float* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[F_ADDR]])
+; CHECK-NEXT: br label [[ENTRY_SPLIT:%.*]]
+; CHECK: entry.split:
; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]]
; CHECK: omp.par.region.split:
; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
; CHECK: omp.par.pre_finalize:
; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+; CHECK: omp_region.body:
+; CHECK-NEXT: br label [[SEQ_PAR_MERGED:%.*]]
+; CHECK: seq.par.merged:
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[F_ADDR]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP3]], 0x40091EB860000000
+; CHECK-NEXT: store float [[ADD]], float* [[P]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_MERGED_SPLIT:%.*]]
+; CHECK: omp.par.merged.split:
+; CHECK-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]]
+; CHECK: omp_region.body.split:
+; CHECK-NEXT: call void @__kmpc_end_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: br label [[OMP_REGION_END]]
;
;
-; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.4
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
-; CHECK-NEXT: store i32 5, i32* [[TMP2]], align 4
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..10
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], float* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[F:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[F]], align 4
+; CHECK-NEXT: [[CONV:%.*]] = fptosi float [[TMP0]] to i32
+; CHECK-NEXT: call void @use(i32 [[CONV]])
; CHECK-NEXT: ret void
;
;
-; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par.3
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
-; CHECK-NEXT: store i32 4, i32* [[TMP2]], align 4
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..11
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], float* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[F:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[F]], align 4
+; CHECK-NEXT: [[CONV:%.*]] = fptosi float [[TMP0]] to i32
+; CHECK-NEXT: call void @use(i32 [[CONV]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_seq_firstprivate
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_ALLOC:%.*]] = alloca i64, align 8
+; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]]
+; CHECK: omp_parallel:
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i64*)* @merge_seq_firstprivate..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]], i64* [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_ALLOC]])
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK: omp.par.outlined.exit:
+; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK: omp.par.exit.split:
+; CHECK-NEXT: br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK: entry.split.split:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
; CHECK-NEXT: ret void
;
;
-; CHECK-LABEL: define {{[^@]+}}@merge_some..omp_par
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP2:%.*]]) [[ATTR1]] {
-; CHECK-NEXT: store i32 2, i32* [[TMP2]], align 4
+; CHECK-LABEL: define {{[^@]+}}@merge_seq_firstprivate..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[A_ADDR:%.*]], i64* [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_ALLOC:%.*]]) [[ATTR0]] {
+; CHECK-NEXT: omp.par.entry:
+; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]]
+; CHECK: omp.par.outlined.exit.exitStub:
+; CHECK-NEXT: ret void
+; CHECK: omp.par.region:
+; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]]
+; CHECK: omp.par.merged:
+; CHECK-NEXT: call void @.omp_outlined..12(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]]
+; CHECK: omp_region.end:
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]])
+; CHECK-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]]
+; CHECK: omp.par.merged.split.split:
+; CHECK-NEXT: [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_LOAD:%.*]] = load i64, i64* [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_ALLOC]], align 8
+; CHECK-NEXT: call void @.omp_outlined..13(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i64 [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_LOAD]])
+; CHECK-NEXT: br label [[ENTRY_SPLIT:%.*]]
+; CHECK: entry.split:
+; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK: omp.par.region.split:
+; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK: omp.par.pre_finalize:
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+; CHECK: omp_region.body:
+; CHECK-NEXT: br label [[SEQ_PAR_MERGED:%.*]]
+; CHECK: seq.par.merged:
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1
+; CHECK-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT: [[A_CASTED_SROA_0_0_INSERT_EXT:%.*]] = zext i32 [[ADD]] to i64
+; CHECK-NEXT: store i64 [[A_CASTED_SROA_0_0_INSERT_EXT]], i64* [[A_CASTED_SROA_0_0_INSERT_EXT_SEQ_OUTPUT_ALLOC]], align 8
+; CHECK-NEXT: br label [[OMP_PAR_MERGED_SPLIT:%.*]]
+; CHECK: omp.par.merged.split:
+; CHECK-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]]
+; CHECK: omp_region.body.split:
+; CHECK-NEXT: call void @__kmpc_end_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: br label [[OMP_REGION_END]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..12
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..13
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i64 [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_ADDR_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A]] to i32
+; CHECK-NEXT: call void @use(i32 [[A_ADDR_SROA_0_0_EXTRACT_TRUNC]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_seq_sink_lt
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]]
+; CHECK: omp_parallel:
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_seq_sink_lt..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]])
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK: omp.par.outlined.exit:
+; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK: omp.par.exit.split:
+; CHECK-NEXT: br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK: entry.split.split:
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_seq_sink_lt..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[A_ADDR:%.*]]) [[ATTR0]] {
+; CHECK-NEXT: omp.par.entry:
+; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]]
+; CHECK: omp.par.outlined.exit.exitStub:
+; CHECK-NEXT: ret void
+; CHECK: omp.par.region:
+; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]]
+; CHECK: omp.par.merged:
+; CHECK-NEXT: call void @.omp_outlined..14(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]]
+; CHECK: omp_region.end:
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]])
+; CHECK-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]]
+; CHECK: omp.par.merged.split.split:
+; CHECK-NEXT: call void @.omp_outlined..15(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: br label [[ENTRY_SPLIT:%.*]]
+; CHECK: entry.split:
+; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK: omp.par.region.split:
+; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK: omp.par.pre_finalize:
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+; CHECK: omp_region.body:
+; CHECK-NEXT: br label [[SEQ_PAR_MERGED:%.*]]
+; CHECK: seq.par.merged:
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[B]] to i8*
+; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 noundef 4, i8* noundef nonnull [[TMP3]])
+; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint i32* [[B]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32
+; CHECK-NEXT: store i32 [[TMP5]], i32* [[B]], align 4
+; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 4, i8* noundef nonnull [[TMP3]])
+; CHECK-NEXT: br label [[OMP_PAR_MERGED_SPLIT:%.*]]
+; CHECK: omp.par.merged.split:
+; CHECK-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]]
+; CHECK: omp_region.body.split:
+; CHECK-NEXT: call void @__kmpc_end_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: br label [[OMP_REGION_END]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..14
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..15
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_seq_par_use
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]]
+; CHECK: omp_parallel:
+; CHECK-NEXT: [[LT_CAST3:%.*]] = bitcast i32* [[B]] to i8*
+; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST3]])
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @merge_seq_par_use..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]], i32* [[B]])
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK: omp.par.outlined.exit:
+; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK: omp.par.exit.split:
+; CHECK-NEXT: br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK: entry.split.split:
+; CHECK-NEXT: [[LT_CAST:%.*]] = bitcast i32* [[B]] to i8*
+; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 noundef 4, i8* noundef nonnull [[LT_CAST]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_seq_par_use..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[A_ADDR:%.*]], i32* [[B:%.*]]) [[ATTR0]] {
+; CHECK-NEXT: omp.par.entry:
+; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]]
+; CHECK: omp.par.outlined.exit.exitStub:
+; CHECK-NEXT: ret void
+; CHECK: omp.par.region:
+; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]]
+; CHECK: omp.par.merged:
+; CHECK-NEXT: call void @.omp_outlined..16(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]]
+; CHECK: omp_region.end:
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]])
+; CHECK-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]]
+; CHECK: omp.par.merged.split.split:
+; CHECK-NEXT: call void @.omp_outlined..17(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[B]])
+; CHECK-NEXT: br label [[ENTRY_SPLIT:%.*]]
+; CHECK: entry.split:
+; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK: omp.par.region.split:
+; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK: omp.par.pre_finalize:
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+; CHECK: omp_region.body:
+; CHECK-NEXT: br label [[SEQ_PAR_MERGED:%.*]]
+; CHECK: seq.par.merged:
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[B]] to i8*
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], 1
+; CHECK-NEXT: store i32 [[ADD]], i32* [[B]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_MERGED_SPLIT:%.*]]
+; CHECK: omp.par.merged.split:
+; CHECK-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]]
+; CHECK: omp_region.body.split:
+; CHECK-NEXT: call void @__kmpc_end_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: br label [[OMP_REGION_END]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..16
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..17
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[B:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[B]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP1]])
; CHECK-NEXT: ret void
;
;
; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions
-; CHECK-SAME: (i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) local_unnamed_addr {
-; CHECK-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
-; CHECK-NEXT: [[TMP4:%.*]] = alloca i32, align 4
-; CHECK-NEXT: [[TMP5:%.*]] = alloca i32, align 4
-; CHECK-NEXT: store i32 [[TMP0]], i32* [[TMP4]], align 4
-; CHECK-NEXT: store i32 [[TMP1]], i32* [[TMP5]], align 4
+; CHECK-SAME: (i32 [[CANCEL1:%.*]], i32 [[CANCEL2:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CANCEL1_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[CANCEL2_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 [[CANCEL1]], i32* [[CANCEL1_ADDR]], align 4
+; CHECK-NEXT: store i32 [[CANCEL2]], i32* [[CANCEL2_ADDR]], align 4
; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]]
; CHECK: omp_parallel:
-; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @merge_cancellable_regions..omp_par.6 to void (i32*, i32*, ...)*), i32* [[TMP4]], i32* [[TMP5]])
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @merge_cancellable_regions..omp_par to void (i32*, i32*, ...)*), i32* [[CANCEL1_ADDR]], i32* [[CANCEL2_ADDR]])
; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
; CHECK: omp.par.outlined.exit:
; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
; CHECK: omp.par.exit.split:
-; CHECK-NEXT: br label [[DOTSPLIT_SPLIT:%.*]]
-; CHECK: .split.split:
+; CHECK-NEXT: br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK: entry.split.split:
; CHECK-NEXT: ret void
;
;
-; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par.6
-; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[TMP0:%.*]], i32* [[TMP1:%.*]]) [[ATTR0]] {
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[CANCEL1_ADDR:%.*]], i32* [[CANCEL2_ADDR:%.*]]) [[ATTR0]] {
; CHECK-NEXT: omp.par.entry:
; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TID_ADDR]], align 4
-; CHECK-NEXT: store i32 [[TMP2]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]]
; CHECK: omp.par.outlined.exit.exitStub:
@@ -372,12 +1390,12 @@ declare i32 @__kmpc_cancel(%struct.ident_t*, i32, i32) local_unnamed_addr
; CHECK: omp.par.region:
; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]]
; CHECK: omp.par.merged:
-; CHECK-NEXT: call void @merge_cancellable_regions..omp_par(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP0]])
+; CHECK-NEXT: call void @.omp_outlined..18(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL1_ADDR]])
; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
-; CHECK-NEXT: call void @merge_cancellable_regions..omp_par.5(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP1]])
-; CHECK-NEXT: br label [[DOTSPLIT:%.*]]
-; CHECK: .split:
+; CHECK-NEXT: call void @.omp_outlined..19(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL2_ADDR]])
+; CHECK-NEXT: br label [[ENTRY_SPLIT:%.*]]
+; CHECK: entry.split:
; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]]
; CHECK: omp.par.region.split:
; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
@@ -385,28 +1403,497 @@ declare i32 @__kmpc_cancel(%struct.ident_t*, i32, i32) local_unnamed_addr
; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
;
;
-; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par.5
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP2:%.*]]) {
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
-; CHECK-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
-; CHECK: 6:
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..18
+; CHECK-SAME: (i32* noalias nocapture readonly [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL1:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[CANCEL1]], align 4
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP1]], i32 noundef 1)
; CHECK-NEXT: ret void
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull [[GLOB1]])
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP8]], i32 noundef 1)
+; CHECK: if.end:
; CHECK-NEXT: ret void
;
;
-; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions..omp_par
-; CHECK-SAME: (i32* noalias nocapture nofree readnone [[TMP0:%.*]], i32* noalias nocapture nofree readnone [[TMP1:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[TMP2:%.*]]) {
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0
-; CHECK-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP7:%.*]]
-; CHECK: 6:
-; CHECK-NEXT: ret void
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull [[GLOB1]])
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP8]], i32 noundef 1)
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..19
+; CHECK-SAME: (i32* noalias nocapture readonly [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL2:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[CANCEL2]], align 4
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP1]], i32 noundef 1)
+; CHECK-NEXT: ret void
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions_seq
+; CHECK-SAME: (i32 [[CANCEL1:%.*]], i32 [[CANCEL2:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CANCEL1_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[CANCEL2_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 [[CANCEL1]], i32* [[CANCEL1_ADDR]], align 4
+; CHECK-NEXT: store i32 [[CANCEL2]], i32* [[CANCEL2_ADDR]], align 4
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]]
+; CHECK: omp_parallel:
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @merge_cancellable_regions_seq..omp_par to void (i32*, i32*, ...)*), i32* [[CANCEL1_ADDR]], i32* [[CANCEL2_ADDR]])
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK: omp.par.outlined.exit:
+; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK: omp.par.exit.split:
+; CHECK-NEXT: br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK: entry.split.split:
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_cancellable_regions_seq..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[CANCEL1_ADDR:%.*]], i32* [[CANCEL2_ADDR:%.*]]) [[ATTR0]] {
+; CHECK-NEXT: omp.par.entry:
+; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]]
+; CHECK: omp.par.outlined.exit.exitStub:
+; CHECK-NEXT: ret void
+; CHECK: omp.par.region:
+; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]]
+; CHECK: omp.par.merged:
+; CHECK-NEXT: call void @.omp_outlined..20(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL1_ADDR]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]]
+; CHECK: omp_region.end:
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]])
+; CHECK-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]]
+; CHECK: omp.par.merged.split.split:
+; CHECK-NEXT: call void @.omp_outlined..21(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL2_ADDR]])
+; CHECK-NEXT: br label [[ENTRY_SPLIT:%.*]]
+; CHECK: entry.split:
+; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK: omp.par.region.split:
+; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK: omp.par.pre_finalize:
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+; CHECK: omp_region.body:
+; CHECK-NEXT: br label [[SEQ_PAR_MERGED:%.*]]
+; CHECK: seq.par.merged:
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[CANCEL1_ADDR]], align 4
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP3]], 0
+; CHECK-NEXT: [[LNOT_EXT:%.*]] = zext i1 [[TOBOOL_NOT]] to i32
+; CHECK-NEXT: store i32 [[LNOT_EXT]], i32* [[CANCEL2_ADDR]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_MERGED_SPLIT:%.*]]
+; CHECK: omp.par.merged.split:
+; CHECK-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]]
+; CHECK: omp_region.body.split:
+; CHECK-NEXT: call void @__kmpc_end_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: br label [[OMP_REGION_END]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..20
+; CHECK-SAME: (i32* noalias nocapture readonly [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL1:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[CANCEL1]], align 4
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP1]], i32 noundef 1)
+; CHECK-NEXT: ret void
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..21
+; CHECK-SAME: (i32* noalias nocapture readonly [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[CANCEL2:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[CANCEL2]], align 4
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* noundef nonnull [[GLOB1]], i32 [[TMP1]], i32 noundef 1)
+; CHECK-NEXT: ret void
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_3
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]]
+; CHECK: omp_parallel:
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_3..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]])
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK: omp.par.outlined.exit:
+; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK: omp.par.exit.split:
+; CHECK-NEXT: br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK: entry.split.split:
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_3..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[A_ADDR:%.*]]) [[ATTR0]] {
+; CHECK-NEXT: omp.par.entry:
+; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]]
+; CHECK: omp.par.outlined.exit.exitStub:
+; CHECK-NEXT: ret void
+; CHECK: omp.par.region:
+; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]]
+; CHECK: omp.par.merged:
+; CHECK-NEXT: call void @.omp_outlined..22(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: call void @.omp_outlined..23(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]])
+; CHECK-NEXT: call void @.omp_outlined..24(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: br label [[ENTRY_SPLIT:%.*]]
+; CHECK: entry.split:
+; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK: omp.par.region.split:
+; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK: omp.par.pre_finalize:
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..22
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..23
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..24
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_3_seq
+; CHECK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ADD1_SEQ_OUTPUT_ALLOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[ADD_SEQ_OUTPUT_ALLOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM7:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]]
+; CHECK: omp_parallel:
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*, i32*)* @merge_3_seq..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]], i32* [[ADD_SEQ_OUTPUT_ALLOC]], i32* [[ADD1_SEQ_OUTPUT_ALLOC]])
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK: omp.par.outlined.exit:
+; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK: omp.par.exit.split:
+; CHECK-NEXT: br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK: entry.split.split:
+; CHECK-NEXT: [[ADD1_SEQ_OUTPUT_LOAD:%.*]] = load i32, i32* [[ADD1_SEQ_OUTPUT_ALLOC]], align 4
+; CHECK-NEXT: call void @use(i32 [[ADD1_SEQ_OUTPUT_LOAD]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_3_seq..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[A_ADDR:%.*]], i32* [[ADD_SEQ_OUTPUT_ALLOC:%.*]], i32* [[ADD1_SEQ_OUTPUT_ALLOC:%.*]]) [[ATTR0]] {
+; CHECK-NEXT: omp.par.entry:
+; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]]
+; CHECK: omp.par.outlined.exit.exitStub:
+; CHECK-NEXT: ret void
+; CHECK: omp.par.region:
+; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]]
+; CHECK: omp.par.merged:
+; CHECK-NEXT: call void @.omp_outlined..25(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM2]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[TMP2]], label [[OMP_REGION_BODY:%.*]], label [[OMP_REGION_END:%.*]]
+; CHECK: omp_region.end:
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM1]])
+; CHECK-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT:%.*]]
+; CHECK: omp.par.merged.split.split:
+; CHECK-NEXT: call void @.omp_outlined..26(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM4]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]])
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
+; CHECK-NEXT: br i1 [[TMP4]], label [[OMP_REGION_BODY5:%.*]], label [[OMP_REGION_END4:%.*]]
+; CHECK: omp_region.end4:
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM6]])
+; CHECK-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT_SPLIT_SPLIT:%.*]]
+; CHECK: omp.par.merged.split.split.split.split:
+; CHECK-NEXT: call void @.omp_outlined..27(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: br label [[ENTRY_SPLIT:%.*]]
+; CHECK: entry.split:
+; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK: omp.par.region.split:
+; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK: omp.par.pre_finalize:
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+; CHECK: omp_region.body5:
+; CHECK-NEXT: br label [[SEQ_PAR_MERGED2:%.*]]
+; CHECK: seq.par.merged2:
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; CHECK-NEXT: [[ADD_SEQ_OUTPUT_LOAD:%.*]] = load i32, i32* [[ADD_SEQ_OUTPUT_ALLOC]], align 4
+; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[ADD_SEQ_OUTPUT_LOAD]], [[TMP5]]
+; CHECK-NEXT: store i32 [[ADD1]], i32* [[ADD1_SEQ_OUTPUT_ALLOC]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_MERGED_SPLIT_SPLIT_SPLIT:%.*]]
+; CHECK: omp.par.merged.split.split.split:
+; CHECK-NEXT: br label [[OMP_REGION_BODY5_SPLIT:%.*]]
+; CHECK: omp_region.body5.split:
+; CHECK-NEXT: call void @__kmpc_end_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3]])
+; CHECK-NEXT: br label [[OMP_REGION_END4]]
+; CHECK: omp_region.body:
+; CHECK-NEXT: br label [[SEQ_PAR_MERGED:%.*]]
+; CHECK: seq.par.merged:
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
+; CHECK-NEXT: store i32 [[ADD]], i32* [[ADD_SEQ_OUTPUT_ALLOC]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_MERGED_SPLIT:%.*]]
+; CHECK: omp.par.merged.split:
+; CHECK-NEXT: br label [[OMP_REGION_BODY_SPLIT:%.*]]
+; CHECK: omp_region.body.split:
+; CHECK-NEXT: call void @__kmpc_end_master(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: br label [[OMP_REGION_END]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..25
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..26
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..27
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@unmergable_3_seq_call
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..28 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: call void (...) @foo()
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..29 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: call void (...) @foo()
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..30 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..28
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..29
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..30
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@unmergable_3_proc_bind
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..31 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: call void @__kmpc_push_proc_bind(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 [[TMP0]], i32 noundef 3)
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..32 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..33 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..31
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..32
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..33
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@unmergable_3_num_threads
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]])
+; CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..34 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[A_ADDR]], align 4
+; CHECK-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 [[TMP0]], i32 [[TMP1]])
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..35 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..36 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..34
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..35
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..36
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_2_unmergable_1
+; CHECK-SAME: (i32 [[A:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: br label [[OMP_PARALLEL:%.*]]
+; CHECK: omp_parallel:
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @merge_2_unmergable_1..omp_par to void (i32*, i32*, ...)*), i32* [[A_ADDR]])
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]]
+; CHECK: omp.par.outlined.exit:
+; CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]]
+; CHECK: omp.par.exit.split:
+; CHECK-NEXT: br label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK: entry.split.split:
+; CHECK-NEXT: call void (...) @foo()
+; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1]], i32 noundef 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..39 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@merge_2_unmergable_1..omp_par
+; CHECK-SAME: (i32* noalias [[TID_ADDR:%.*]], i32* noalias [[ZERO_ADDR:%.*]], i32* [[A_ADDR:%.*]]) [[ATTR0]] {
+; CHECK-NEXT: omp.par.entry:
+; CHECK-NEXT: [[TID_ADDR_LOCAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[TID_ADDR]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: [[TID:%.*]] = load i32, i32* [[TID_ADDR_LOCAL]], align 4
+; CHECK-NEXT: br label [[OMP_PAR_REGION:%.*]]
+; CHECK: omp.par.outlined.exit.exitStub:
+; CHECK-NEXT: ret void
+; CHECK: omp.par.region:
+; CHECK-NEXT: br label [[OMP_PAR_MERGED:%.*]]
+; CHECK: omp.par.merged:
+; CHECK-NEXT: call void @.omp_outlined..37(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]])
+; CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM]])
+; CHECK-NEXT: call void @.omp_outlined..38(i32* [[TID_ADDR]], i32* [[ZERO_ADDR]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A_ADDR]])
+; CHECK-NEXT: br label [[ENTRY_SPLIT:%.*]]
+; CHECK: entry.split:
+; CHECK-NEXT: br label [[OMP_PAR_REGION_SPLIT:%.*]]
+; CHECK: omp.par.region.split:
+; CHECK-NEXT: br label [[OMP_PAR_PRE_FINALIZE:%.*]]
+; CHECK: omp.par.pre_finalize:
+; CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT_EXITSTUB:%.*]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..37
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..38
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..39
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: call void @use(i32 [[TMP0]])
; CHECK-NEXT: ret void
;