[clang] eb61bde - [OpenMP][CodeGen] Add codegen for combined 'loop' directives.
Dave Pagan via cfe-commits
cfe-commits at lists.llvm.org
Wed Jul 5 10:32:41 PDT 2023
Author: Dave Pagan
Date: 2023-07-05T12:31:59-05:00
New Revision: eb61bde829bcd33346bc70c87fcfe321627bbc31
URL: https://github.com/llvm/llvm-project/commit/eb61bde829bcd33346bc70c87fcfe321627bbc31
DIFF: https://github.com/llvm/llvm-project/commit/eb61bde829bcd33346bc70c87fcfe321627bbc31.diff
LOG: [OpenMP][CodeGen] Add codegen for combined 'loop' directives.
The loop directive is a descriptive construct which allows the compiler
flexibility in how it generates code for the directive's associated
loop(s). See OpenMP specification 5.2 [257:8-9].
Codegen added in this patch for the combined 'loop' directives are:
'target teams loop' -> 'target teams distribute parallel for'
'teams loop' -> 'teams distribute parallel for'
'target parallel loop' -> 'target parallel for'
'parallel loop' -> 'parallel for'
NOTE: The implementation of the 'loop' directive itself is unchanged.
Differential Revision: https://reviews.llvm.org/D145823
Added:
clang/test/OpenMP/generic_loop_codegen.cpp
clang/test/OpenMP/nvptx_target_teams_generic_loop_codegen.cpp
clang/test/OpenMP/nvptx_target_teams_generic_loop_generic_mode_codegen.cpp
clang/test/OpenMP/parallel_generic_loop_codegen.cpp
clang/test/OpenMP/target_parallel_generic_loop_codegen-1.cpp
clang/test/OpenMP/target_parallel_generic_loop_codegen-2.cpp
clang/test/OpenMP/target_parallel_generic_loop_codegen-3.cpp
clang/test/OpenMP/target_parallel_generic_loop_codegen.cpp
clang/test/OpenMP/target_parallel_generic_loop_depend_codegen.cpp
clang/test/OpenMP/target_parallel_generic_loop_uses_allocators_codegen.cpp
clang/test/OpenMP/target_teams_generic_loop_codegen-1.cpp
clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
clang/test/OpenMP/target_teams_generic_loop_collapse_codegen.cpp
clang/test/OpenMP/target_teams_generic_loop_depend_codegen.cpp
clang/test/OpenMP/target_teams_generic_loop_if_codegen.cpp
clang/test/OpenMP/target_teams_generic_loop_order_codegen.cpp
clang/test/OpenMP/target_teams_generic_loop_private_codegen.cpp
clang/test/OpenMP/target_teams_generic_loop_reduction_codegen.cpp
clang/test/OpenMP/target_teams_generic_loop_uses_allocators_codegen.cpp
clang/test/OpenMP/teams_generic_loop_codegen-1.cpp
clang/test/OpenMP/teams_generic_loop_codegen.cpp
clang/test/OpenMP/teams_generic_loop_collapse_codgen.cpp
clang/test/OpenMP/teams_generic_loop_private_codegen.cpp
clang/test/OpenMP/teams_generic_loop_reduction_codegen.cpp
Modified:
clang/lib/AST/StmtOpenMP.cpp
clang/lib/Basic/OpenMPKinds.cpp
clang/lib/CodeGen/CGOpenMPRuntime.cpp
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
clang/lib/CodeGen/CGStmt.cpp
clang/lib/CodeGen/CGStmtOpenMP.cpp
clang/lib/CodeGen/CodeGenFunction.h
clang/lib/Sema/SemaOpenMP.cpp
Removed:
################################################################################
diff --git a/clang/lib/AST/StmtOpenMP.cpp b/clang/lib/AST/StmtOpenMP.cpp
index 7c5b9f23fc260d..a544732bb4c219 100644
--- a/clang/lib/AST/StmtOpenMP.cpp
+++ b/clang/lib/AST/StmtOpenMP.cpp
@@ -2374,6 +2374,10 @@ OMPTeamsGenericLoopDirective *OMPTeamsGenericLoopDirective::Create(
Dir->setNextLowerBound(Exprs.NLB);
Dir->setNextUpperBound(Exprs.NUB);
Dir->setNumIterations(Exprs.NumIterations);
+ Dir->setPrevLowerBoundVariable(Exprs.PrevLB);
+ Dir->setPrevUpperBoundVariable(Exprs.PrevUB);
+ Dir->setDistInc(Exprs.DistInc);
+ Dir->setPrevEnsureUpperBound(Exprs.PrevEUB);
Dir->setCounters(Exprs.Counters);
Dir->setPrivateCounters(Exprs.PrivateCounters);
Dir->setInits(Exprs.Inits);
@@ -2383,6 +2387,15 @@ OMPTeamsGenericLoopDirective *OMPTeamsGenericLoopDirective::Create(
Dir->setDependentInits(Exprs.DependentInits);
Dir->setFinalsConditions(Exprs.FinalsConditions);
Dir->setPreInits(Exprs.PreInits);
+ Dir->setCombinedLowerBoundVariable(Exprs.DistCombinedFields.LB);
+ Dir->setCombinedUpperBoundVariable(Exprs.DistCombinedFields.UB);
+ Dir->setCombinedEnsureUpperBound(Exprs.DistCombinedFields.EUB);
+ Dir->setCombinedInit(Exprs.DistCombinedFields.Init);
+ Dir->setCombinedCond(Exprs.DistCombinedFields.Cond);
+ Dir->setCombinedNextLowerBound(Exprs.DistCombinedFields.NLB);
+ Dir->setCombinedNextUpperBound(Exprs.DistCombinedFields.NUB);
+ Dir->setCombinedDistCond(Exprs.DistCombinedFields.DistCond);
+ Dir->setCombinedParForInDistCond(Exprs.DistCombinedFields.ParForInDistCond);
return Dir;
}
@@ -2418,6 +2431,10 @@ OMPTargetTeamsGenericLoopDirective *OMPTargetTeamsGenericLoopDirective::Create(
Dir->setNextLowerBound(Exprs.NLB);
Dir->setNextUpperBound(Exprs.NUB);
Dir->setNumIterations(Exprs.NumIterations);
+ Dir->setPrevLowerBoundVariable(Exprs.PrevLB);
+ Dir->setPrevUpperBoundVariable(Exprs.PrevUB);
+ Dir->setDistInc(Exprs.DistInc);
+ Dir->setPrevEnsureUpperBound(Exprs.PrevEUB);
Dir->setCounters(Exprs.Counters);
Dir->setPrivateCounters(Exprs.PrivateCounters);
Dir->setInits(Exprs.Inits);
@@ -2427,6 +2444,15 @@ OMPTargetTeamsGenericLoopDirective *OMPTargetTeamsGenericLoopDirective::Create(
Dir->setDependentInits(Exprs.DependentInits);
Dir->setFinalsConditions(Exprs.FinalsConditions);
Dir->setPreInits(Exprs.PreInits);
+ Dir->setCombinedLowerBoundVariable(Exprs.DistCombinedFields.LB);
+ Dir->setCombinedUpperBoundVariable(Exprs.DistCombinedFields.UB);
+ Dir->setCombinedEnsureUpperBound(Exprs.DistCombinedFields.EUB);
+ Dir->setCombinedInit(Exprs.DistCombinedFields.Init);
+ Dir->setCombinedCond(Exprs.DistCombinedFields.Cond);
+ Dir->setCombinedNextLowerBound(Exprs.DistCombinedFields.NLB);
+ Dir->setCombinedNextUpperBound(Exprs.DistCombinedFields.NUB);
+ Dir->setCombinedDistCond(Exprs.DistCombinedFields.DistCond);
+ Dir->setCombinedParForInDistCond(Exprs.DistCombinedFields.ParForInDistCond);
return Dir;
}
diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp
index 48c9c2189a938a..1c59a9091b4a4f 100644
--- a/clang/lib/Basic/OpenMPKinds.cpp
+++ b/clang/lib/Basic/OpenMPKinds.cpp
@@ -603,7 +603,9 @@ bool clang::isOpenMPWorksharingDirective(OpenMPDirectiveKind DKind) {
DKind == OMPD_teams_distribute_parallel_for_simd ||
DKind == OMPD_teams_distribute_parallel_for ||
DKind == OMPD_target_teams_distribute_parallel_for ||
- DKind == OMPD_target_teams_distribute_parallel_for_simd;
+ DKind == OMPD_target_teams_distribute_parallel_for_simd ||
+ DKind == OMPD_parallel_loop || DKind == OMPD_teams_loop ||
+ DKind == OMPD_target_parallel_loop || DKind == OMPD_target_teams_loop;
}
bool clang::isOpenMPTaskLoopDirective(OpenMPDirectiveKind DKind) {
@@ -632,7 +634,8 @@ bool clang::isOpenMPParallelDirective(OpenMPDirectiveKind DKind) {
DKind == OMPD_parallel_master_taskloop_simd ||
DKind == OMPD_parallel_masked_taskloop ||
DKind == OMPD_parallel_masked_taskloop_simd ||
- DKind == OMPD_parallel_loop || DKind == OMPD_target_parallel_loop;
+ DKind == OMPD_parallel_loop || DKind == OMPD_target_parallel_loop ||
+ DKind == OMPD_teams_loop;
}
bool clang::isOpenMPTargetExecutionDirective(OpenMPDirectiveKind DKind) {
@@ -729,7 +732,8 @@ bool clang::isOpenMPLoopBoundSharingDirective(OpenMPDirectiveKind Kind) {
Kind == OMPD_teams_distribute_parallel_for_simd ||
Kind == OMPD_teams_distribute_parallel_for ||
Kind == OMPD_target_teams_distribute_parallel_for ||
- Kind == OMPD_target_teams_distribute_parallel_for_simd;
+ Kind == OMPD_target_teams_distribute_parallel_for_simd ||
+ Kind == OMPD_teams_loop || Kind == OMPD_target_teams_loop;
}
bool clang::isOpenMPLoopTransformationDirective(OpenMPDirectiveKind DKind) {
@@ -766,7 +770,6 @@ void clang::getOpenMPCaptureRegions(
case OMPD_target_teams:
case OMPD_target_teams_distribute:
case OMPD_target_teams_distribute_simd:
- case OMPD_target_teams_loop:
CaptureRegions.push_back(OMPD_task);
CaptureRegions.push_back(OMPD_target);
CaptureRegions.push_back(OMPD_teams);
@@ -781,6 +784,7 @@ void clang::getOpenMPCaptureRegions(
CaptureRegions.push_back(OMPD_task);
CaptureRegions.push_back(OMPD_target);
break;
+ case OMPD_teams_loop:
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd:
CaptureRegions.push_back(OMPD_teams);
@@ -815,6 +819,7 @@ void clang::getOpenMPCaptureRegions(
CaptureRegions.push_back(OMPD_parallel);
CaptureRegions.push_back(OMPD_taskloop);
break;
+ case OMPD_target_teams_loop:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
CaptureRegions.push_back(OMPD_task);
@@ -822,9 +827,6 @@ void clang::getOpenMPCaptureRegions(
CaptureRegions.push_back(OMPD_teams);
CaptureRegions.push_back(OMPD_parallel);
break;
- case OMPD_teams_loop:
- CaptureRegions.push_back(OMPD_teams);
- break;
case OMPD_nothing:
CaptureRegions.push_back(OMPD_nothing);
break;
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 180f3265ab1938..4c32dc1fdb81bd 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -2812,7 +2812,7 @@ void CGOpenMPRuntime::emitForStaticInit(CodeGenFunction &CGF,
const StaticRTInput &Values) {
OpenMPSchedType ScheduleNum = getRuntimeSchedule(
ScheduleKind.Schedule, Values.Chunk != nullptr, Values.Ordered);
- assert(isOpenMPWorksharingDirective(DKind) &&
+ assert((isOpenMPWorksharingDirective(DKind) || (DKind == OMPD_loop)) &&
"Expected loop-based or sections-based directive.");
llvm::Value *UpdatedLocation = emitUpdateLocation(CGF, Loc,
isOpenMPLoopDirective(DKind)
@@ -6206,6 +6206,7 @@ const Expr *CGOpenMPRuntime::getNumTeamsExprForTargetDirective(
DefaultVal = -1;
return nullptr;
}
+ case OMPD_target_teams_loop:
case OMPD_target_teams:
case OMPD_target_teams_distribute:
case OMPD_target_teams_distribute_simd:
@@ -6225,12 +6226,14 @@ const Expr *CGOpenMPRuntime::getNumTeamsExprForTargetDirective(
case OMPD_target_parallel:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
+ case OMPD_target_parallel_loop:
case OMPD_target_simd:
DefaultVal = 1;
return nullptr;
case OMPD_parallel:
case OMPD_for:
case OMPD_parallel_for:
+ case OMPD_parallel_loop:
case OMPD_parallel_master:
case OMPD_parallel_sections:
case OMPD_for_simd:
@@ -6447,6 +6450,8 @@ const Expr *CGOpenMPRuntime::getNumThreadsExprForTargetDirective(
return ThreadLimit;
}
return nullptr;
+ case OMPD_target_teams_loop:
+ case OMPD_target_parallel_loop:
case OMPD_target_parallel:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
@@ -6649,6 +6654,8 @@ llvm::Value *CGOpenMPRuntime::emitNumThreadsForTargetDirective(
getNumThreads(CGF, D.getInnermostCapturedStmt(), ThreadLimitVal))
return NumThreads;
return Bld.getInt32(0);
+ case OMPD_target_teams_loop:
+ case OMPD_target_parallel_loop:
case OMPD_target_parallel:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
@@ -9072,7 +9079,8 @@ getNestedDistributeDirective(ASTContext &Ctx, const OMPExecutableDirective &D) {
OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
switch (D.getDirectiveKind()) {
case OMPD_target:
- if (isOpenMPDistributeDirective(DKind))
+ // For now, just treat 'target teams loop' as if it's distributed.
+ if (isOpenMPDistributeDirective(DKind) || DKind == OMPD_teams_loop)
return NestedDir;
if (DKind == OMPD_teams) {
Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
@@ -9556,7 +9564,8 @@ llvm::Value *CGOpenMPRuntime::emitTargetNumIterationsCall(
OpenMPDirectiveKind Kind = D.getDirectiveKind();
const OMPExecutableDirective *TD = &D;
// Get nested teams distribute kind directive, if any.
- if (!isOpenMPDistributeDirective(Kind) || !isOpenMPTeamsDirective(Kind))
+ if ((!isOpenMPDistributeDirective(Kind) || !isOpenMPTeamsDirective(Kind)) &&
+ Kind != OMPD_target_teams_loop)
TD = getNestedDistributeDirective(CGM.getContext(), D);
if (!TD)
return llvm::ConstantInt::get(CGF.Int64Ty, 0);
@@ -9945,6 +9954,14 @@ void CGOpenMPRuntime::scanForTargetRegionsFunctions(const Stmt *S,
CGM, ParentName,
cast<OMPTargetTeamsDistributeParallelForSimdDirective>(E));
break;
+ case OMPD_target_teams_loop:
+ CodeGenFunction::EmitOMPTargetTeamsGenericLoopDeviceFunction(
+ CGM, ParentName, cast<OMPTargetTeamsGenericLoopDirective>(E));
+ break;
+ case OMPD_target_parallel_loop:
+ CodeGenFunction::EmitOMPTargetParallelGenericLoopDeviceFunction(
+ CGM, ParentName, cast<OMPTargetParallelGenericLoopDirective>(E));
+ break;
case OMPD_parallel:
case OMPD_for:
case OMPD_parallel_for:
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index c1b27ad47d59b8..7fa92068f45cd5 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -653,6 +653,8 @@ static bool supportsSPMDExecutionMode(ASTContext &Ctx,
case OMPD_target:
case OMPD_target_teams:
return hasNestedSPMDDirective(Ctx, D);
+ case OMPD_target_teams_loop:
+ case OMPD_target_parallel_loop:
case OMPD_target_parallel:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index bdcf5dc162055b..2184b8600d764c 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -416,16 +416,19 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef<const Attr *> Attrs) {
EmitOMPGenericLoopDirective(cast<OMPGenericLoopDirective>(*S));
break;
case Stmt::OMPTeamsGenericLoopDirectiveClass:
- llvm_unreachable("teams loop directive not supported yet.");
+ EmitOMPTeamsGenericLoopDirective(cast<OMPTeamsGenericLoopDirective>(*S));
break;
case Stmt::OMPTargetTeamsGenericLoopDirectiveClass:
- llvm_unreachable("target teams loop directive not supported yet.");
+ EmitOMPTargetTeamsGenericLoopDirective(
+ cast<OMPTargetTeamsGenericLoopDirective>(*S));
break;
case Stmt::OMPParallelGenericLoopDirectiveClass:
- llvm_unreachable("parallel loop directive not supported yet.");
+ EmitOMPParallelGenericLoopDirective(
+ cast<OMPParallelGenericLoopDirective>(*S));
break;
case Stmt::OMPTargetParallelGenericLoopDirectiveClass:
- llvm_unreachable("target parallel loop directive not supported yet.");
+ EmitOMPTargetParallelGenericLoopDirective(
+ cast<OMPTargetParallelGenericLoopDirective>(*S));
break;
case Stmt::OMPParallelMaskedDirectiveClass:
EmitOMPParallelMaskedDirective(cast<OMPParallelMaskedDirective>(*S));
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 347ec1f4dd847a..d4198965a90ea7 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -7852,6 +7852,148 @@ void CodeGenFunction::EmitOMPGenericLoopDirective(
CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_loop, CodeGen);
}
+void CodeGenFunction::EmitOMPParallelGenericLoopDirective(
+ const OMPLoopDirective &S) {
+ // Emit combined directive as if its constituent constructs are 'parallel'
+ // and 'for'.
+ auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+ Action.Enter(CGF);
+ emitOMPCopyinClause(CGF, S);
+ (void)emitWorksharingDirective(CGF, S, /*HasCancel=*/false);
+ };
+ {
+ auto LPCRegion =
+ CGOpenMPRuntime::LastprivateConditionalRAII::disable(*this, S);
+ emitCommonOMPParallelDirective(*this, S, OMPD_for, CodeGen,
+ emitEmptyBoundParameters);
+ }
+ // Check for outer lastprivate conditional update.
+ checkForLastprivateConditionalUpdate(*this, S);
+}
+
+void CodeGenFunction::EmitOMPTeamsGenericLoopDirective(
+ const OMPTeamsGenericLoopDirective &S) {
+ // To be consistent with current behavior of 'target teams loop', emit
+ // 'teams loop' as if its constituent constructs are 'distribute',
+ // 'parallel', and 'for'.
+ auto &&CodeGenDistribute = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
+ CGF.EmitOMPDistributeLoop(S, emitInnerParallelForWhenCombined,
+ S.getDistInc());
+ };
+
+ // Emit teams region as a standalone region.
+ auto &&CodeGen = [&S, &CodeGenDistribute](CodeGenFunction &CGF,
+ PrePostActionTy &Action) {
+ Action.Enter(CGF);
+ OMPPrivateScope PrivateScope(CGF);
+ CGF.EmitOMPReductionClauseInit(S, PrivateScope);
+ (void)PrivateScope.Privatize();
+ CGF.CGM.getOpenMPRuntime().emitInlinedDirective(CGF, OMPD_distribute,
+ CodeGenDistribute);
+ CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams);
+ };
+ emitCommonOMPTeamsDirective(*this, S, OMPD_distribute_parallel_for, CodeGen);
+ emitPostUpdateForReductionClause(*this, S,
+ [](CodeGenFunction &) { return nullptr; });
+}
+
+static void
+emitTargetTeamsGenericLoopRegion(CodeGenFunction &CGF,
+ const OMPTargetTeamsGenericLoopDirective &S,
+ PrePostActionTy &Action) {
+ Action.Enter(CGF);
+ // Emit 'teams loop' as if its constituent constructs are 'distribute',
+ // 'parallel', and 'for'.
+ auto &&CodeGenDistribute = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
+ CGF.EmitOMPDistributeLoop(S, emitInnerParallelForWhenCombined,
+ S.getDistInc());
+ };
+
+ // Emit teams region as a standalone region.
+ auto &&CodeGenTeams = [&S, &CodeGenDistribute](CodeGenFunction &CGF,
+ PrePostActionTy &Action) {
+ Action.Enter(CGF);
+ CodeGenFunction::OMPPrivateScope PrivateScope(CGF);
+ CGF.EmitOMPReductionClauseInit(S, PrivateScope);
+ (void)PrivateScope.Privatize();
+ CGF.CGM.getOpenMPRuntime().emitInlinedDirective(
+ CGF, OMPD_distribute, CodeGenDistribute, /*HasCancel=*/false);
+ CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams);
+ };
+
+ emitCommonOMPTeamsDirective(CGF, S, OMPD_distribute_parallel_for,
+ CodeGenTeams);
+ emitPostUpdateForReductionClause(CGF, S,
+ [](CodeGenFunction &) { return nullptr; });
+}
+
+/// Emit combined directive 'target teams loop' as if its constituent
+/// constructs are 'target', 'teams', 'distribute', 'parallel', and 'for'.
+void CodeGenFunction::EmitOMPTargetTeamsGenericLoopDirective(
+ const OMPTargetTeamsGenericLoopDirective &S) {
+ auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+ emitTargetTeamsGenericLoopRegion(CGF, S, Action);
+ };
+ emitCommonOMPTargetDirective(*this, S, CodeGen);
+}
+
+void CodeGenFunction::EmitOMPTargetTeamsGenericLoopDeviceFunction(
+ CodeGenModule &CGM, StringRef ParentName,
+ const OMPTargetTeamsGenericLoopDirective &S) {
+ // Emit SPMD target parallel loop region as a standalone region.
+ auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+ emitTargetTeamsGenericLoopRegion(CGF, S, Action);
+ };
+ llvm::Function *Fn;
+ llvm::Constant *Addr;
+ // Emit target region as a standalone region.
+ CGM.getOpenMPRuntime().emitTargetOutlinedFunction(
+ S, ParentName, Fn, Addr, /*IsOffloadEntry=*/true, CodeGen);
+ assert(Fn && Addr &&
+ "Target device function emission failed for 'target teams loop'.");
+}
+
+static void emitTargetParallelGenericLoopRegion(
+ CodeGenFunction &CGF, const OMPTargetParallelGenericLoopDirective &S,
+ PrePostActionTy &Action) {
+ Action.Enter(CGF);
+ // Emit as 'parallel for'.
+ auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+ Action.Enter(CGF);
+ CodeGenFunction::OMPCancelStackRAII CancelRegion(
+ CGF, OMPD_target_parallel_loop, /*hasCancel=*/false);
+ CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(), emitForLoopBounds,
+ emitDispatchForLoopBounds);
+ };
+ emitCommonOMPParallelDirective(CGF, S, OMPD_for, CodeGen,
+ emitEmptyBoundParameters);
+}
+
+void CodeGenFunction::EmitOMPTargetParallelGenericLoopDeviceFunction(
+ CodeGenModule &CGM, StringRef ParentName,
+ const OMPTargetParallelGenericLoopDirective &S) {
+ // Emit target parallel loop region as a standalone region.
+ auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+ emitTargetParallelGenericLoopRegion(CGF, S, Action);
+ };
+ llvm::Function *Fn;
+ llvm::Constant *Addr;
+ // Emit target region as a standalone region.
+ CGM.getOpenMPRuntime().emitTargetOutlinedFunction(
+ S, ParentName, Fn, Addr, /*IsOffloadEntry=*/true, CodeGen);
+ assert(Fn && Addr && "Target device function emission failed.");
+}
+
+/// Emit combined directive 'target parallel loop' as if its constituent
+/// constructs are 'target', 'parallel', and 'for'.
+void CodeGenFunction::EmitOMPTargetParallelGenericLoopDirective(
+ const OMPTargetParallelGenericLoopDirective &S) {
+ auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+ emitTargetParallelGenericLoopRegion(CGF, S, Action);
+ };
+ emitCommonOMPTargetDirective(*this, S, CodeGen);
+}
+
void CodeGenFunction::EmitSimpleOMPExecutableDirective(
const OMPExecutableDirective &D) {
if (const auto *SD = dyn_cast<OMPScanDirective>(&D)) {
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 1ff6528cb38b74..688a8f2ab63e46 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3580,6 +3580,12 @@ class CodeGenFunction : public CodeGenTypeCache {
void EmitOMPTargetTeamsDistributeSimdDirective(
const OMPTargetTeamsDistributeSimdDirective &S);
void EmitOMPGenericLoopDirective(const OMPGenericLoopDirective &S);
+ void EmitOMPParallelGenericLoopDirective(const OMPLoopDirective &S);
+ void EmitOMPTargetParallelGenericLoopDirective(
+ const OMPTargetParallelGenericLoopDirective &S);
+ void EmitOMPTargetTeamsGenericLoopDirective(
+ const OMPTargetTeamsGenericLoopDirective &S);
+ void EmitOMPTeamsGenericLoopDirective(const OMPTeamsGenericLoopDirective &S);
void EmitOMPInteropDirective(const OMPInteropDirective &S);
void EmitOMPParallelMaskedDirective(const OMPParallelMaskedDirective &S);
@@ -3620,6 +3626,16 @@ class CodeGenFunction : public CodeGenTypeCache {
CodeGenModule &CGM, StringRef ParentName,
const OMPTargetTeamsDistributeParallelForSimdDirective &S);
+ /// Emit device code for the target teams loop directive.
+ static void EmitOMPTargetTeamsGenericLoopDeviceFunction(
+ CodeGenModule &CGM, StringRef ParentName,
+ const OMPTargetTeamsGenericLoopDirective &S);
+
+ /// Emit device code for the target parallel loop directive.
+ static void EmitOMPTargetParallelGenericLoopDeviceFunction(
+ CodeGenModule &CGM, StringRef ParentName,
+ const OMPTargetParallelGenericLoopDirective &S);
+
static void EmitOMPTargetTeamsDistributeParallelForDeviceFunction(
CodeGenModule &CGM, StringRef ParentName,
const OMPTargetTeamsDistributeParallelForDirective &S);
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 2d7d6cf81732a9..17cedd6061c8aa 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -4199,7 +4199,6 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) {
case OMPD_target_parallel:
case OMPD_target_parallel_for:
case OMPD_target_parallel_for_simd:
- case OMPD_target_teams_loop:
case OMPD_target_parallel_loop:
case OMPD_target_teams_distribute:
case OMPD_target_teams_distribute_simd: {
@@ -4448,6 +4447,7 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) {
Params);
break;
}
+ case OMPD_target_teams_loop:
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd: {
QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst();
@@ -4506,22 +4506,7 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) {
break;
}
- case OMPD_teams_loop: {
- QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst();
- QualType KmpInt32PtrTy =
- Context.getPointerType(KmpInt32Ty).withConst().withRestrict();
-
- Sema::CapturedParamNameType ParamsTeams[] = {
- std::make_pair(".global_tid.", KmpInt32PtrTy),
- std::make_pair(".bound_tid.", KmpInt32PtrTy),
- std::make_pair(StringRef(), QualType()) // __context with shared vars
- };
- // Start a captured region for 'teams'.
- ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP,
- ParamsTeams, /*OpenMPCaptureLevel=*/0);
- break;
- }
-
+ case OMPD_teams_loop:
case OMPD_teams_distribute_parallel_for:
case OMPD_teams_distribute_parallel_for_simd: {
QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst();
@@ -15381,6 +15366,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause(
break;
}
[[fallthrough]];
+ case OMPD_target_teams_loop:
case OMPD_target_teams_distribute_parallel_for:
// If this clause applies to the nested 'parallel' region, capture within
// the 'teams' region, otherwise do not capture.
@@ -15473,7 +15459,6 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause(
case OMPD_target:
case OMPD_target_teams:
case OMPD_target_teams_distribute:
- case OMPD_target_teams_loop:
case OMPD_distribute_parallel_for:
case OMPD_task:
case OMPD_taskloop:
diff --git a/clang/test/OpenMP/generic_loop_codegen.cpp b/clang/test/OpenMP/generic_loop_codegen.cpp
new file mode 100644
index 00000000000000..e73f457ca583cd
--- /dev/null
+++ b/clang/test/OpenMP/generic_loop_codegen.cpp
@@ -0,0 +1,117 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]"
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp %s
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -emit-llvm %s -o - | FileCheck %s --check-prefix=IR
+
+// Check same results after serialization round-trip
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -emit-pch -o %t %s
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR-PCH
+
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+void foo(int t) {
+
+ int i, j, z;
+ #pragma omp loop collapse(2) reduction(+:z) lastprivate(j) bind(thread)
+ for (int i = 0; i<t; ++i)
+ for (j = 0; j<t; ++j)
+ z += i+j;
+}
+#endif
+// IR-LABEL: define {{[^@]+}}@_Z3fooi
+// IR-SAME: (i32 noundef [[T:%.*]]) #[[ATTR0:[0-9]+]] {
+// IR-NEXT: entry:
+// IR-NEXT: [[T_ADDR:%.*]] = alloca i32, align 4
+// IR-NEXT: [[I:%.*]] = alloca i32, align 4
+// IR-NEXT: [[J:%.*]] = alloca i32, align 4
+// IR-NEXT: [[Z:%.*]] = alloca i32, align 4
+// IR-NEXT: [[I1:%.*]] = alloca i32, align 4
+// IR-NEXT: store i32 [[T]], ptr [[T_ADDR]], align 4
+// IR-NEXT: store i32 0, ptr [[I1]], align 4
+// IR-NEXT: br label [[FOR_COND:%.*]]
+// IR: for.cond:
+// IR-NEXT: [[TMP0:%.*]] = load i32, ptr [[I1]], align 4
+// IR-NEXT: [[TMP1:%.*]] = load i32, ptr [[T_ADDR]], align 4
+// IR-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP1]]
+// IR-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END8:%.*]]
+// IR: for.body:
+// IR-NEXT: store i32 0, ptr [[J]], align 4
+// IR-NEXT: br label [[FOR_COND2:%.*]]
+// IR: for.cond2:
+// IR-NEXT: [[TMP2:%.*]] = load i32, ptr [[J]], align 4
+// IR-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_ADDR]], align 4
+// IR-NEXT: [[CMP3:%.*]] = icmp slt i32 [[TMP2]], [[TMP3]]
+// IR-NEXT: br i1 [[CMP3]], label [[FOR_BODY4:%.*]], label [[FOR_END:%.*]]
+// IR: for.body4:
+// IR-NEXT: [[TMP4:%.*]] = load i32, ptr [[I1]], align 4
+// IR-NEXT: [[TMP5:%.*]] = load i32, ptr [[J]], align 4
+// IR-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], [[TMP5]]
+// IR-NEXT: [[TMP6:%.*]] = load i32, ptr [[Z]], align 4
+// IR-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP6]], [[ADD]]
+// IR-NEXT: store i32 [[ADD5]], ptr [[Z]], align 4
+// IR-NEXT: br label [[FOR_INC:%.*]]
+// IR: for.inc:
+// IR-NEXT: [[TMP7:%.*]] = load i32, ptr [[J]], align 4
+// IR-NEXT: [[INC:%.*]] = add nsw i32 [[TMP7]], 1
+// IR-NEXT: store i32 [[INC]], ptr [[J]], align 4
+// IR-NEXT: br label [[FOR_COND2]], !llvm.loop [[LOOP3:![0-9]+]]
+// IR: for.end:
+// IR-NEXT: br label [[FOR_INC6:%.*]]
+// IR: for.inc6:
+// IR-NEXT: [[TMP8:%.*]] = load i32, ptr [[I1]], align 4
+// IR-NEXT: [[INC7:%.*]] = add nsw i32 [[TMP8]], 1
+// IR-NEXT: store i32 [[INC7]], ptr [[I1]], align 4
+// IR-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
+// IR: for.end8:
+// IR-NEXT: ret void
+//
+//
+// IR-PCH-LABEL: define {{[^@]+}}@_Z3fooi
+// IR-PCH-SAME: (i32 noundef [[T:%.*]]) #[[ATTR0:[0-9]+]] {
+// IR-PCH-NEXT: entry:
+// IR-PCH-NEXT: [[T_ADDR:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[I:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[J:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[Z:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[I1:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: store i32 [[T]], ptr [[T_ADDR]], align 4
+// IR-PCH-NEXT: store i32 0, ptr [[I1]], align 4
+// IR-PCH-NEXT: br label [[FOR_COND:%.*]]
+// IR-PCH: for.cond:
+// IR-PCH-NEXT: [[TMP0:%.*]] = load i32, ptr [[I1]], align 4
+// IR-PCH-NEXT: [[TMP1:%.*]] = load i32, ptr [[T_ADDR]], align 4
+// IR-PCH-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP1]]
+// IR-PCH-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END8:%.*]]
+// IR-PCH: for.body:
+// IR-PCH-NEXT: store i32 0, ptr [[J]], align 4
+// IR-PCH-NEXT: br label [[FOR_COND2:%.*]]
+// IR-PCH: for.cond2:
+// IR-PCH-NEXT: [[TMP2:%.*]] = load i32, ptr [[J]], align 4
+// IR-PCH-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_ADDR]], align 4
+// IR-PCH-NEXT: [[CMP3:%.*]] = icmp slt i32 [[TMP2]], [[TMP3]]
+// IR-PCH-NEXT: br i1 [[CMP3]], label [[FOR_BODY4:%.*]], label [[FOR_END:%.*]]
+// IR-PCH: for.body4:
+// IR-PCH-NEXT: [[TMP4:%.*]] = load i32, ptr [[I1]], align 4
+// IR-PCH-NEXT: [[TMP5:%.*]] = load i32, ptr [[J]], align 4
+// IR-PCH-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], [[TMP5]]
+// IR-PCH-NEXT: [[TMP6:%.*]] = load i32, ptr [[Z]], align 4
+// IR-PCH-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP6]], [[ADD]]
+// IR-PCH-NEXT: store i32 [[ADD5]], ptr [[Z]], align 4
+// IR-PCH-NEXT: br label [[FOR_INC:%.*]]
+// IR-PCH: for.inc:
+// IR-PCH-NEXT: [[TMP7:%.*]] = load i32, ptr [[J]], align 4
+// IR-PCH-NEXT: [[INC:%.*]] = add nsw i32 [[TMP7]], 1
+// IR-PCH-NEXT: store i32 [[INC]], ptr [[J]], align 4
+// IR-PCH-NEXT: br label [[FOR_COND2]], !llvm.loop [[LOOP3:![0-9]+]]
+// IR-PCH: for.end:
+// IR-PCH-NEXT: br label [[FOR_INC6:%.*]]
+// IR-PCH: for.inc6:
+// IR-PCH-NEXT: [[TMP8:%.*]] = load i32, ptr [[I1]], align 4
+// IR-PCH-NEXT: [[INC7:%.*]] = add nsw i32 [[TMP8]], 1
+// IR-PCH-NEXT: store i32 [[INC7]], ptr [[I1]], align 4
+// IR-PCH-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
+// IR-PCH: for.end8:
+// IR-PCH-NEXT: ret void
+//
diff --git a/clang/test/OpenMP/nvptx_target_teams_generic_loop_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_generic_loop_codegen.cpp
new file mode 100644
index 00000000000000..822ad520171db6
--- /dev/null
+++ b/clang/test/OpenMP/nvptx_target_teams_generic_loop_codegen.cpp
@@ -0,0 +1,3746 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix=CHECK2
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK3
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK3
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+#define N 1000
+#define M 10
+
+template<typename tx>
+tx ftemplate(int n) {
+ tx a[N];
+ short aa[N];
+ tx b[10];
+ tx c[M][M];
+ tx f = n;
+ tx l;
+ int k;
+ tx *v;
+
+#pragma omp target teams loop map(tofrom: aa) num_teams(M) thread_limit(64)
+ for(int i = 0; i < n; i++) {
+ aa[i] += 1;
+ }
+
+#pragma omp target teams loop map(tofrom:a, aa, b) if(target: n>40)
+ for(int i = 0; i < 10; i++) {
+ b[i] += 1;
+ }
+
+#pragma omp target teams loop collapse(2) firstprivate(f) private(k)
+ for(int i = 0; i < M; i++) {
+ for(int j = 0; j < M; j++) {
+ k = M;
+ c[i][j] = i + j * f + k;
+ }
+ }
+
+#pragma omp target teams loop collapse(2)
+ for(int i = 0; i < n; i++) {
+ for(int j = 0; j < n; j++) {
+ c[i][j] = i + j;
+ }
+ }
+
+#pragma omp target teams loop map(a, v[:N])
+ for(int i = 0; i < n; i++)
+ a[i] = v[i];
+ return a[0];
+}
+
+int bar(int n){
+ int a = 0;
+
+ a += ftemplate<int>(n);
+
+ return a;
+}
+
+#endif
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28
+// CHECK1-SAME: (i64 noundef [[N:%.*]], ptr noundef nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[AA]], ptr [[AA_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK1: user_code.entry:
+// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], i64 [[TMP4]], ptr [[TMP0]]) #[[ATTR2:[0-9]+]]
+// CHECK1-NEXT: call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK1-NEXT: ret void
+// CHECK1: worker.exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28_omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[AA]], ptr [[AA_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK1-NEXT: call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP6]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP18]], ptr [[N_CASTED]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to ptr
+// CHECK1-NEXT: store ptr [[TMP21]], ptr [[TMP20]], align 8
+// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK1-NEXT: store ptr [[TMP23]], ptr [[TMP22]], align 8
+// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK1-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK1-NEXT: store ptr [[TMP25]], ptr [[TMP24]], align 8
+// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK1-NEXT: store ptr [[TMP0]], ptr [[TMP26]], align 8
+// CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
+// CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP28]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4)
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]]
+// CHECK1-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
+// CHECK1-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK1-NEXT: store i32 [[ADD8]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
+// CHECK1-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK1: cond.true10:
+// CHECK1-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: br label [[COND_END12:%.*]]
+// CHECK1: cond.false11:
+// CHECK1-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END12]]
+// CHECK1: cond.end12:
+// CHECK1-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ]
+// CHECK1-NEXT: store i32 [[COND13]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP39]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP40:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[TMP40]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3:[0-9]+]], i32 [[TMP41]])
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28_omp_outlined_omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[AA]], ptr [[AA_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK1-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3]], i32 [[TMP8]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP9]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[CONV5:%.*]] = sext i32 [[TMP10]] to i64
+// CHECK1-NEXT: [[TMP11:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CMP6:%.*]] = icmp ule i64 [[CONV5]], [[TMP11]]
+// CHECK1-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I4]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[I4]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK1-NEXT: [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+// CHECK1-NEXT: [[CONV7:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1
+// CHECK1-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16
+// CHECK1-NEXT: store i16 [[CONV9]], ptr [[ARRAYIDX]], align 2
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK1-NEXT: store i32 [[ADD10]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP18]])
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK1: user_code.entry:
+// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[TMP0]]) #[[ATTR2]]
+// CHECK1-NEXT: call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK1-NEXT: ret void
+// CHECK1: worker.exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33_omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x ptr], align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 9, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK1-NEXT: call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP2]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK1-NEXT: store ptr [[TMP12]], ptr [[TMP11]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK1-NEXT: store ptr [[TMP14]], ptr [[TMP13]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK1-NEXT: store ptr [[TMP0]], ptr [[TMP15]], align 8
+// CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 3)
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK1-NEXT: store i32 [[ADD2]], ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK1-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9
+// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK1: cond.true5:
+// CHECK1-NEXT: br label [[COND_END7:%.*]]
+// CHECK1: cond.false6:
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END7]]
+// CHECK1: cond.end7:
+// CHECK1-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ]
+// CHECK1-NEXT: store i32 [[COND8]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP24]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP2]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33_omp_outlined_omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3]], i32 [[TMP4]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK1-NEXT: [[TMP7:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK1-NEXT: store i32 [[ADD3]], ptr [[ARRAYIDX]], align 4
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK1-NEXT: store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP4]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(400) [[C:%.*]], i64 noundef [[F:%.*]]) #[[ATTR4]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[F]], ptr [[F_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK1: user_code.entry:
+// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[F_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP3]], ptr [[F_CASTED]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[F_CASTED]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[TMP0]], i64 [[TMP4]]) #[[ATTR2]]
+// CHECK1-NEXT: call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK1-NEXT: ret void
+// CHECK1: worker.exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38_omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[C:%.*]], i64 noundef [[F:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[F]], ptr [[F_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK1-NEXT: call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP2]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[F_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP11]], ptr [[F_CASTED]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i64, ptr [[F_CASTED]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK1-NEXT: store ptr [[TMP14]], ptr [[TMP13]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK1-NEXT: store ptr [[TMP16]], ptr [[TMP15]], align 8
+// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK1-NEXT: store ptr [[TMP0]], ptr [[TMP17]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK1-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP12]] to ptr
+// CHECK1-NEXT: store ptr [[TMP19]], ptr [[TMP18]], align 8
+// CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4)
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK1-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK1-NEXT: store i32 [[ADD4]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99
+// CHECK1-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]]
+// CHECK1: cond.true6:
+// CHECK1-NEXT: br label [[COND_END8:%.*]]
+// CHECK1: cond.false7:
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END8]]
+// CHECK1: cond.end8:
+// CHECK1-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ]
+// CHECK1-NEXT: store i32 [[COND9]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP28]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP2]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38_omp_outlined_omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[C:%.*]], i64 noundef [[F:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[F]], ptr [[F_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3]], i32 [[TMP4]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[CONV3:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK1-NEXT: [[TMP7:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV3]], [[TMP7]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[DIV4:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK1-NEXT: [[MUL5:%.*]] = mul nsw i32 [[DIV4]], 10
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL5]]
+// CHECK1-NEXT: [[MUL6:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 0, [[MUL6]]
+// CHECK1-NEXT: store i32 [[ADD7]], ptr [[J]], align 4
+// CHECK1-NEXT: store i32 10, ptr [[K]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[F_ADDR]], align 4
+// CHECK1-NEXT: [[MUL8:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP11]], [[MUL8]]
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[K]], align 4
+// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD9]], [[TMP14]]
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT: [[IDXPROM11:%.*]] = sext i32 [[TMP16]] to i64
+// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
+// CHECK1-NEXT: store i32 [[ADD10]], ptr [[ARRAYIDX12]], align 4
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK1-NEXT: store i32 [[ADD13]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP4]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46
+// CHECK1-SAME: (i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR4]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK1: user_code.entry:
+// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], i64 [[TMP4]], ptr [[TMP0]]) #[[ATTR2]]
+// CHECK1-NEXT: call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK1-NEXT: ret void
+// CHECK1: worker.exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I9:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J10:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK1-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK1-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK1-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK1-NEXT: store i64 [[SUB7]], ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: land.lhs.true:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK1-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i64 0, ptr [[DOTOMP_COMB_LB]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: store i64 [[TMP7]], ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: store i64 1, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK1-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64
+// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4
+// CHECK1-NEXT: call void @__kmpc_distribute_static_init_8(ptr @[[GLOB2]], i32 [[TMP9]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]])
+// CHECK1-NEXT: [[TMP10:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]]
+// CHECK1-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP13:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i64 [[COND]], ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTOMP_COMB_LB]], align 8
+// CHECK1-NEXT: store i64 [[TMP14]], ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: [[TMP16:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1
+// CHECK1-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]]
+// CHECK1-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTOMP_COMB_LB]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP19]], ptr [[N_CASTED]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK1-NEXT: store ptr [[TMP22]], ptr [[TMP21]], align 8
+// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to ptr
+// CHECK1-NEXT: store ptr [[TMP24]], ptr [[TMP23]], align 8
+// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK1-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK1-NEXT: store ptr [[TMP26]], ptr [[TMP25]], align 8
+// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK1-NEXT: store ptr [[TMP0]], ptr [[TMP27]], align 8
+// CHECK1-NEXT: [[TMP28:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4
+// CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP29]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4)
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP30:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: [[TMP31:%.*]] = load i64, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK1-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP30]], [[TMP31]]
+// CHECK1-NEXT: store i64 [[ADD14]], ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: [[TMP32:%.*]] = load i64, ptr [[DOTOMP_COMB_LB]], align 8
+// CHECK1-NEXT: [[TMP33:%.*]] = load i64, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK1-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP32]], [[TMP33]]
+// CHECK1-NEXT: store i64 [[ADD15]], ptr [[DOTOMP_COMB_LB]], align 8
+// CHECK1-NEXT: [[TMP34:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: [[TMP35:%.*]] = load i64, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK1-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP34]], [[TMP35]]
+// CHECK1-NEXT: store i64 [[ADD16]], ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: [[TMP36:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: [[TMP37:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP36]], [[TMP37]]
+// CHECK1-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]]
+// CHECK1: cond.true18:
+// CHECK1-NEXT: [[TMP38:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: br label [[COND_END20:%.*]]
+// CHECK1: cond.false19:
+// CHECK1-NEXT: [[TMP39:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: br label [[COND_END20]]
+// CHECK1: cond.end20:
+// CHECK1-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP38]], [[COND_TRUE18]] ], [ [[TMP39]], [[COND_FALSE19]] ]
+// CHECK1-NEXT: store i64 [[COND21]], ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK1-NEXT: [[TMP40:%.*]] = load i64, ptr [[DOTOMP_COMB_LB]], align 8
+// CHECK1-NEXT: store i64 [[TMP40]], ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP41:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP42:%.*]] = load i32, ptr [[TMP41]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP42]])
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_omp_outlined_omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I9:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J10:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK1-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK1-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK1-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK1-NEXT: store i64 [[SUB7]], ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: land.lhs.true:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK1-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i64 0, ptr [[DOTOMP_LB]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: store i64 [[TMP7]], ptr [[DOTOMP_UB]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[TMP8]], ptr [[DOTOMP_LB]], align 8
+// CHECK1-NEXT: store i64 [[TMP9]], ptr [[DOTOMP_UB]], align 8
+// CHECK1-NEXT: store i64 1, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_8(ptr @[[GLOB3]], i32 [[TMP11]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i64 1, i64 1)
+// CHECK1-NEXT: [[TMP12:%.*]] = load i64, ptr [[DOTOMP_LB]], align 8
+// CHECK1-NEXT: store i64 [[TMP12]], ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP13:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CMP11:%.*]] = icmp ule i64 [[TMP13]], [[TMP14]]
+// CHECK1-NEXT: br i1 [[CMP11]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[SUB12:%.*]] = sub nsw i32 [[TMP16]], 0
+// CHECK1-NEXT: [[DIV13:%.*]] = sdiv i32 [[SUB12]], 1
+// CHECK1-NEXT: [[MUL14:%.*]] = mul nsw i32 1, [[DIV13]]
+// CHECK1-NEXT: [[CONV15:%.*]] = sext i32 [[MUL14]] to i64
+// CHECK1-NEXT: [[DIV16:%.*]] = sdiv i64 [[TMP15]], [[CONV15]]
+// CHECK1-NEXT: [[MUL17:%.*]] = mul nsw i64 [[DIV16]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL17]]
+// CHECK1-NEXT: [[CONV18:%.*]] = trunc i64 [[ADD]] to i32
+// CHECK1-NEXT: store i32 [[CONV18]], ptr [[I9]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[SUB19:%.*]] = sub nsw i32 [[TMP19]], 0
+// CHECK1-NEXT: [[DIV20:%.*]] = sdiv i32 [[SUB19]], 1
+// CHECK1-NEXT: [[MUL21:%.*]] = mul nsw i32 1, [[DIV20]]
+// CHECK1-NEXT: [[CONV22:%.*]] = sext i32 [[MUL21]] to i64
+// CHECK1-NEXT: [[DIV23:%.*]] = sdiv i64 [[TMP18]], [[CONV22]]
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[SUB24:%.*]] = sub nsw i32 [[TMP20]], 0
+// CHECK1-NEXT: [[DIV25:%.*]] = sdiv i32 [[SUB24]], 1
+// CHECK1-NEXT: [[MUL26:%.*]] = mul nsw i32 1, [[DIV25]]
+// CHECK1-NEXT: [[CONV27:%.*]] = sext i32 [[MUL26]] to i64
+// CHECK1-NEXT: [[MUL28:%.*]] = mul nsw i64 [[DIV23]], [[CONV27]]
+// CHECK1-NEXT: [[SUB29:%.*]] = sub nsw i64 [[TMP17]], [[MUL28]]
+// CHECK1-NEXT: [[MUL30:%.*]] = mul nsw i64 [[SUB29]], 1
+// CHECK1-NEXT: [[ADD31:%.*]] = add nsw i64 0, [[MUL30]]
+// CHECK1-NEXT: [[CONV32:%.*]] = trunc i64 [[ADD31]] to i32
+// CHECK1-NEXT: store i32 [[CONV32]], ptr [[J10]], align 4
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[I9]], align 4
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[J10]], align 4
+// CHECK1-NEXT: [[ADD33:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[I9]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP23]] to i64
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[J10]], align 4
+// CHECK1-NEXT: [[IDXPROM34:%.*]] = sext i32 [[TMP24]] to i64
+// CHECK1-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM34]]
+// CHECK1-NEXT: store i32 [[ADD33]], ptr [[ARRAYIDX35]], align 4
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK1-NEXT: [[ADD36:%.*]] = add nsw i64 [[TMP25]], [[TMP26]]
+// CHECK1-NEXT: store i64 [[ADD36]], ptr [[DOTOMP_IV]], align 8
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP28]])
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l53
+// CHECK1-SAME: (i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], ptr noundef [[V:%.*]]) #[[ATTR4]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK1: user_code.entry:
+// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[V_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l53_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], i64 [[TMP4]], ptr [[TMP0]], ptr [[TMP5]]) #[[ATTR2]]
+// CHECK1-NEXT: call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK1-NEXT: ret void
+// CHECK1: worker.exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l53_omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], ptr noundef [[V:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x ptr], align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK1-NEXT: call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP6]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP18]], ptr [[N_CASTED]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK1-NEXT: [[TMP20:%.*]] = load ptr, ptr [[V_ADDR]], align 8
+// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP15]] to ptr
+// CHECK1-NEXT: store ptr [[TMP22]], ptr [[TMP21]], align 8
+// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK1-NEXT: store ptr [[TMP24]], ptr [[TMP23]], align 8
+// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK1-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK1-NEXT: store ptr [[TMP26]], ptr [[TMP25]], align 8
+// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK1-NEXT: store ptr [[TMP0]], ptr [[TMP27]], align 8
+// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 4
+// CHECK1-NEXT: store ptr [[TMP20]], ptr [[TMP28]], align 8
+// CHECK1-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP29]], align 4
+// CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l53_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 5)
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
+// CHECK1-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK1-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP35]], [[TMP36]]
+// CHECK1-NEXT: store i32 [[ADD8]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]]
+// CHECK1-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK1: cond.true10:
+// CHECK1-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: br label [[COND_END12:%.*]]
+// CHECK1: cond.false11:
+// CHECK1-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END12]]
+// CHECK1: cond.end12:
+// CHECK1-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE10]] ], [ [[TMP40]], [[COND_FALSE11]] ]
+// CHECK1-NEXT: store i32 [[COND13]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP41]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP42:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP43:%.*]] = load i32, ptr [[TMP42]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP43]])
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l53_omp_outlined_omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], ptr noundef [[V:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK1-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3]], i32 [[TMP8]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP9]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[CONV5:%.*]] = sext i32 [[TMP10]] to i64
+// CHECK1-NEXT: [[TMP11:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CMP6:%.*]] = icmp ule i64 [[CONV5]], [[TMP11]]
+// CHECK1-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I4]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[V_ADDR]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[I4]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[IDXPROM]]
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[I4]], align 4
+// CHECK1-NEXT: [[IDXPROM7:%.*]] = sext i32 [[TMP16]] to i64
+// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM7]]
+// CHECK1-NEXT: store i32 [[TMP15]], ptr [[ARRAYIDX8]], align 4
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK1-NEXT: store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP20]])
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28
+// CHECK2-SAME: (i64 noundef [[N:%.*]], ptr noundef nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK2-NEXT: store ptr [[AA]], ptr [[AA_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK2: user_code.entry:
+// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK2-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], i64 [[TMP4]], ptr [[TMP0]]) #[[ATTR2:[0-9]+]]
+// CHECK2-NEXT: call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK2-NEXT: ret void
+// CHECK2: worker.exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28_omp_outlined
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8
+// CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK2-NEXT: store ptr [[AA]], ptr [[AA_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK2: omp.precond.then:
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK2-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK2-NEXT: call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP6]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2: cond.true:
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: br label [[COND_END:%.*]]
+// CHECK2: cond.false:
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END]]
+// CHECK2: cond.end:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK2-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK2-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP18]], ptr [[N_CASTED]], align 4
+// CHECK2-NEXT: [[TMP19:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK2-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to ptr
+// CHECK2-NEXT: store ptr [[TMP21]], ptr [[TMP20]], align 8
+// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK2-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK2-NEXT: store ptr [[TMP23]], ptr [[TMP22]], align 8
+// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK2-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK2-NEXT: store ptr [[TMP25]], ptr [[TMP24]], align 8
+// CHECK2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK2-NEXT: store ptr [[TMP0]], ptr [[TMP26]], align 8
+// CHECK2-NEXT: [[TMP27:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
+// CHECK2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP28]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4)
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]]
+// CHECK2-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
+// CHECK2-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK2-NEXT: store i32 [[ADD8]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
+// CHECK2-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK2: cond.true10:
+// CHECK2-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: br label [[COND_END12:%.*]]
+// CHECK2: cond.false11:
+// CHECK2-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END12]]
+// CHECK2: cond.end12:
+// CHECK2-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ]
+// CHECK2-NEXT: store i32 [[COND13]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP39]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: [[TMP40:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP41:%.*]] = load i32, ptr [[TMP40]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3:[0-9]+]], i32 [[TMP41]])
+// CHECK2-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK2: omp.precond.end:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28_omp_outlined_omp_outlined
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK2-NEXT: store ptr [[AA]], ptr [[AA_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK2: omp.precond.then:
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK2-NEXT: [[CONV:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK2-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK2-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3]], i32 [[TMP8]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP9]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[CONV5:%.*]] = sext i32 [[TMP10]] to i64
+// CHECK2-NEXT: [[TMP11:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: [[CMP6:%.*]] = icmp ule i64 [[CONV5]], [[TMP11]]
+// CHECK2-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK2-NEXT: store i32 [[ADD]], ptr [[I4]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[I4]], align 4
+// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
+// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK2-NEXT: [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+// CHECK2-NEXT: [[CONV7:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1
+// CHECK2-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16
+// CHECK2-NEXT: store i16 [[CONV9]], ptr [[ARRAYIDX]], align 2
+// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2: omp.body.continue:
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK2-NEXT: store i32 [[ADD10]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP18]])
+// CHECK2-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK2: omp.precond.end:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33
+// CHECK2-SAME: (ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK2: user_code.entry:
+// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK2-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[TMP0]]) #[[ATTR2]]
+// CHECK2-NEXT: call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK2-NEXT: ret void
+// CHECK2: worker.exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33_omp_outlined
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x ptr], align 8
+// CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 9, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK2-NEXT: call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP2]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2: cond.true:
+// CHECK2-NEXT: br label [[COND_END:%.*]]
+// CHECK2: cond.false:
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END]]
+// CHECK2: cond.end:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK2-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK2-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK2-NEXT: store ptr [[TMP12]], ptr [[TMP11]], align 8
+// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK2-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK2-NEXT: store ptr [[TMP14]], ptr [[TMP13]], align 8
+// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK2-NEXT: store ptr [[TMP0]], ptr [[TMP15]], align 8
+// CHECK2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 3)
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK2-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK2-NEXT: store i32 [[ADD2]], ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK2-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP22]], 9
+// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK2: cond.true5:
+// CHECK2-NEXT: br label [[COND_END7:%.*]]
+// CHECK2: cond.false6:
+// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END7]]
+// CHECK2: cond.end7:
+// CHECK2-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP23]], [[COND_FALSE6]] ]
+// CHECK2-NEXT: store i32 [[COND8]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP24]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP2]])
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33_omp_outlined_omp_outlined
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK2-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK2-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK2-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3]], i32 [[TMP4]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[CONV2:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK2-NEXT: [[TMP7:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV2]], [[TMP7]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK2-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64
+// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK2-NEXT: store i32 [[ADD3]], ptr [[ARRAYIDX]], align 4
+// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2: omp.body.continue:
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK2-NEXT: store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP4]])
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38
+// CHECK2-SAME: (ptr noundef nonnull align 4 dereferenceable(400) [[C:%.*]], i64 noundef [[F:%.*]]) #[[ATTR4]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK2-NEXT: store i64 [[F]], ptr [[F_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK2: user_code.entry:
+// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[F_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP3]], ptr [[F_CASTED]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i64, ptr [[F_CASTED]], align 8
+// CHECK2-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[TMP0]], i64 [[TMP4]]) #[[ATTR2]]
+// CHECK2-NEXT: call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK2-NEXT: ret void
+// CHECK2: worker.exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38_omp_outlined
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[C:%.*]], i64 noundef [[F:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[F_CASTED:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8
+// CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK2-NEXT: store i64 [[F]], ptr [[F_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK2-NEXT: call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP2]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2: cond.true:
+// CHECK2-NEXT: br label [[COND_END:%.*]]
+// CHECK2: cond.false:
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END]]
+// CHECK2: cond.end:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK2-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[F_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP11]], ptr [[F_CASTED]], align 4
+// CHECK2-NEXT: [[TMP12:%.*]] = load i64, ptr [[F_CASTED]], align 8
+// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK2-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP8]] to ptr
+// CHECK2-NEXT: store ptr [[TMP14]], ptr [[TMP13]], align 8
+// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK2-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP10]] to ptr
+// CHECK2-NEXT: store ptr [[TMP16]], ptr [[TMP15]], align 8
+// CHECK2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK2-NEXT: store ptr [[TMP0]], ptr [[TMP17]], align 8
+// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK2-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP12]] to ptr
+// CHECK2-NEXT: store ptr [[TMP19]], ptr [[TMP18]], align 8
+// CHECK2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4)
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK2-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK2-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK2-NEXT: store i32 [[ADD4]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP26]], 99
+// CHECK2-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]]
+// CHECK2: cond.true6:
+// CHECK2-NEXT: br label [[COND_END8:%.*]]
+// CHECK2: cond.false7:
+// CHECK2-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END8]]
+// CHECK2: cond.end8:
+// CHECK2-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP27]], [[COND_FALSE7]] ]
+// CHECK2-NEXT: store i32 [[COND9]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP28]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP2]])
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38_omp_outlined_omp_outlined
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[C:%.*]], i64 noundef [[F:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK2-NEXT: store i64 [[F]], ptr [[F_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK2-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK2-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK2-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3]], i32 [[TMP4]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[CONV3:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK2-NEXT: [[TMP7:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV3]], [[TMP7]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK2-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[DIV4:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK2-NEXT: [[MUL5:%.*]] = mul nsw i32 [[DIV4]], 10
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL5]]
+// CHECK2-NEXT: [[MUL6:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 0, [[MUL6]]
+// CHECK2-NEXT: store i32 [[ADD7]], ptr [[J]], align 4
+// CHECK2-NEXT: store i32 10, ptr [[K]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[F_ADDR]], align 4
+// CHECK2-NEXT: [[MUL8:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP11]], [[MUL8]]
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[K]], align 4
+// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD9]], [[TMP14]]
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64
+// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[J]], align 4
+// CHECK2-NEXT: [[IDXPROM11:%.*]] = sext i32 [[TMP16]] to i64
+// CHECK2-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
+// CHECK2-NEXT: store i32 [[ADD10]], ptr [[ARRAYIDX12]], align 4
+// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2: omp.body.continue:
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK2-NEXT: store i32 [[ADD13]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP4]])
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46
+// CHECK2-SAME: (i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR4]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK2-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK2: user_code.entry:
+// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK2-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], i64 [[TMP4]], ptr [[TMP0]]) #[[ATTR2]]
+// CHECK2-NEXT: call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK2-NEXT: ret void
+// CHECK2: worker.exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_omp_outlined
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I8:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[J9:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8
+// CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK2-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK2-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], [[DIV5]]
+// CHECK2-NEXT: [[SUB6:%.*]] = sub nsw i32 [[MUL]], 1
+// CHECK2-NEXT: store i32 [[SUB6]], ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK2: land.lhs.true:
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[CMP7:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK2: omp.precond.then:
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK2-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK2-NEXT: [[TMP8:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4
+// CHECK2-NEXT: call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP9]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK2-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP10]], [[TMP11]]
+// CHECK2-NEXT: br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2: cond.true:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK2-NEXT: br label [[COND_END:%.*]]
+// CHECK2: cond.false:
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END]]
+// CHECK2: cond.end:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP14]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], 1
+// CHECK2-NEXT: [[CMP11:%.*]] = icmp slt i32 [[TMP15]], [[ADD]]
+// CHECK2-NEXT: br i1 [[CMP11]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = zext i32 [[TMP17]] to i64
+// CHECK2-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP21]], ptr [[N_CASTED]], align 4
+// CHECK2-NEXT: [[TMP22:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK2-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP18]] to ptr
+// CHECK2-NEXT: store ptr [[TMP24]], ptr [[TMP23]], align 8
+// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK2-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP20]] to ptr
+// CHECK2-NEXT: store ptr [[TMP26]], ptr [[TMP25]], align 8
+// CHECK2-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK2-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP22]] to ptr
+// CHECK2-NEXT: store ptr [[TMP28]], ptr [[TMP27]], align 8
+// CHECK2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK2-NEXT: store ptr [[TMP0]], ptr [[TMP29]], align 8
+// CHECK2-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP30]], align 4
+// CHECK2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP31]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4)
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP32]], [[TMP33]]
+// CHECK2-NEXT: store i32 [[ADD12]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP34]], [[TMP35]]
+// CHECK2-NEXT: store i32 [[ADD13]], ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP36]], [[TMP37]]
+// CHECK2-NEXT: store i32 [[ADD14]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK2-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[TMP38]], [[TMP39]]
+// CHECK2-NEXT: br i1 [[CMP15]], label [[COND_TRUE16:%.*]], label [[COND_FALSE17:%.*]]
+// CHECK2: cond.true16:
+// CHECK2-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK2-NEXT: br label [[COND_END18:%.*]]
+// CHECK2: cond.false17:
+// CHECK2-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END18]]
+// CHECK2: cond.end18:
+// CHECK2-NEXT: [[COND19:%.*]] = phi i32 [ [[TMP40]], [[COND_TRUE16]] ], [ [[TMP41]], [[COND_FALSE17]] ]
+// CHECK2-NEXT: store i32 [[COND19]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP42]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: [[TMP43:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP44]])
+// CHECK2-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK2: omp.precond.end:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_omp_outlined_omp_outlined
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I9:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[J10:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK2-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK2-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], [[DIV5]]
+// CHECK2-NEXT: [[SUB6:%.*]] = sub nsw i32 [[MUL]], 1
+// CHECK2-NEXT: store i32 [[SUB6]], ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK2: land.lhs.true:
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[CMP7:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK2-NEXT: br i1 [[CMP7]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK2: omp.precond.then:
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK2-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK2-NEXT: [[CONV:%.*]] = trunc i64 [[TMP8]] to i32
+// CHECK2-NEXT: [[TMP9:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: [[CONV8:%.*]] = trunc i64 [[TMP9]] to i32
+// CHECK2-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[CONV8]], ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3]], i32 [[TMP11]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[CONV11:%.*]] = sext i32 [[TMP13]] to i64
+// CHECK2-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: [[CMP12:%.*]] = icmp ule i64 [[CONV11]], [[TMP14]]
+// CHECK2-NEXT: br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[SUB13:%.*]] = sub nsw i32 [[TMP16]], 0
+// CHECK2-NEXT: [[DIV14:%.*]] = sdiv i32 [[SUB13]], 1
+// CHECK2-NEXT: [[MUL15:%.*]] = mul nsw i32 1, [[DIV14]]
+// CHECK2-NEXT: [[DIV16:%.*]] = sdiv i32 [[TMP15]], [[MUL15]]
+// CHECK2-NEXT: [[MUL17:%.*]] = mul nsw i32 [[DIV16]], 1
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL17]]
+// CHECK2-NEXT: store i32 [[ADD]], ptr [[I9]], align 4
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[SUB18:%.*]] = sub nsw i32 [[TMP19]], 0
+// CHECK2-NEXT: [[DIV19:%.*]] = sdiv i32 [[SUB18]], 1
+// CHECK2-NEXT: [[MUL20:%.*]] = mul nsw i32 1, [[DIV19]]
+// CHECK2-NEXT: [[DIV21:%.*]] = sdiv i32 [[TMP18]], [[MUL20]]
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP20]], 0
+// CHECK2-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1
+// CHECK2-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]]
+// CHECK2-NEXT: [[MUL25:%.*]] = mul nsw i32 [[DIV21]], [[MUL24]]
+// CHECK2-NEXT: [[SUB26:%.*]] = sub nsw i32 [[TMP17]], [[MUL25]]
+// CHECK2-NEXT: [[MUL27:%.*]] = mul nsw i32 [[SUB26]], 1
+// CHECK2-NEXT: [[ADD28:%.*]] = add nsw i32 0, [[MUL27]]
+// CHECK2-NEXT: store i32 [[ADD28]], ptr [[J10]], align 4
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[I9]], align 4
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[J10]], align 4
+// CHECK2-NEXT: [[ADD29:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[I9]], align 4
+// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP23]] to i64
+// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK2-NEXT: [[TMP24:%.*]] = load i32, ptr [[J10]], align 4
+// CHECK2-NEXT: [[IDXPROM30:%.*]] = sext i32 [[TMP24]] to i64
+// CHECK2-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM30]]
+// CHECK2-NEXT: store i32 [[ADD29]], ptr [[ARRAYIDX31]], align 4
+// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2: omp.body.continue:
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD32:%.*]] = add nsw i32 [[TMP25]], [[TMP26]]
+// CHECK2-NEXT: store i32 [[ADD32]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: [[TMP27:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP28]])
+// CHECK2-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK2: omp.precond.end:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l53
+// CHECK2-SAME: (i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], ptr noundef [[V:%.*]]) #[[ATTR4]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK2-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK2-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK2: user_code.entry:
+// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK2-NEXT: [[TMP5:%.*]] = load ptr, ptr [[V_ADDR]], align 8
+// CHECK2-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l53_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], i64 [[TMP4]], ptr [[TMP0]], ptr [[TMP5]]) #[[ATTR2]]
+// CHECK2-NEXT: call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK2-NEXT: ret void
+// CHECK2: worker.exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l53_omp_outlined
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], ptr noundef [[V:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x ptr], align 8
+// CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK2-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK2-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK2: omp.precond.then:
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK2-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK2-NEXT: call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP6]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2: cond.true:
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: br label [[COND_END:%.*]]
+// CHECK2: cond.false:
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END]]
+// CHECK2: cond.end:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK2-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK2-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP18]], ptr [[N_CASTED]], align 4
+// CHECK2-NEXT: [[TMP19:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK2-NEXT: [[TMP20:%.*]] = load ptr, ptr [[V_ADDR]], align 8
+// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK2-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP15]] to ptr
+// CHECK2-NEXT: store ptr [[TMP22]], ptr [[TMP21]], align 8
+// CHECK2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK2-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK2-NEXT: store ptr [[TMP24]], ptr [[TMP23]], align 8
+// CHECK2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK2-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK2-NEXT: store ptr [[TMP26]], ptr [[TMP25]], align 8
+// CHECK2-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK2-NEXT: store ptr [[TMP0]], ptr [[TMP27]], align 8
+// CHECK2-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 4
+// CHECK2-NEXT: store ptr [[TMP20]], ptr [[TMP28]], align 8
+// CHECK2-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP29]], align 4
+// CHECK2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP30]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l53_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 5)
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
+// CHECK2-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK2-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP35]], [[TMP36]]
+// CHECK2-NEXT: store i32 [[ADD8]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP37]], [[TMP38]]
+// CHECK2-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK2: cond.true10:
+// CHECK2-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: br label [[COND_END12:%.*]]
+// CHECK2: cond.false11:
+// CHECK2-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END12]]
+// CHECK2: cond.end12:
+// CHECK2-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP39]], [[COND_TRUE10]] ], [ [[TMP40]], [[COND_FALSE11]] ]
+// CHECK2-NEXT: store i32 [[COND13]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP41]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: [[TMP42:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP43:%.*]] = load i32, ptr [[TMP42]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP43]])
+// CHECK2-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK2: omp.precond.end:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l53_omp_outlined_omp_outlined
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], ptr noundef [[V:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK2-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK2-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK2: omp.precond.then:
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK2-NEXT: [[CONV:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK2-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK2-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3]], i32 [[TMP8]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP9]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[CONV5:%.*]] = sext i32 [[TMP10]] to i64
+// CHECK2-NEXT: [[TMP11:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: [[CMP6:%.*]] = icmp ule i64 [[CONV5]], [[TMP11]]
+// CHECK2-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK2-NEXT: store i32 [[ADD]], ptr [[I4]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load ptr, ptr [[V_ADDR]], align 8
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[I4]], align 4
+// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64
+// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[IDXPROM]]
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[I4]], align 4
+// CHECK2-NEXT: [[IDXPROM7:%.*]] = sext i32 [[TMP16]] to i64
+// CHECK2-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM7]]
+// CHECK2-NEXT: store i32 [[TMP15]], ptr [[ARRAYIDX8]], align 4
+// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2: omp.body.continue:
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK2-NEXT: store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP20]])
+// CHECK2-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK2: omp.precond.end:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28
+// CHECK3-SAME: (i32 noundef [[N:%.*]], ptr noundef nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store ptr [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK3: user_code.entry:
+// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], i32 [[TMP4]], ptr [[TMP0]]) #[[ATTR2:[0-9]+]]
+// CHECK3-NEXT: call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK3-NEXT: ret void
+// CHECK3: worker.exit:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28_omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[N:%.*]], ptr noundef nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store ptr [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK3-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK3: omp.precond.then:
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK3-NEXT: call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP6]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK3-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP16]], ptr [[N_CASTED]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to ptr
+// CHECK3-NEXT: store ptr [[TMP19]], ptr [[TMP18]], align 4
+// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to ptr
+// CHECK3-NEXT: store ptr [[TMP21]], ptr [[TMP20]], align 4
+// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK3-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to ptr
+// CHECK3-NEXT: store ptr [[TMP23]], ptr [[TMP22]], align 4
+// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK3-NEXT: store ptr [[TMP0]], ptr [[TMP24]], align 4
+// CHECK3-NEXT: [[TMP25:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
+// CHECK3-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP26]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i32 4)
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
+// CHECK3-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP29]], [[TMP30]]
+// CHECK3-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
+// CHECK3-NEXT: store i32 [[ADD8]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP33]], [[TMP34]]
+// CHECK3-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK3: cond.true10:
+// CHECK3-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: br label [[COND_END12:%.*]]
+// CHECK3: cond.false11:
+// CHECK3-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END12]]
+// CHECK3: cond.end12:
+// CHECK3-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP35]], [[COND_TRUE10]] ], [ [[TMP36]], [[COND_FALSE11]] ]
+// CHECK3-NEXT: store i32 [[COND13]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP37]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP38:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP39:%.*]] = load i32, ptr [[TMP38]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3:[0-9]+]], i32 [[TMP39]])
+// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.end:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l28_omp_outlined_omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[N:%.*]], ptr noundef nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store ptr [[AA]], ptr [[AA_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK3-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK3: omp.precond.then:
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3]], i32 [[TMP8]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP9]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]]
+// CHECK3-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[I3]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[I3]], align 4
+// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i16], ptr [[TMP0]], i32 0, i32 [[TMP13]]
+// CHECK3-NEXT: [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP14]] to i32
+// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 [[CONV]], 1
+// CHECK3-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5]] to i16
+// CHECK3-NEXT: store i16 [[CONV6]], ptr [[ARRAYIDX]], align 2
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// CHECK3-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP18]])
+// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.end:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK3: user_code.entry:
+// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK3-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[TMP0]]) #[[ATTR2]]
+// CHECK3-NEXT: call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK3-NEXT: ret void
+// CHECK3: worker.exit:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33_omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 9, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK3-NEXT: call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP2]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP6]], 10
+// CHECK3-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP10:%.*]] = inttoptr i32 [[TMP7]] to ptr
+// CHECK3-NEXT: store ptr [[TMP10]], ptr [[TMP9]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP8]] to ptr
+// CHECK3-NEXT: store ptr [[TMP12]], ptr [[TMP11]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr [[TMP0]], ptr [[TMP13]], align 4
+// CHECK3-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i32 3)
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP15]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK3-NEXT: store i32 [[ADD2]], ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK3-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP20]], 9
+// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE5:%.*]], label [[COND_FALSE6:%.*]]
+// CHECK3: cond.true5:
+// CHECK3-NEXT: br label [[COND_END7:%.*]]
+// CHECK3: cond.false6:
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END7]]
+// CHECK3: cond.end7:
+// CHECK3-NEXT: [[COND8:%.*]] = phi i32 [ 9, [[COND_TRUE5]] ], [ [[TMP21]], [[COND_FALSE6]] ]
+// CHECK3-NEXT: store i32 [[COND8]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP22]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP2]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l33_omp_outlined_omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3]], i32 [[TMP4]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[I]], align 4
+// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i32 0, i32 [[TMP9]]
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK3-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK3-NEXT: store i32 [[ADD1]], ptr [[ARRAYIDX]], align 4
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK3-NEXT: store i32 [[ADD2]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP4]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(400) [[C:%.*]], i32 noundef [[F:%.*]]) #[[ATTR4]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[F]], ptr [[F_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK3: user_code.entry:
+// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[F_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP3]], ptr [[F_CASTED]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[F_CASTED]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], ptr [[TMP0]], i32 [[TMP4]]) #[[ATTR2]]
+// CHECK3-NEXT: call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK3-NEXT: ret void
+// CHECK3: worker.exit:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38_omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[C:%.*]], i32 noundef [[F:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[F_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[F]], ptr [[F_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK3-NEXT: call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP2]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 99
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP6]], 100
+// CHECK3-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[F_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP9]], ptr [[F_CASTED]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[F_CASTED]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP12:%.*]] = inttoptr i32 [[TMP7]] to ptr
+// CHECK3-NEXT: store ptr [[TMP12]], ptr [[TMP11]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP14:%.*]] = inttoptr i32 [[TMP8]] to ptr
+// CHECK3-NEXT: store ptr [[TMP14]], ptr [[TMP13]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr [[TMP0]], ptr [[TMP15]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK3-NEXT: [[TMP17:%.*]] = inttoptr i32 [[TMP10]] to ptr
+// CHECK3-NEXT: store ptr [[TMP17]], ptr [[TMP16]], align 4
+// CHECK3-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i32 4)
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK3-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK3-NEXT: store i32 [[ADD4]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP24]], 99
+// CHECK3-NEXT: br i1 [[CMP5]], label [[COND_TRUE6:%.*]], label [[COND_FALSE7:%.*]]
+// CHECK3: cond.true6:
+// CHECK3-NEXT: br label [[COND_END8:%.*]]
+// CHECK3: cond.false7:
+// CHECK3-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END8]]
+// CHECK3: cond.end8:
+// CHECK3-NEXT: [[COND9:%.*]] = phi i32 [ 99, [[COND_TRUE6]] ], [ [[TMP25]], [[COND_FALSE7]] ]
+// CHECK3-NEXT: store i32 [[COND9]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP26]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP2]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l38_omp_outlined_omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[C:%.*]], i32 noundef [[F:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[F]], ptr [[F_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3]], i32 [[TMP4]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp ule i32 [[TMP6]], [[TMP7]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 10
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[DIV2:%.*]] = sdiv i32 [[TMP10]], 10
+// CHECK3-NEXT: [[MUL3:%.*]] = mul nsw i32 [[DIV2]], 10
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL3]]
+// CHECK3-NEXT: [[MUL4:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 0, [[MUL4]]
+// CHECK3-NEXT: store i32 [[ADD5]], ptr [[J]], align 4
+// CHECK3-NEXT: store i32 10, ptr [[K]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[J]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[F_ADDR]], align 4
+// CHECK3-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP12]], [[TMP13]]
+// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[MUL6]]
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[K]], align 4
+// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP14]]
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4
+// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP0]], i32 0, i32 [[TMP15]]
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[J]], align 4
+// CHECK3-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i32 0, i32 [[TMP16]]
+// CHECK3-NEXT: store i32 [[ADD8]], ptr [[ARRAYIDX9]], align 4
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK3-NEXT: store i32 [[ADD10]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP4]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46
+// CHECK3-SAME: (i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR4]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK3: user_code.entry:
+// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], i32 [[TMP4]], ptr [[TMP0]]) #[[ATTR2]]
+// CHECK3-NEXT: call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK3-NEXT: ret void
+// CHECK3: worker.exit:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I9:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[J10:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK3-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK3-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK3-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK3-NEXT: store i64 [[SUB7]], ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK3-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK3: land.lhs.true:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK3-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.then:
+// CHECK3-NEXT: store i64 0, ptr [[DOTOMP_COMB_LB]], align 8
+// CHECK3-NEXT: [[TMP7:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK3-NEXT: store i64 [[TMP7]], ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK3-NEXT: store i64 1, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK3-NEXT: [[CONV11:%.*]] = zext i32 [[NVPTX_NUM_THREADS]] to i64
+// CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4
+// CHECK3-NEXT: call void @__kmpc_distribute_static_init_8(ptr @[[GLOB2]], i32 [[TMP9]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i64 1, i64 [[CONV11]])
+// CHECK3-NEXT: [[TMP10:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK3-NEXT: [[TMP11:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK3-NEXT: [[CMP12:%.*]] = icmp sgt i64 [[TMP10]], [[TMP11]]
+// CHECK3-NEXT: br i1 [[CMP12]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: [[TMP12:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP13:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i64 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i64 [[COND]], ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK3-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTOMP_COMB_LB]], align 8
+// CHECK3-NEXT: store i64 [[TMP14]], ptr [[DOTOMP_IV]], align 8
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK3-NEXT: [[TMP16:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], 1
+// CHECK3-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP15]], [[ADD]]
+// CHECK3-NEXT: br i1 [[CMP13]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTOMP_COMB_LB]], align 8
+// CHECK3-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
+// CHECK3-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK3-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP21]], ptr [[N_CASTED]], align 4
+// CHECK3-NEXT: [[TMP22:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP18]] to ptr
+// CHECK3-NEXT: store ptr [[TMP24]], ptr [[TMP23]], align 4
+// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP26:%.*]] = inttoptr i32 [[TMP20]] to ptr
+// CHECK3-NEXT: store ptr [[TMP26]], ptr [[TMP25]], align 4
+// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK3-NEXT: [[TMP28:%.*]] = inttoptr i32 [[TMP22]] to ptr
+// CHECK3-NEXT: store ptr [[TMP28]], ptr [[TMP27]], align 4
+// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK3-NEXT: store ptr [[TMP0]], ptr [[TMP29]], align 4
+// CHECK3-NEXT: [[TMP30:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP30]], align 4
+// CHECK3-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP31]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i32 4)
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP32:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK3-NEXT: [[TMP33:%.*]] = load i64, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK3-NEXT: [[ADD14:%.*]] = add nsw i64 [[TMP32]], [[TMP33]]
+// CHECK3-NEXT: store i64 [[ADD14]], ptr [[DOTOMP_IV]], align 8
+// CHECK3-NEXT: [[TMP34:%.*]] = load i64, ptr [[DOTOMP_COMB_LB]], align 8
+// CHECK3-NEXT: [[TMP35:%.*]] = load i64, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK3-NEXT: [[ADD15:%.*]] = add nsw i64 [[TMP34]], [[TMP35]]
+// CHECK3-NEXT: store i64 [[ADD15]], ptr [[DOTOMP_COMB_LB]], align 8
+// CHECK3-NEXT: [[TMP36:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK3-NEXT: [[TMP37:%.*]] = load i64, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK3-NEXT: [[ADD16:%.*]] = add nsw i64 [[TMP36]], [[TMP37]]
+// CHECK3-NEXT: store i64 [[ADD16]], ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK3-NEXT: [[TMP38:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK3-NEXT: [[TMP39:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK3-NEXT: [[CMP17:%.*]] = icmp sgt i64 [[TMP38]], [[TMP39]]
+// CHECK3-NEXT: br i1 [[CMP17]], label [[COND_TRUE18:%.*]], label [[COND_FALSE19:%.*]]
+// CHECK3: cond.true18:
+// CHECK3-NEXT: [[TMP40:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK3-NEXT: br label [[COND_END20:%.*]]
+// CHECK3: cond.false19:
+// CHECK3-NEXT: [[TMP41:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK3-NEXT: br label [[COND_END20]]
+// CHECK3: cond.end20:
+// CHECK3-NEXT: [[COND21:%.*]] = phi i64 [ [[TMP40]], [[COND_TRUE18]] ], [ [[TMP41]], [[COND_FALSE19]] ]
+// CHECK3-NEXT: store i64 [[COND21]], ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK3-NEXT: [[TMP42:%.*]] = load i64, ptr [[DOTOMP_COMB_LB]], align 8
+// CHECK3-NEXT: store i64 [[TMP42]], ptr [[DOTOMP_IV]], align 8
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP43:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP44]])
+// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.end:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l46_omp_outlined_omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[C:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I11:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[J12:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK3-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK3-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK3-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK3-NEXT: store i64 [[SUB7]], ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK3-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK3: land.lhs.true:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: [[CMP8:%.*]] = icmp slt i32 0, [[TMP6]]
+// CHECK3-NEXT: br i1 [[CMP8]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.then:
+// CHECK3-NEXT: store i64 0, ptr [[DOTOMP_LB]], align 8
+// CHECK3-NEXT: [[TMP7:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK3-NEXT: store i64 [[TMP7]], ptr [[DOTOMP_UB]], align 8
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[CONV9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: [[CONV10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK3-NEXT: store i64 [[CONV9]], ptr [[DOTOMP_LB]], align 8
+// CHECK3-NEXT: store i64 [[CONV10]], ptr [[DOTOMP_UB]], align 8
+// CHECK3-NEXT: store i64 1, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_8(ptr @[[GLOB3]], i32 [[TMP11]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i64 1, i64 1)
+// CHECK3-NEXT: [[TMP12:%.*]] = load i64, ptr [[DOTOMP_LB]], align 8
+// CHECK3-NEXT: store i64 [[TMP12]], ptr [[DOTOMP_IV]], align 8
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP13:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: [[CONV13:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK3-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP13]], [[CONV13]]
+// CHECK3-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP16]], 0
+// CHECK3-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1
+// CHECK3-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]]
+// CHECK3-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64
+// CHECK3-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP15]], [[CONV18]]
+// CHECK3-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]]
+// CHECK3-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32
+// CHECK3-NEXT: store i32 [[CONV21]], ptr [[I11]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK3-NEXT: [[TMP18:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK3-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP19]], 0
+// CHECK3-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1
+// CHECK3-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]]
+// CHECK3-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64
+// CHECK3-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP18]], [[CONV25]]
+// CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK3-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP20]], 0
+// CHECK3-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1
+// CHECK3-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]]
+// CHECK3-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64
+// CHECK3-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]]
+// CHECK3-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP17]], [[MUL31]]
+// CHECK3-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1
+// CHECK3-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]]
+// CHECK3-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32
+// CHECK3-NEXT: store i32 [[CONV35]], ptr [[J12]], align 4
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[I11]], align 4
+// CHECK3-NEXT: [[TMP22:%.*]] = load i32, ptr [[J12]], align 4
+// CHECK3-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK3-NEXT: [[TMP23:%.*]] = load i32, ptr [[I11]], align 4
+// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP0]], i32 0, i32 [[TMP23]]
+// CHECK3-NEXT: [[TMP24:%.*]] = load i32, ptr [[J12]], align 4
+// CHECK3-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i32 0, i32 [[TMP24]]
+// CHECK3-NEXT: store i32 [[ADD36]], ptr [[ARRAYIDX37]], align 4
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK3-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK3-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP25]], [[TMP26]]
+// CHECK3-NEXT: store i64 [[ADD38]], ptr [[DOTOMP_IV]], align 8
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP27:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP28]])
+// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.end:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l53
+// CHECK3-SAME: (i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], ptr noundef [[V:%.*]]) #[[ATTR4]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 2, i1 false)
+// CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK3: user_code.entry:
+// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[V_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l53_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], i32 [[TMP4]], ptr [[TMP0]], ptr [[TMP5]]) #[[ATTR2]]
+// CHECK3-NEXT: call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK3-NEXT: ret void
+// CHECK3: worker.exit:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l53_omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], ptr noundef [[V:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [5 x ptr], align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK3-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK3: omp.precond.then:
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK3-NEXT: call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP6]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK3-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP16]], ptr [[N_CASTED]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = load ptr, ptr [[V_ADDR]], align 4
+// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP20:%.*]] = inttoptr i32 [[TMP14]] to ptr
+// CHECK3-NEXT: store ptr [[TMP20]], ptr [[TMP19]], align 4
+// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP22:%.*]] = inttoptr i32 [[TMP15]] to ptr
+// CHECK3-NEXT: store ptr [[TMP22]], ptr [[TMP21]], align 4
+// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK3-NEXT: [[TMP24:%.*]] = inttoptr i32 [[TMP17]] to ptr
+// CHECK3-NEXT: store ptr [[TMP24]], ptr [[TMP23]], align 4
+// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK3-NEXT: store ptr [[TMP0]], ptr [[TMP25]], align 4
+// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 4
+// CHECK3-NEXT: store ptr [[TMP18]], ptr [[TMP26]], align 4
+// CHECK3-NEXT: [[TMP27:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
+// CHECK3-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP28]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l53_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i32 5)
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]]
+// CHECK3-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
+// CHECK3-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK3-NEXT: store i32 [[ADD8]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
+// CHECK3-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK3: cond.true10:
+// CHECK3-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: br label [[COND_END12:%.*]]
+// CHECK3: cond.false11:
+// CHECK3-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END12]]
+// CHECK3: cond.end12:
+// CHECK3-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ]
+// CHECK3-NEXT: store i32 [[COND13]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP39]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP40:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP41:%.*]] = load i32, ptr [[TMP40]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP41]])
+// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.end:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l53_omp_outlined_omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], ptr noundef [[V:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK3-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK3: omp.precond.then:
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3]], i32 [[TMP8]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP9]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]]
+// CHECK3-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[I3]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = load ptr, ptr [[V_ADDR]], align 4
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[I3]], align 4
+// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 [[TMP14]]
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[I3]], align 4
+// CHECK3-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i32 0, i32 [[TMP16]]
+// CHECK3-NEXT: store i32 [[TMP15]], ptr [[ARRAYIDX5]], align 4
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK3-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP20]])
+// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.end:
+// CHECK3-NEXT: ret void
+//
diff --git a/clang/test/OpenMP/nvptx_target_teams_generic_loop_generic_mode_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_generic_loop_generic_mode_codegen.cpp
new file mode 100644
index 00000000000000..9d351c45c4993d
--- /dev/null
+++ b/clang/test/OpenMP/nvptx_target_teams_generic_loop_generic_mode_codegen.cpp
@@ -0,0 +1,525 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK2
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK2
+
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK2
+// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK2
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+int a;
+
+int foo(int *a);
+
+int main(int argc, char **argv) {
+#pragma omp target teams loop map(tofrom:a) if(target:argc)
+ for (int i= 0; i < argc; ++i)
+ a = foo(&i) + foo(&a) + foo(&argc);
+ return 0;
+}
+
+
+#endif
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l24
+// CHECK1-SAME: (i64 noundef [[ARGC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[ARGC_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store i64 [[ARGC]], ptr [[ARGC_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK1: user_code.entry:
+// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP3]], ptr [[ARGC_CASTED]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[ARGC_CASTED]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l24_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], i64 [[TMP4]], ptr [[TMP0]]) #[[ATTR3:[0-9]+]]
+// CHECK1-NEXT: call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK1-NEXT: ret void
+// CHECK1: worker.exit:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l24_omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[ARGC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[ARGC_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[ARGC]], ptr [[ARGC_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK1-NEXT: call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP6]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP18]], ptr [[ARGC_CASTED]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = load i64, ptr [[ARGC_CASTED]], align 8
+// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP15]] to ptr
+// CHECK1-NEXT: store ptr [[TMP21]], ptr [[TMP20]], align 8
+// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
+// CHECK1-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP17]] to ptr
+// CHECK1-NEXT: store ptr [[TMP23]], ptr [[TMP22]], align 8
+// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
+// CHECK1-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP19]] to ptr
+// CHECK1-NEXT: store ptr [[TMP25]], ptr [[TMP24]], align 8
+// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3
+// CHECK1-NEXT: store ptr [[TMP0]], ptr [[TMP26]], align 8
+// CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
+// CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP28]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l24_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4)
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP29]], [[TMP30]]
+// CHECK1-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
+// CHECK1-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK1-NEXT: store i32 [[ADD8]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP35]], [[TMP36]]
+// CHECK1-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK1: cond.true10:
+// CHECK1-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: br label [[COND_END12:%.*]]
+// CHECK1: cond.false11:
+// CHECK1-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END12]]
+// CHECK1: cond.end12:
+// CHECK1-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP37]], [[COND_TRUE10]] ], [ [[TMP38]], [[COND_FALSE11]] ]
+// CHECK1-NEXT: store i32 [[COND13]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP39]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP40:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[TMP40]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3:[0-9]+]], i32 [[TMP41]])
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l24_omp_outlined_omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[ARGC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[ARGC_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[ARGC]], ptr [[ARGC_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK1-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3]], i32 [[TMP8]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP9]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[CONV5:%.*]] = sext i32 [[TMP10]] to i64
+// CHECK1-NEXT: [[TMP11:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CMP6:%.*]] = icmp ule i64 [[CONV5]], [[TMP11]]
+// CHECK1-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I4]], align 4
+// CHECK1-NEXT: [[CALL:%.*]] = call noundef i32 @_Z3fooPi(ptr noundef [[I4]]) #[[ATTR5:[0-9]+]]
+// CHECK1-NEXT: [[CALL7:%.*]] = call noundef i32 @_Z3fooPi(ptr noundef [[TMP0]]) #[[ATTR5]]
+// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[CALL]], [[CALL7]]
+// CHECK1-NEXT: [[CALL9:%.*]] = call noundef i32 @_Z3fooPi(ptr noundef [[ARGC_ADDR]]) #[[ATTR5]]
+// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD8]], [[CALL9]]
+// CHECK1-NEXT: store i32 [[ADD10]], ptr [[TMP0]], align 4
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK1-NEXT: store i32 [[ADD11]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP16]])
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l24
+// CHECK2-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK2-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4
+// CHECK2-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
+// CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK2: user_code.entry:
+// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP3]], ptr [[ARGC_CASTED]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARGC_CASTED]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP_]], align 4
+// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l24_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]], i32 [[TMP4]], ptr [[TMP0]]) #[[ATTR3:[0-9]+]]
+// CHECK2-NEXT: call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
+// CHECK2-NEXT: ret void
+// CHECK2: worker.exit:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l24_omp_outlined
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[ARGC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[ARGC_CASTED:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 4
+// CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK2-NEXT: store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4
+// CHECK2-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK2: omp.precond.then:
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK2-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK2-NEXT: call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP6]], i32 91, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2: cond.true:
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: br label [[COND_END:%.*]]
+// CHECK2: cond.false:
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END]]
+// CHECK2: cond.end:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK2-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP12]], [[ADD]]
+// CHECK2-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP16]], ptr [[ARGC_CASTED]], align 4
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARGC_CASTED]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP19:%.*]] = inttoptr i32 [[TMP14]] to ptr
+// CHECK2-NEXT: store ptr [[TMP19]], ptr [[TMP18]], align 4
+// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
+// CHECK2-NEXT: [[TMP21:%.*]] = inttoptr i32 [[TMP15]] to ptr
+// CHECK2-NEXT: store ptr [[TMP21]], ptr [[TMP20]], align 4
+// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 2
+// CHECK2-NEXT: [[TMP23:%.*]] = inttoptr i32 [[TMP17]] to ptr
+// CHECK2-NEXT: store ptr [[TMP23]], ptr [[TMP22]], align 4
+// CHECK2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i32 0, i32 3
+// CHECK2-NEXT: store ptr [[TMP0]], ptr [[TMP24]], align 4
+// CHECK2-NEXT: [[TMP25:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
+// CHECK2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP26]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l24_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i32 4)
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
+// CHECK2-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP29]], [[TMP30]]
+// CHECK2-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP31]], [[TMP32]]
+// CHECK2-NEXT: store i32 [[ADD8]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[TMP33]], [[TMP34]]
+// CHECK2-NEXT: br i1 [[CMP9]], label [[COND_TRUE10:%.*]], label [[COND_FALSE11:%.*]]
+// CHECK2: cond.true10:
+// CHECK2-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: br label [[COND_END12:%.*]]
+// CHECK2: cond.false11:
+// CHECK2-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END12]]
+// CHECK2: cond.end12:
+// CHECK2-NEXT: [[COND13:%.*]] = phi i32 [ [[TMP35]], [[COND_TRUE10]] ], [ [[TMP36]], [[COND_FALSE11]] ]
+// CHECK2-NEXT: store i32 [[COND13]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP37]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: [[TMP38:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: [[TMP39:%.*]] = load i32, ptr [[TMP38]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3:[0-9]+]], i32 [[TMP39]])
+// CHECK2-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK2: omp.precond.end:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l24_omp_outlined_omp_outlined
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[ARGC:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK2-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK2-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK2-NEXT: store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4
+// CHECK2-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK2: omp.precond.then:
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3]], i32 [[TMP8]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP9]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK2-NEXT: [[CMP4:%.*]] = icmp ule i32 [[TMP10]], [[TMP11]]
+// CHECK2-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 1
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK2-NEXT: store i32 [[ADD]], ptr [[I3]], align 4
+// CHECK2-NEXT: [[CALL:%.*]] = call noundef i32 @_Z3fooPi(ptr noundef [[I3]]) #[[ATTR5:[0-9]+]]
+// CHECK2-NEXT: [[CALL5:%.*]] = call noundef i32 @_Z3fooPi(ptr noundef [[TMP0]]) #[[ATTR5]]
+// CHECK2-NEXT: [[ADD6:%.*]] = add nsw i32 [[CALL]], [[CALL5]]
+// CHECK2-NEXT: [[CALL7:%.*]] = call noundef i32 @_Z3fooPi(ptr noundef [[ARGC_ADDR]]) #[[ATTR5]]
+// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD6]], [[CALL7]]
+// CHECK2-NEXT: store i32 [[ADD8]], ptr [[TMP0]], align 4
+// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2: omp.body.continue:
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK2-NEXT: store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP16]])
+// CHECK2-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK2: omp.precond.end:
+// CHECK2-NEXT: ret void
+//
diff --git a/clang/test/OpenMP/parallel_generic_loop_codegen.cpp b/clang/test/OpenMP/parallel_generic_loop_codegen.cpp
new file mode 100644
index 00000000000000..f2eedc7f9cad1d
--- /dev/null
+++ b/clang/test/OpenMP/parallel_generic_loop_codegen.cpp
@@ -0,0 +1,224 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp %s
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -emit-llvm %s -o - | FileCheck %s --check-prefix=IR
+
+// Check same results after serialization round-trip
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -emit-pch -o %t %s
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR-PCH
+
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+#define N 64
+int foo() {
+ int x = 0;
+ int result[N] = {0};
+
+ #pragma omp parallel loop num_threads(N) allocate(x) private(x) collapse(2)
+ for (int i = 0; i < N; i++)
+ for (int j = 0; j < N; j++)
+ result[i] = i + j + x;
+ return 0;
+}
+#endif
+// IR-LABEL: define {{[^@]+}}@_Z3foov
+// IR-SAME: () #[[ATTR0:[0-9]+]] {
+// IR-NEXT: entry:
+// IR-NEXT: [[X:%.*]] = alloca i32, align 4
+// IR-NEXT: [[RESULT:%.*]] = alloca [64 x i32], align 16
+// IR-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2:[0-9]+]])
+// IR-NEXT: store i32 0, ptr [[X]], align 4
+// IR-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[RESULT]], i8 0, i64 256, i1 false)
+// IR-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB2]], i32 [[TMP0]], i32 64)
+// IR-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @_Z3foov.omp_outlined, ptr [[RESULT]])
+// IR-NEXT: ret i32 0
+//
+//
+// IR-LABEL: define {{[^@]+}}@_Z3foov.omp_outlined
+// IR-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(256) [[RESULT:%.*]]) #[[ATTR2:[0-9]+]] {
+// IR-NEXT: entry:
+// IR-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[RESULT_ADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// IR-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// IR-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// IR-NEXT: [[I:%.*]] = alloca i32, align 4
+// IR-NEXT: [[J:%.*]] = alloca i32, align 4
+// IR-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// IR-NEXT: store ptr [[RESULT]], ptr [[RESULT_ADDR]], align 8
+// IR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[RESULT_ADDR]], align 8
+// IR-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// IR-NEXT: store i32 4095, ptr [[DOTOMP_UB]], align 4
+// IR-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// IR-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// IR-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// IR-NEXT: [[DOTX__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP2]], i64 4, ptr null)
+// IR-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP2]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// IR-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// IR-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 4095
+// IR-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// IR: cond.true:
+// IR-NEXT: br label [[COND_END:%.*]]
+// IR: cond.false:
+// IR-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// IR-NEXT: br label [[COND_END]]
+// IR: cond.end:
+// IR-NEXT: [[COND:%.*]] = phi i32 [ 4095, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// IR-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// IR-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// IR-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// IR-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// IR: omp.inner.for.cond:
+// IR-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// IR-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// IR-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// IR: omp.inner.for.cond.cleanup:
+// IR-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// IR: omp.inner.for.body:
+// IR-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 64
+// IR-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// IR-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// IR-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// IR-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-NEXT: [[DIV3:%.*]] = sdiv i32 [[TMP10]], 64
+// IR-NEXT: [[MUL4:%.*]] = mul nsw i32 [[DIV3]], 64
+// IR-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL4]]
+// IR-NEXT: [[MUL5:%.*]] = mul nsw i32 [[SUB]], 1
+// IR-NEXT: [[ADD6:%.*]] = add nsw i32 0, [[MUL5]]
+// IR-NEXT: store i32 [[ADD6]], ptr [[J]], align 4
+// IR-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// IR-NEXT: [[TMP12:%.*]] = load i32, ptr [[J]], align 4
+// IR-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// IR-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTX__VOID_ADDR]], align 4
+// IR-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP13]]
+// IR-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4
+// IR-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64
+// IR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [64 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// IR-NEXT: store i32 [[ADD8]], ptr [[ARRAYIDX]], align 4
+// IR-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// IR: omp.body.continue:
+// IR-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// IR: omp.inner.for.inc:
+// IR-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP15]], 1
+// IR-NEXT: store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4
+// IR-NEXT: br label [[OMP_INNER_FOR_COND]]
+// IR: omp.inner.for.end:
+// IR-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// IR: omp.loop.exit:
+// IR-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]])
+// IR-NEXT: call void @__kmpc_free(i32 [[TMP2]], ptr [[DOTX__VOID_ADDR]], ptr null)
+// IR-NEXT: ret void
+//
+//
+// IR-PCH-LABEL: define {{[^@]+}}@_Z3foov
+// IR-PCH-SAME: () #[[ATTR0:[0-9]+]] {
+// IR-PCH-NEXT: entry:
+// IR-PCH-NEXT: [[X:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[RESULT:%.*]] = alloca [64 x i32], align 16
+// IR-PCH-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2:[0-9]+]])
+// IR-PCH-NEXT: store i32 0, ptr [[X]], align 4
+// IR-PCH-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[RESULT]], i8 0, i64 256, i1 false)
+// IR-PCH-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB2]], i32 [[TMP0]], i32 64)
+// IR-PCH-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @_Z3foov.omp_outlined, ptr [[RESULT]])
+// IR-PCH-NEXT: ret i32 0
+//
+//
+// IR-PCH-LABEL: define {{[^@]+}}@_Z3foov.omp_outlined
+// IR-PCH-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(256) [[RESULT:%.*]]) #[[ATTR2:[0-9]+]] {
+// IR-PCH-NEXT: entry:
+// IR-PCH-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[RESULT_ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[I:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[J:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// IR-PCH-NEXT: store ptr [[RESULT]], ptr [[RESULT_ADDR]], align 8
+// IR-PCH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[RESULT_ADDR]], align 8
+// IR-PCH-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// IR-PCH-NEXT: store i32 4095, ptr [[DOTOMP_UB]], align 4
+// IR-PCH-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// IR-PCH-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// IR-PCH-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// IR-PCH-NEXT: [[DOTX__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP2]], i64 4, ptr null)
+// IR-PCH-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP2]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// IR-PCH-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// IR-PCH-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 4095
+// IR-PCH-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// IR-PCH: cond.true:
+// IR-PCH-NEXT: br label [[COND_END:%.*]]
+// IR-PCH: cond.false:
+// IR-PCH-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// IR-PCH-NEXT: br label [[COND_END]]
+// IR-PCH: cond.end:
+// IR-PCH-NEXT: [[COND:%.*]] = phi i32 [ 4095, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// IR-PCH-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// IR-PCH-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// IR-PCH-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// IR-PCH-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// IR-PCH: omp.inner.for.cond:
+// IR-PCH-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-PCH-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// IR-PCH-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// IR-PCH-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// IR-PCH: omp.inner.for.cond.cleanup:
+// IR-PCH-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// IR-PCH: omp.inner.for.body:
+// IR-PCH-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-PCH-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP8]], 64
+// IR-PCH-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// IR-PCH-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// IR-PCH-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// IR-PCH-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-PCH-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-PCH-NEXT: [[DIV3:%.*]] = sdiv i32 [[TMP10]], 64
+// IR-PCH-NEXT: [[MUL4:%.*]] = mul nsw i32 [[DIV3]], 64
+// IR-PCH-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP9]], [[MUL4]]
+// IR-PCH-NEXT: [[MUL5:%.*]] = mul nsw i32 [[SUB]], 1
+// IR-PCH-NEXT: [[ADD6:%.*]] = add nsw i32 0, [[MUL5]]
+// IR-PCH-NEXT: store i32 [[ADD6]], ptr [[J]], align 4
+// IR-PCH-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// IR-PCH-NEXT: [[TMP12:%.*]] = load i32, ptr [[J]], align 4
+// IR-PCH-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// IR-PCH-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTX__VOID_ADDR]], align 4
+// IR-PCH-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD7]], [[TMP13]]
+// IR-PCH-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4
+// IR-PCH-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64
+// IR-PCH-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [64 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// IR-PCH-NEXT: store i32 [[ADD8]], ptr [[ARRAYIDX]], align 4
+// IR-PCH-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// IR-PCH: omp.body.continue:
+// IR-PCH-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// IR-PCH: omp.inner.for.inc:
+// IR-PCH-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-PCH-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP15]], 1
+// IR-PCH-NEXT: store i32 [[ADD9]], ptr [[DOTOMP_IV]], align 4
+// IR-PCH-NEXT: br label [[OMP_INNER_FOR_COND]]
+// IR-PCH: omp.inner.for.end:
+// IR-PCH-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// IR-PCH: omp.loop.exit:
+// IR-PCH-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]])
+// IR-PCH-NEXT: call void @__kmpc_free(i32 [[TMP2]], ptr [[DOTX__VOID_ADDR]], ptr null)
+// IR-PCH-NEXT: ret void
+//
diff --git a/clang/test/OpenMP/target_parallel_generic_loop_codegen-1.cpp b/clang/test/OpenMP/target_parallel_generic_loop_codegen-1.cpp
new file mode 100644
index 00000000000000..3220562510e323
--- /dev/null
+++ b/clang/test/OpenMP/target_parallel_generic_loop_codegen-1.cpp
@@ -0,0 +1,8390 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// Test host codegen.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+
+// Test target parallel loop codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+
+// Check that no target code is emitted if no omptests flag was provided.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-NTARGET
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY2 %s
+
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix OMP-DEFAULT
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix OMP-DEFAULT
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix OMP-DEFAULT
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix OMP-DEFAULT
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+
+// Test target parallel loop codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK
+
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+
+// Check that no target code is emitted if no omptests flag was provided.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-NTARGET-OMP-DEFAULT
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY2 %s
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+
+
+
+
+// We have 7 target regions
+
+
+
+
+
+// We have 4 initializers, one for the 500 priority, another one for 501, or more for the default priority, and the last one for the offloading registration function.
+
+
+extern int *R;
+
+struct SA {
+ int arr[4];
+ void foo() {
+ int a = *R;
+ a += 1;
+ *R = a;
+ }
+ SA() {
+ int a = *R;
+ a += 2;
+ *R = a;
+ }
+ ~SA() {
+ int a = *R;
+ a += 3;
+ *R = a;
+ }
+};
+
+struct SB {
+ int arr[8];
+ void foo() {
+ int a = *R;
+ #pragma omp target parallel loop
+ for (int i = 0; i < 10; ++i)
+ a += 4;
+ *R = a;
+ }
+ SB() {
+ int a = *R;
+ a += 5;
+ *R = a;
+ }
+ ~SB() {
+ int a = *R;
+ a += 6;
+ *R = a;
+ }
+};
+
+struct SC {
+ int arr[16];
+ void foo() {
+ int a = *R;
+ a += 7;
+ *R = a;
+ }
+ SC() {
+ int a = *R;
+ #pragma omp target parallel loop
+ for (int i = 0; i < 10; ++i)
+ a += 8;
+ *R = a;
+ }
+ ~SC() {
+ int a = *R;
+ a += 9;
+ *R = a;
+ }
+};
+
+struct SD {
+ int arr[32];
+ void foo() {
+ int a = *R;
+ a += 10;
+ *R = a;
+ }
+ SD() {
+ int a = *R;
+ a += 11;
+ *R = a;
+ }
+ ~SD() {
+ int a = *R;
+ #pragma omp target parallel loop
+ for (int i = 0; i < 10; ++i)
+ a += 12;
+ *R = a;
+ }
+};
+
+struct SE {
+ int arr[64];
+ void foo() {
+ int a = *R;
+ #pragma omp target parallel loop if(target: 0)
+ for (int i = 0; i < 10; ++i)
+ a += 13;
+ *R = a;
+ }
+ SE() {
+ int a = *R;
+ #pragma omp target parallel loop
+ for (int i = 0; i < 10; ++i)
+ a += 14;
+ *R = a;
+ }
+ ~SE() {
+ int a = *R;
+ #pragma omp target parallel loop
+ for (int i = 0; i < 10; ++i)
+ a += 15;
+ *R = a;
+ }
+};
+
+template <int x>
+struct ST {
+ int arr[128 + x];
+ void foo() {
+ int a = *R;
+ #pragma omp target parallel loop
+ for (int i = 0; i < 10; ++i)
+ a += 16 + x;
+ *R = a;
+ }
+ ST() {
+ int a = *R;
+ #pragma omp target parallel loop
+ for (int i = 0; i < 10; ++i)
+ a += 17 + x;
+ *R = a;
+ }
+ ~ST() {
+ int a = *R;
+ #pragma omp target parallel loop
+ for (int i = 0; i < 10; ++i)
+ a += 18 + x;
+ *R = a;
+ }
+};
+
+// We have to make sure we use all the target regions:
+
+
+
+
+// We have 2 initializers with priority 500
+
+// We have 1 initializers with priority 501
+
+// We have 6 initializers with default priority
+
+static __attribute__((init_priority(500))) SA a1;
+SA a2;
+SB __attribute__((init_priority(500))) b1;
+SB __attribute__((init_priority(501))) b2;
+static SC c1;
+SD d1;
+SE e1;
+ST<100> t1;
+ST<1000> t2;
+
+
+int bar(int a){
+ int r = a;
+
+ a1.foo();
+ a2.foo();
+ b1.foo();
+ b2.foo();
+ c1.foo();
+ d1.foo();
+ e1.foo();
+ t1.foo();
+ t2.foo();
+
+ #pragma omp target parallel loop
+ for (int i = 0; i < 10; ++i)
+ ++r;
+
+ return r + *R;
+}
+
+// Check metadata is properly generated:
+
+
+#endif
+// CHECK-LABEL: define {{[^@]+}}@__cxx_global_var_init
+// CHECK-SAME: () #[[ATTR4:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @_ZN2SAC1Ev(ptr noundef nonnull align 4 dereferenceable(16) @_ZL2a1)
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SAD1Ev, ptr @_ZL2a1, ptr @__dso_handle) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT: ret void
+//
+//
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__cxx_global_var_init.1
+// CHECK-SAME: () #[[ATTR4]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @_ZN2SAC1Ev(ptr noundef nonnull align 4 dereferenceable(16) @a2)
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SAD1Ev, ptr @a2, ptr @__dso_handle) #[[ATTR3]]
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__cxx_global_var_init.2
+// CHECK-SAME: () #[[ATTR4]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @_ZN2SBC1Ev(ptr noundef nonnull align 4 dereferenceable(32) @b1)
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SBD1Ev, ptr @b1, ptr @__dso_handle) #[[ATTR3]]
+// CHECK-NEXT: ret void
+//
+//
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__cxx_global_var_init.3
+// CHECK-SAME: () #[[ATTR4]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @_ZN2SBC1Ev(ptr noundef nonnull align 4 dereferenceable(32) @b2)
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SBD1Ev, ptr @b2, ptr @__dso_handle) #[[ATTR3]]
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__cxx_global_var_init.4
+// CHECK-SAME: () #[[ATTR4]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @_ZN2SCC1Ev(ptr noundef nonnull align 4 dereferenceable(64) @_ZL2c1)
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SCD1Ev, ptr @_ZL2c1, ptr @__dso_handle) #[[ATTR3]]
+// CHECK-NEXT: ret void
+//
+//
+//
+//
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__cxx_global_var_init.7
+// CHECK-SAME: () #[[ATTR4]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @_ZN2SDC1Ev(ptr noundef nonnull align 4 dereferenceable(128) @d1)
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SDD1Ev, ptr @d1, ptr @__dso_handle) #[[ATTR3]]
+// CHECK-NEXT: ret void
+//
+//
+//
+//
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__cxx_global_var_init.10
+// CHECK-SAME: () #[[ATTR4]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @_ZN2SEC1Ev(ptr noundef nonnull align 4 dereferenceable(256) @e1)
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SED1Ev, ptr @e1, ptr @__dso_handle) #[[ATTR3]]
+// CHECK-NEXT: ret void
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__cxx_global_var_init.15
+// CHECK-SAME: () #[[ATTR4]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @_ZN2STILi100EEC1Ev(ptr noundef nonnull align 4 dereferenceable(912) @t1)
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2STILi100EED1Ev, ptr @t1, ptr @__dso_handle) #[[ATTR3]]
+// CHECK-NEXT: ret void
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__cxx_global_var_init.20
+// CHECK-SAME: () #[[ATTR4]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @_ZN2STILi1000EEC1Ev(ptr noundef nonnull align 4 dereferenceable(4512) @t2)
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2STILi1000EED1Ev, ptr @t2, ptr @__dso_handle) #[[ATTR3]]
+// CHECK-NEXT: ret void
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_GLOBAL__I_000500
+// CHECK-SAME: () #[[ATTR4]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @__cxx_global_var_init()
+// CHECK-NEXT: call void @__cxx_global_var_init.2()
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_GLOBAL__I_000501
+// CHECK-SAME: () #[[ATTR4]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @__cxx_global_var_init.3()
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_target_parallel_generic_loop_codegen_1.cpp
+// CHECK-SAME: () #[[ATTR4]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @__cxx_global_var_init.1()
+// CHECK-NEXT: call void @__cxx_global_var_init.4()
+// CHECK-NEXT: call void @__cxx_global_var_init.7()
+// CHECK-NEXT: call void @__cxx_global_var_init.10()
+// CHECK-NEXT: call void @__cxx_global_var_init.15()
+// CHECK-NEXT: call void @__cxx_global_var_init.20()
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK-SAME: () #[[ATTR4]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK-NEXT: ret void
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+// SIMD-ONLY0-LABEL: define {{[^@]+}}@__cxx_global_var_init
+// SIMD-ONLY0-SAME: () #[[ATTR0:[0-9]+]] {
+// SIMD-ONLY0-NEXT: entry:
+// SIMD-ONLY0-NEXT: call void @_ZN2SAC1Ev(ptr noundef nonnull align 4 dereferenceable(16) @_ZL2a1)
+// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SAD1Ev, ptr @_ZL2a1, ptr @__dso_handle) #[[ATTR2:[0-9]+]]
+// SIMD-ONLY0-NEXT: ret void
+//
+//
+//
+//
+// SIMD-ONLY0-LABEL: define {{[^@]+}}@__cxx_global_var_init.1
+// SIMD-ONLY0-SAME: () #[[ATTR0]] {
+// SIMD-ONLY0-NEXT: entry:
+// SIMD-ONLY0-NEXT: call void @_ZN2SAC1Ev(ptr noundef nonnull align 4 dereferenceable(16) @a2)
+// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SAD1Ev, ptr @a2, ptr @__dso_handle) #[[ATTR2]]
+// SIMD-ONLY0-NEXT: ret void
+//
+//
+// SIMD-ONLY0-LABEL: define {{[^@]+}}@__cxx_global_var_init.2
+// SIMD-ONLY0-SAME: () #[[ATTR0]] {
+// SIMD-ONLY0-NEXT: entry:
+// SIMD-ONLY0-NEXT: call void @_ZN2SBC1Ev(ptr noundef nonnull align 4 dereferenceable(32) @b1)
+// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SBD1Ev, ptr @b1, ptr @__dso_handle) #[[ATTR2]]
+// SIMD-ONLY0-NEXT: ret void
+//
+//
+//
+//
+// SIMD-ONLY0-LABEL: define {{[^@]+}}@__cxx_global_var_init.3
+// SIMD-ONLY0-SAME: () #[[ATTR0]] {
+// SIMD-ONLY0-NEXT: entry:
+// SIMD-ONLY0-NEXT: call void @_ZN2SBC1Ev(ptr noundef nonnull align 4 dereferenceable(32) @b2)
+// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SBD1Ev, ptr @b2, ptr @__dso_handle) #[[ATTR2]]
+// SIMD-ONLY0-NEXT: ret void
+//
+//
+// SIMD-ONLY0-LABEL: define {{[^@]+}}@__cxx_global_var_init.4
+// SIMD-ONLY0-SAME: () #[[ATTR0]] {
+// SIMD-ONLY0-NEXT: entry:
+// SIMD-ONLY0-NEXT: call void @_ZN2SCC1Ev(ptr noundef nonnull align 4 dereferenceable(64) @_ZL2c1)
+// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SCD1Ev, ptr @_ZL2c1, ptr @__dso_handle) #[[ATTR2]]
+// SIMD-ONLY0-NEXT: ret void
+//
+//
+//
+//
+// SIMD-ONLY0-LABEL: define {{[^@]+}}@__cxx_global_var_init.5
+// SIMD-ONLY0-SAME: () #[[ATTR0]] {
+// SIMD-ONLY0-NEXT: entry:
+// SIMD-ONLY0-NEXT: call void @_ZN2SDC1Ev(ptr noundef nonnull align 4 dereferenceable(128) @d1)
+// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SDD1Ev, ptr @d1, ptr @__dso_handle) #[[ATTR2]]
+// SIMD-ONLY0-NEXT: ret void
+//
+//
+//
+//
+// SIMD-ONLY0-LABEL: define {{[^@]+}}@__cxx_global_var_init.6
+// SIMD-ONLY0-SAME: () #[[ATTR0]] {
+// SIMD-ONLY0-NEXT: entry:
+// SIMD-ONLY0-NEXT: call void @_ZN2SEC1Ev(ptr noundef nonnull align 4 dereferenceable(256) @e1)
+// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SED1Ev, ptr @e1, ptr @__dso_handle) #[[ATTR2]]
+// SIMD-ONLY0-NEXT: ret void
+//
+//
+//
+//
+// SIMD-ONLY0-LABEL: define {{[^@]+}}@__cxx_global_var_init.7
+// SIMD-ONLY0-SAME: () #[[ATTR0]] {
+// SIMD-ONLY0-NEXT: entry:
+// SIMD-ONLY0-NEXT: call void @_ZN2STILi100EEC1Ev(ptr noundef nonnull align 4 dereferenceable(912) @t1)
+// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2STILi100EED1Ev, ptr @t1, ptr @__dso_handle) #[[ATTR2]]
+// SIMD-ONLY0-NEXT: ret void
+//
+//
+//
+//
+// SIMD-ONLY0-LABEL: define {{[^@]+}}@__cxx_global_var_init.8
+// SIMD-ONLY0-SAME: () #[[ATTR0]] {
+// SIMD-ONLY0-NEXT: entry:
+// SIMD-ONLY0-NEXT: call void @_ZN2STILi1000EEC1Ev(ptr noundef nonnull align 4 dereferenceable(4512) @t2)
+// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2STILi1000EED1Ev, ptr @t2, ptr @__dso_handle) #[[ATTR2]]
+// SIMD-ONLY0-NEXT: ret void
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+// SIMD-ONLY0-LABEL: define {{[^@]+}}@_GLOBAL__I_000500
+// SIMD-ONLY0-SAME: () #[[ATTR0]] {
+// SIMD-ONLY0-NEXT: entry:
+// SIMD-ONLY0-NEXT: call void @__cxx_global_var_init()
+// SIMD-ONLY0-NEXT: call void @__cxx_global_var_init.2()
+// SIMD-ONLY0-NEXT: ret void
+//
+//
+// SIMD-ONLY0-LABEL: define {{[^@]+}}@_GLOBAL__I_000501
+// SIMD-ONLY0-SAME: () #[[ATTR0]] {
+// SIMD-ONLY0-NEXT: entry:
+// SIMD-ONLY0-NEXT: call void @__cxx_global_var_init.3()
+// SIMD-ONLY0-NEXT: ret void
+//
+//
+// SIMD-ONLY0-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_target_parallel_generic_loop_codegen_1.cpp
+// SIMD-ONLY0-SAME: () #[[ATTR0]] {
+// SIMD-ONLY0-NEXT: entry:
+// SIMD-ONLY0-NEXT: call void @__cxx_global_var_init.1()
+// SIMD-ONLY0-NEXT: call void @__cxx_global_var_init.4()
+// SIMD-ONLY0-NEXT: call void @__cxx_global_var_init.5()
+// SIMD-ONLY0-NEXT: call void @__cxx_global_var_init.6()
+// SIMD-ONLY0-NEXT: call void @__cxx_global_var_init.7()
+// SIMD-ONLY0-NEXT: call void @__cxx_global_var_init.8()
+// SIMD-ONLY0-NEXT: ret void
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+// SIMD-ONLY1-LABEL: define {{[^@]+}}@__cxx_global_var_init
+// SIMD-ONLY1-SAME: () #[[ATTR0:[0-9]+]] {
+// SIMD-ONLY1-NEXT: entry:
+// SIMD-ONLY1-NEXT: call void @_ZN2SAC1Ev(ptr noundef nonnull align 4 dereferenceable(16) @_ZL2a1)
+// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SAD1Ev, ptr @_ZL2a1, ptr @__dso_handle) #[[ATTR2:[0-9]+]]
+// SIMD-ONLY1-NEXT: ret void
+//
+//
+//
+//
+// SIMD-ONLY1-LABEL: define {{[^@]+}}@__cxx_global_var_init.1
+// SIMD-ONLY1-SAME: () #[[ATTR0]] {
+// SIMD-ONLY1-NEXT: entry:
+// SIMD-ONLY1-NEXT: call void @_ZN2SAC1Ev(ptr noundef nonnull align 4 dereferenceable(16) @a2)
+// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SAD1Ev, ptr @a2, ptr @__dso_handle) #[[ATTR2]]
+// SIMD-ONLY1-NEXT: ret void
+//
+//
+// SIMD-ONLY1-LABEL: define {{[^@]+}}@__cxx_global_var_init.2
+// SIMD-ONLY1-SAME: () #[[ATTR0]] {
+// SIMD-ONLY1-NEXT: entry:
+// SIMD-ONLY1-NEXT: call void @_ZN2SBC1Ev(ptr noundef nonnull align 4 dereferenceable(32) @b1)
+// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SBD1Ev, ptr @b1, ptr @__dso_handle) #[[ATTR2]]
+// SIMD-ONLY1-NEXT: ret void
+//
+//
+//
+//
+// SIMD-ONLY1-LABEL: define {{[^@]+}}@__cxx_global_var_init.3
+// SIMD-ONLY1-SAME: () #[[ATTR0]] {
+// SIMD-ONLY1-NEXT: entry:
+// SIMD-ONLY1-NEXT: call void @_ZN2SBC1Ev(ptr noundef nonnull align 4 dereferenceable(32) @b2)
+// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SBD1Ev, ptr @b2, ptr @__dso_handle) #[[ATTR2]]
+// SIMD-ONLY1-NEXT: ret void
+//
+//
+// SIMD-ONLY1-LABEL: define {{[^@]+}}@__cxx_global_var_init.4
+// SIMD-ONLY1-SAME: () #[[ATTR0]] {
+// SIMD-ONLY1-NEXT: entry:
+// SIMD-ONLY1-NEXT: call void @_ZN2SCC1Ev(ptr noundef nonnull align 4 dereferenceable(64) @_ZL2c1)
+// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SCD1Ev, ptr @_ZL2c1, ptr @__dso_handle) #[[ATTR2]]
+// SIMD-ONLY1-NEXT: ret void
+//
+//
+//
+//
+// SIMD-ONLY1-LABEL: define {{[^@]+}}@__cxx_global_var_init.5
+// SIMD-ONLY1-SAME: () #[[ATTR0]] {
+// SIMD-ONLY1-NEXT: entry:
+// SIMD-ONLY1-NEXT: call void @_ZN2SDC1Ev(ptr noundef nonnull align 4 dereferenceable(128) @d1)
+// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SDD1Ev, ptr @d1, ptr @__dso_handle) #[[ATTR2]]
+// SIMD-ONLY1-NEXT: ret void
+//
+//
+//
+//
+// SIMD-ONLY1-LABEL: define {{[^@]+}}@__cxx_global_var_init.6
+// SIMD-ONLY1-SAME: () #[[ATTR0]] {
+// SIMD-ONLY1-NEXT: entry:
+// SIMD-ONLY1-NEXT: call void @_ZN2SEC1Ev(ptr noundef nonnull align 4 dereferenceable(256) @e1)
+// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SED1Ev, ptr @e1, ptr @__dso_handle) #[[ATTR2]]
+// SIMD-ONLY1-NEXT: ret void
+//
+//
+//
+//
+// SIMD-ONLY1-LABEL: define {{[^@]+}}@__cxx_global_var_init.7
+// SIMD-ONLY1-SAME: () #[[ATTR0]] {
+// SIMD-ONLY1-NEXT: entry:
+// SIMD-ONLY1-NEXT: call void @_ZN2STILi100EEC1Ev(ptr noundef nonnull align 4 dereferenceable(912) @t1)
+// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2STILi100EED1Ev, ptr @t1, ptr @__dso_handle) #[[ATTR2]]
+// SIMD-ONLY1-NEXT: ret void
+//
+//
+//
+//
+// SIMD-ONLY1-LABEL: define {{[^@]+}}@__cxx_global_var_init.8
+// SIMD-ONLY1-SAME: () #[[ATTR0]] {
+// SIMD-ONLY1-NEXT: entry:
+// SIMD-ONLY1-NEXT: call void @_ZN2STILi1000EEC1Ev(ptr noundef nonnull align 4 dereferenceable(4512) @t2)
+// SIMD-ONLY1-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2STILi1000EED1Ev, ptr @t2, ptr @__dso_handle) #[[ATTR2]]
+// SIMD-ONLY1-NEXT: ret void
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+// SIMD-ONLY1-LABEL: define {{[^@]+}}@_GLOBAL__I_000500
+// SIMD-ONLY1-SAME: () #[[ATTR0]] {
+// SIMD-ONLY1-NEXT: entry:
+// SIMD-ONLY1-NEXT: call void @__cxx_global_var_init()
+// SIMD-ONLY1-NEXT: call void @__cxx_global_var_init.2()
+// SIMD-ONLY1-NEXT: ret void
+//
+//
+// SIMD-ONLY1-LABEL: define {{[^@]+}}@_GLOBAL__I_000501
+// SIMD-ONLY1-SAME: () #[[ATTR0]] {
+// SIMD-ONLY1-NEXT: entry:
+// SIMD-ONLY1-NEXT: call void @__cxx_global_var_init.3()
+// SIMD-ONLY1-NEXT: ret void
+//
+//
+// SIMD-ONLY1-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_target_parallel_generic_loop_codegen_1.cpp
+// SIMD-ONLY1-SAME: () #[[ATTR0]] {
+// SIMD-ONLY1-NEXT: entry:
+// SIMD-ONLY1-NEXT: call void @__cxx_global_var_init.1()
+// SIMD-ONLY1-NEXT: call void @__cxx_global_var_init.4()
+// SIMD-ONLY1-NEXT: call void @__cxx_global_var_init.5()
+// SIMD-ONLY1-NEXT: call void @__cxx_global_var_init.6()
+// SIMD-ONLY1-NEXT: call void @__cxx_global_var_init.7()
+// SIMD-ONLY1-NEXT: call void @__cxx_global_var_init.8()
+// SIMD-ONLY1-NEXT: ret void
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_Z3bari
+// CHECK-NTARGET-SAME: (i32 noundef signext [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[R:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[R_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP0]], ptr [[R]], align 4
+// CHECK-NTARGET-NEXT: call void @_ZN2SA3fooEv(ptr noundef nonnull align 4 dereferenceable(16) @_ZL2a1)
+// CHECK-NTARGET-NEXT: call void @_ZN2SA3fooEv(ptr noundef nonnull align 4 dereferenceable(16) @a2)
+// CHECK-NTARGET-NEXT: call void @_ZN2SB3fooEv(ptr noundef nonnull align 4 dereferenceable(32) @b1)
+// CHECK-NTARGET-NEXT: call void @_ZN2SB3fooEv(ptr noundef nonnull align 4 dereferenceable(32) @b2)
+// CHECK-NTARGET-NEXT: call void @_ZN2SC3fooEv(ptr noundef nonnull align 4 dereferenceable(64) @_ZL2c1)
+// CHECK-NTARGET-NEXT: call void @_ZN2SD3fooEv(ptr noundef nonnull align 4 dereferenceable(128) @d1)
+// CHECK-NTARGET-NEXT: call void @_ZN2SE3fooEv(ptr noundef nonnull align 4 dereferenceable(256) @e1)
+// CHECK-NTARGET-NEXT: call void @_ZN2STILi100EE3fooEv(ptr noundef nonnull align 4 dereferenceable(912) @t1)
+// CHECK-NTARGET-NEXT: call void @_ZN2STILi1000EE3fooEv(ptr noundef nonnull align 4 dereferenceable(4512) @t2)
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[R]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP1]], ptr [[R_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i64, ptr [[R_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3bari_l267(i64 [[TMP2]]) #[[ATTR2:[0-9]+]]
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i32, ptr [[R]], align 4
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+// CHECK-NTARGET-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], [[TMP5]]
+// CHECK-NTARGET-NEXT: ret i32 [[ADD]]
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SA3fooEv
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 1
+// CHECK-NTARGET-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SB3fooEv
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SB3fooEv_l122(i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SC3fooEv
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 7
+// CHECK-NTARGET-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SD3fooEv
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 10
+// CHECK-NTARGET-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SE3fooEv
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SE3fooEv_l185(i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2STILi100EE3fooEv
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EE3fooEv_l211(i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2STILi1000EE3fooEv
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EE3fooEv_l211(i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3bari_l267
+// CHECK-NTARGET-SAME: (i64 noundef [[R:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[R_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[R_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store i64 [[R]], ptr [[R_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load i32, ptr [[R_ADDR]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP0]], ptr [[R_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i64, ptr [[R_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2:[0-9]+]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3bari_l267.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3bari_l267.omp_outlined
+// CHECK-NTARGET-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[R:%.*]]) #[[ATTR1]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[R_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i64 [[R]], ptr [[R_ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET: cond.true:
+// CHECK-NTARGET-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET: cond.false:
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: br label [[COND_END]]
+// CHECK-NTARGET: cond.end:
+// CHECK-NTARGET-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET: omp.inner.for.cond:
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET: omp.inner.for.body:
+// CHECK-NTARGET-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-NEXT: [[TMP8:%.*]] = load i32, ptr [[R_ADDR]], align 4
+// CHECK-NTARGET-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK-NTARGET-NEXT: store i32 [[INC]], ptr [[R_ADDR]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET: omp.body.continue:
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET: omp.inner.for.inc:
+// CHECK-NTARGET-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-NEXT: store i32 [[ADD2]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET: omp.inner.for.end:
+// CHECK-NTARGET-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET: omp.loop.exit:
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@__cxx_global_var_init
+// CHECK-NTARGET-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: call void @_ZN2SAC1Ev(ptr noundef nonnull align 4 dereferenceable(16) @_ZL2a1)
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SAD1Ev, ptr @_ZL2a1, ptr @__dso_handle) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SAC1Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR4:[0-9]+]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: call void @_ZN2SAC2Ev(ptr noundef nonnull align 4 dereferenceable(16) [[THIS1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SAD1Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: call void @_ZN2SAD2Ev(ptr noundef nonnull align 4 dereferenceable(16) [[THIS1]]) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SAC2Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 2
+// CHECK-NTARGET-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SAD2Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 3
+// CHECK-NTARGET-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@__cxx_global_var_init.1
+// CHECK-NTARGET-SAME: () #[[ATTR3]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: call void @_ZN2SAC1Ev(ptr noundef nonnull align 4 dereferenceable(16) @a2)
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SAD1Ev, ptr @a2, ptr @__dso_handle) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@__cxx_global_var_init.2
+// CHECK-NTARGET-SAME: () #[[ATTR3]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: call void @_ZN2SBC1Ev(ptr noundef nonnull align 4 dereferenceable(32) @b1)
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SBD1Ev, ptr @b1, ptr @__dso_handle) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SBC1Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: call void @_ZN2SBC2Ev(ptr noundef nonnull align 4 dereferenceable(32) [[THIS1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SBD1Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: call void @_ZN2SBD2Ev(ptr noundef nonnull align 4 dereferenceable(32) [[THIS1]]) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SBC2Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 5
+// CHECK-NTARGET-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SBD2Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 6
+// CHECK-NTARGET-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@__cxx_global_var_init.3
+// CHECK-NTARGET-SAME: () #[[ATTR3]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: call void @_ZN2SBC1Ev(ptr noundef nonnull align 4 dereferenceable(32) @b2)
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SBD1Ev, ptr @b2, ptr @__dso_handle) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@__cxx_global_var_init.4
+// CHECK-NTARGET-SAME: () #[[ATTR3]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: call void @_ZN2SCC1Ev(ptr noundef nonnull align 4 dereferenceable(64) @_ZL2c1)
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SCD1Ev, ptr @_ZL2c1, ptr @__dso_handle) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SCC1Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: call void @_ZN2SCC2Ev(ptr noundef nonnull align 4 dereferenceable(64) [[THIS1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SCD1Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: call void @_ZN2SCD2Ev(ptr noundef nonnull align 4 dereferenceable(64) [[THIS1]]) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SCC2Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SCC1Ev_l148(i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SCC1Ev_l148
+// CHECK-NTARGET-SAME: (i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SCC1Ev_l148.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SCC1Ev_l148.omp_outlined
+// CHECK-NTARGET-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET: cond.true:
+// CHECK-NTARGET-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET: cond.false:
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: br label [[COND_END]]
+// CHECK-NTARGET: cond.end:
+// CHECK-NTARGET-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET: omp.inner.for.cond:
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET: omp.inner.for.body:
+// CHECK-NTARGET-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 8
+// CHECK-NTARGET-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET: omp.body.continue:
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET: omp.inner.for.inc:
+// CHECK-NTARGET-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET: omp.inner.for.end:
+// CHECK-NTARGET-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET: omp.loop.exit:
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SCD2Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 9
+// CHECK-NTARGET-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@__cxx_global_var_init.5
+// CHECK-NTARGET-SAME: () #[[ATTR3]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: call void @_ZN2SDC1Ev(ptr noundef nonnull align 4 dereferenceable(128) @d1)
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SDD1Ev, ptr @d1, ptr @__dso_handle) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SDC1Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: call void @_ZN2SDC2Ev(ptr noundef nonnull align 4 dereferenceable(128) [[THIS1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SDD1Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: call void @_ZN2SDD2Ev(ptr noundef nonnull align 4 dereferenceable(128) [[THIS1]]) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SDC2Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 11
+// CHECK-NTARGET-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SDD2Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SDD1Ev_l174(i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SDD1Ev_l174
+// CHECK-NTARGET-SAME: (i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SDD1Ev_l174.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SDD1Ev_l174.omp_outlined
+// CHECK-NTARGET-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET: cond.true:
+// CHECK-NTARGET-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET: cond.false:
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: br label [[COND_END]]
+// CHECK-NTARGET: cond.end:
+// CHECK-NTARGET-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET: omp.inner.for.cond:
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET: omp.inner.for.body:
+// CHECK-NTARGET-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 12
+// CHECK-NTARGET-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET: omp.body.continue:
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET: omp.inner.for.inc:
+// CHECK-NTARGET-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET: omp.inner.for.end:
+// CHECK-NTARGET-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET: omp.loop.exit:
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@__cxx_global_var_init.6
+// CHECK-NTARGET-SAME: () #[[ATTR3]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: call void @_ZN2SEC1Ev(ptr noundef nonnull align 4 dereferenceable(256) @e1)
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SED1Ev, ptr @e1, ptr @__dso_handle) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SEC1Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: call void @_ZN2SEC2Ev(ptr noundef nonnull align 4 dereferenceable(256) [[THIS1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SED1Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: call void @_ZN2SED2Ev(ptr noundef nonnull align 4 dereferenceable(256) [[THIS1]]) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SEC2Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SEC1Ev_l192(i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SEC1Ev_l192
+// CHECK-NTARGET-SAME: (i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SEC1Ev_l192.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SEC1Ev_l192.omp_outlined
+// CHECK-NTARGET-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET: cond.true:
+// CHECK-NTARGET-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET: cond.false:
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: br label [[COND_END]]
+// CHECK-NTARGET: cond.end:
+// CHECK-NTARGET-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET: omp.inner.for.cond:
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET: omp.inner.for.body:
+// CHECK-NTARGET-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 14
+// CHECK-NTARGET-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET: omp.body.continue:
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET: omp.inner.for.inc:
+// CHECK-NTARGET-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET: omp.inner.for.end:
+// CHECK-NTARGET-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET: omp.loop.exit:
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2SED2Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SED1Ev_l199(i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SED1Ev_l199
+// CHECK-NTARGET-SAME: (i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SED1Ev_l199.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SED1Ev_l199.omp_outlined
+// CHECK-NTARGET-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET: cond.true:
+// CHECK-NTARGET-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET: cond.false:
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: br label [[COND_END]]
+// CHECK-NTARGET: cond.end:
+// CHECK-NTARGET-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET: omp.inner.for.cond:
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET: omp.inner.for.body:
+// CHECK-NTARGET-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 15
+// CHECK-NTARGET-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET: omp.body.continue:
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET: omp.inner.for.inc:
+// CHECK-NTARGET-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET: omp.inner.for.end:
+// CHECK-NTARGET-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET: omp.loop.exit:
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@__cxx_global_var_init.7
+// CHECK-NTARGET-SAME: () #[[ATTR3]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: call void @_ZN2STILi100EEC1Ev(ptr noundef nonnull align 4 dereferenceable(912) @t1)
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2STILi100EED1Ev, ptr @t1, ptr @__dso_handle) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2STILi100EEC1Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: call void @_ZN2STILi100EEC2Ev(ptr noundef nonnull align 4 dereferenceable(912) [[THIS1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2STILi100EED1Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: call void @_ZN2STILi100EED2Ev(ptr noundef nonnull align 4 dereferenceable(912) [[THIS1]]) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2STILi100EEC2Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EEC1Ev_l218(i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EEC1Ev_l218
+// CHECK-NTARGET-SAME: (i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EEC1Ev_l218.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EEC1Ev_l218.omp_outlined
+// CHECK-NTARGET-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET: cond.true:
+// CHECK-NTARGET-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET: cond.false:
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: br label [[COND_END]]
+// CHECK-NTARGET: cond.end:
+// CHECK-NTARGET-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET: omp.inner.for.cond:
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET: omp.inner.for.body:
+// CHECK-NTARGET-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 117
+// CHECK-NTARGET-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET: omp.body.continue:
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET: omp.inner.for.inc:
+// CHECK-NTARGET-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET: omp.inner.for.end:
+// CHECK-NTARGET-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET: omp.loop.exit:
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2STILi100EED2Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EED1Ev_l225(i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EED1Ev_l225
+// CHECK-NTARGET-SAME: (i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EED1Ev_l225.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EED1Ev_l225.omp_outlined
+// CHECK-NTARGET-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET: cond.true:
+// CHECK-NTARGET-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET: cond.false:
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: br label [[COND_END]]
+// CHECK-NTARGET: cond.end:
+// CHECK-NTARGET-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET: omp.inner.for.cond:
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET: omp.inner.for.body:
+// CHECK-NTARGET-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 118
+// CHECK-NTARGET-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET: omp.body.continue:
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET: omp.inner.for.inc:
+// CHECK-NTARGET-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET: omp.inner.for.end:
+// CHECK-NTARGET-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET: omp.loop.exit:
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@__cxx_global_var_init.8
+// CHECK-NTARGET-SAME: () #[[ATTR3]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: call void @_ZN2STILi1000EEC1Ev(ptr noundef nonnull align 4 dereferenceable(4512) @t2)
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2STILi1000EED1Ev, ptr @t2, ptr @__dso_handle) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2STILi1000EEC1Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: call void @_ZN2STILi1000EEC2Ev(ptr noundef nonnull align 4 dereferenceable(4512) [[THIS1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2STILi1000EED1Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: call void @_ZN2STILi1000EED2Ev(ptr noundef nonnull align 4 dereferenceable(4512) [[THIS1]]) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2STILi1000EEC2Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EEC1Ev_l218(i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EEC1Ev_l218
+// CHECK-NTARGET-SAME: (i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EEC1Ev_l218.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EEC1Ev_l218.omp_outlined
+// CHECK-NTARGET-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET: cond.true:
+// CHECK-NTARGET-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET: cond.false:
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: br label [[COND_END]]
+// CHECK-NTARGET: cond.end:
+// CHECK-NTARGET-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET: omp.inner.for.cond:
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET: omp.inner.for.body:
+// CHECK-NTARGET-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 1017
+// CHECK-NTARGET-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET: omp.body.continue:
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET: omp.inner.for.inc:
+// CHECK-NTARGET-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET: omp.inner.for.end:
+// CHECK-NTARGET-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET: omp.loop.exit:
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_ZN2STILi1000EED2Ev
+// CHECK-NTARGET-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) unnamed_addr #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EED1Ev_l225(i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EED1Ev_l225
+// CHECK-NTARGET-SAME: (i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EED1Ev_l225.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EED1Ev_l225.omp_outlined
+// CHECK-NTARGET-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET: cond.true:
+// CHECK-NTARGET-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET: cond.false:
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: br label [[COND_END]]
+// CHECK-NTARGET: cond.end:
+// CHECK-NTARGET-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET: omp.inner.for.cond:
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET: omp.inner.for.body:
+// CHECK-NTARGET-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 1018
+// CHECK-NTARGET-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET: omp.body.continue:
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET: omp.inner.for.inc:
+// CHECK-NTARGET-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET: omp.inner.for.end:
+// CHECK-NTARGET-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET: omp.loop.exit:
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SB3fooEv_l122
+// CHECK-NTARGET-SAME: (i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SB3fooEv_l122.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SB3fooEv_l122.omp_outlined
+// CHECK-NTARGET-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET: cond.true:
+// CHECK-NTARGET-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET: cond.false:
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: br label [[COND_END]]
+// CHECK-NTARGET: cond.end:
+// CHECK-NTARGET-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET: omp.inner.for.cond:
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET: omp.inner.for.body:
+// CHECK-NTARGET-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 4
+// CHECK-NTARGET-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET: omp.body.continue:
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET: omp.inner.for.inc:
+// CHECK-NTARGET-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET: omp.inner.for.end:
+// CHECK-NTARGET-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET: omp.loop.exit:
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SE3fooEv_l185
+// CHECK-NTARGET-SAME: (i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SE3fooEv_l185.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SE3fooEv_l185.omp_outlined
+// CHECK-NTARGET-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET: cond.true:
+// CHECK-NTARGET-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET: cond.false:
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: br label [[COND_END]]
+// CHECK-NTARGET: cond.end:
+// CHECK-NTARGET-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET: omp.inner.for.cond:
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET: omp.inner.for.body:
+// CHECK-NTARGET-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 13
+// CHECK-NTARGET-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET: omp.body.continue:
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET: omp.inner.for.inc:
+// CHECK-NTARGET-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET: omp.inner.for.end:
+// CHECK-NTARGET-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET: omp.loop.exit:
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EE3fooEv_l211
+// CHECK-NTARGET-SAME: (i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EE3fooEv_l211.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EE3fooEv_l211.omp_outlined
+// CHECK-NTARGET-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET: cond.true:
+// CHECK-NTARGET-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET: cond.false:
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: br label [[COND_END]]
+// CHECK-NTARGET: cond.end:
+// CHECK-NTARGET-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET: omp.inner.for.cond:
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET: omp.inner.for.body:
+// CHECK-NTARGET-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 116
+// CHECK-NTARGET-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET: omp.body.continue:
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET: omp.inner.for.inc:
+// CHECK-NTARGET-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET: omp.inner.for.end:
+// CHECK-NTARGET-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET: omp.loop.exit:
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EE3fooEv_l211
+// CHECK-NTARGET-SAME: (i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EE3fooEv_l211.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EE3fooEv_l211.omp_outlined
+// CHECK-NTARGET-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET: cond.true:
+// CHECK-NTARGET-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET: cond.false:
+// CHECK-NTARGET-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: br label [[COND_END]]
+// CHECK-NTARGET: cond.end:
+// CHECK-NTARGET-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET: omp.inner.for.cond:
+// CHECK-NTARGET-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET: omp.inner.for.body:
+// CHECK-NTARGET-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 1016
+// CHECK-NTARGET-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET: omp.body.continue:
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET: omp.inner.for.inc:
+// CHECK-NTARGET-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET: omp.inner.for.end:
+// CHECK-NTARGET-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET: omp.loop.exit:
+// CHECK-NTARGET-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_GLOBAL__I_000500
+// CHECK-NTARGET-SAME: () #[[ATTR3]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: call void @__cxx_global_var_init()
+// CHECK-NTARGET-NEXT: call void @__cxx_global_var_init.2()
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_GLOBAL__I_000501
+// CHECK-NTARGET-SAME: () #[[ATTR3]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: call void @__cxx_global_var_init.3()
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// CHECK-NTARGET-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_target_parallel_generic_loop_codegen_1.cpp
+// CHECK-NTARGET-SAME: () #[[ATTR3]] {
+// CHECK-NTARGET-NEXT: entry:
+// CHECK-NTARGET-NEXT: call void @__cxx_global_var_init.1()
+// CHECK-NTARGET-NEXT: call void @__cxx_global_var_init.4()
+// CHECK-NTARGET-NEXT: call void @__cxx_global_var_init.5()
+// CHECK-NTARGET-NEXT: call void @__cxx_global_var_init.6()
+// CHECK-NTARGET-NEXT: call void @__cxx_global_var_init.7()
+// CHECK-NTARGET-NEXT: call void @__cxx_global_var_init.8()
+// CHECK-NTARGET-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@__cxx_global_var_init
+// SIMD-ONLY2-SAME: () #[[ATTR0:[0-9]+]] {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: call void @_ZN2SAC1Ev(ptr noundef nonnull align 4 dereferenceable(16) @_ZL2a1)
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SAD1Ev, ptr @_ZL2a1, ptr @__dso_handle) #[[ATTR2:[0-9]+]]
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SAC1Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: call void @_ZN2SAC2Ev(ptr noundef nonnull align 4 dereferenceable(16) [[THIS1]])
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SAD1Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: call void @_ZN2SAD2Ev(ptr noundef nonnull align 4 dereferenceable(16) [[THIS1]]) #[[ATTR2]]
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@__cxx_global_var_init.1
+// SIMD-ONLY2-SAME: () #[[ATTR0]] {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: call void @_ZN2SAC1Ev(ptr noundef nonnull align 4 dereferenceable(16) @a2)
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SAD1Ev, ptr @a2, ptr @__dso_handle) #[[ATTR2]]
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@__cxx_global_var_init.2
+// SIMD-ONLY2-SAME: () #[[ATTR0]] {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: call void @_ZN2SBC1Ev(ptr noundef nonnull align 4 dereferenceable(32) @b1)
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SBD1Ev, ptr @b1, ptr @__dso_handle) #[[ATTR2]]
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SBC1Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: call void @_ZN2SBC2Ev(ptr noundef nonnull align 4 dereferenceable(32) [[THIS1]])
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SBD1Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: call void @_ZN2SBD2Ev(ptr noundef nonnull align 4 dereferenceable(32) [[THIS1]]) #[[ATTR2]]
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@__cxx_global_var_init.3
+// SIMD-ONLY2-SAME: () #[[ATTR0]] {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: call void @_ZN2SBC1Ev(ptr noundef nonnull align 4 dereferenceable(32) @b2)
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SBD1Ev, ptr @b2, ptr @__dso_handle) #[[ATTR2]]
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@__cxx_global_var_init.4
+// SIMD-ONLY2-SAME: () #[[ATTR0]] {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: call void @_ZN2SCC1Ev(ptr noundef nonnull align 4 dereferenceable(64) @_ZL2c1)
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SCD1Ev, ptr @_ZL2c1, ptr @__dso_handle) #[[ATTR2]]
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SCC1Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: call void @_ZN2SCC2Ev(ptr noundef nonnull align 4 dereferenceable(64) [[THIS1]])
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SCD1Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: call void @_ZN2SCD2Ev(ptr noundef nonnull align 4 dereferenceable(64) [[THIS1]]) #[[ATTR2]]
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@__cxx_global_var_init.5
+// SIMD-ONLY2-SAME: () #[[ATTR0]] {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: call void @_ZN2SDC1Ev(ptr noundef nonnull align 4 dereferenceable(128) @d1)
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SDD1Ev, ptr @d1, ptr @__dso_handle) #[[ATTR2]]
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SDC1Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: call void @_ZN2SDC2Ev(ptr noundef nonnull align 4 dereferenceable(128) [[THIS1]])
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SDD1Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: call void @_ZN2SDD2Ev(ptr noundef nonnull align 4 dereferenceable(128) [[THIS1]]) #[[ATTR2]]
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@__cxx_global_var_init.6
+// SIMD-ONLY2-SAME: () #[[ATTR0]] {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: call void @_ZN2SEC1Ev(ptr noundef nonnull align 4 dereferenceable(256) @e1)
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SED1Ev, ptr @e1, ptr @__dso_handle) #[[ATTR2]]
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SEC1Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: call void @_ZN2SEC2Ev(ptr noundef nonnull align 4 dereferenceable(256) [[THIS1]])
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SED1Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: call void @_ZN2SED2Ev(ptr noundef nonnull align 4 dereferenceable(256) [[THIS1]]) #[[ATTR2]]
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@__cxx_global_var_init.7
+// SIMD-ONLY2-SAME: () #[[ATTR0]] {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: call void @_ZN2STILi100EEC1Ev(ptr noundef nonnull align 4 dereferenceable(912) @t1)
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2STILi100EED1Ev, ptr @t1, ptr @__dso_handle) #[[ATTR2]]
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2STILi100EEC1Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: call void @_ZN2STILi100EEC2Ev(ptr noundef nonnull align 4 dereferenceable(912) [[THIS1]])
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2STILi100EED1Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: call void @_ZN2STILi100EED2Ev(ptr noundef nonnull align 4 dereferenceable(912) [[THIS1]]) #[[ATTR2]]
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@__cxx_global_var_init.8
+// SIMD-ONLY2-SAME: () #[[ATTR0]] {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: call void @_ZN2STILi1000EEC1Ev(ptr noundef nonnull align 4 dereferenceable(4512) @t2)
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2STILi1000EED1Ev, ptr @t2, ptr @__dso_handle) #[[ATTR2]]
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2STILi1000EEC1Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: call void @_ZN2STILi1000EEC2Ev(ptr noundef nonnull align 4 dereferenceable(4512) [[THIS1]])
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2STILi1000EED1Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: call void @_ZN2STILi1000EED2Ev(ptr noundef nonnull align 4 dereferenceable(4512) [[THIS1]]) #[[ATTR2]]
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_Z3bari
+// SIMD-ONLY2-SAME: (i32 noundef signext [[A:%.*]]) #[[ATTR3:[0-9]+]] {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: [[R:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: [[I:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// SIMD-ONLY2-NEXT: store i32 [[TMP0]], ptr [[R]], align 4
+// SIMD-ONLY2-NEXT: call void @_ZN2SA3fooEv(ptr noundef nonnull align 4 dereferenceable(16) @_ZL2a1)
+// SIMD-ONLY2-NEXT: call void @_ZN2SA3fooEv(ptr noundef nonnull align 4 dereferenceable(16) @a2)
+// SIMD-ONLY2-NEXT: call void @_ZN2SB3fooEv(ptr noundef nonnull align 4 dereferenceable(32) @b1)
+// SIMD-ONLY2-NEXT: call void @_ZN2SB3fooEv(ptr noundef nonnull align 4 dereferenceable(32) @b2)
+// SIMD-ONLY2-NEXT: call void @_ZN2SC3fooEv(ptr noundef nonnull align 4 dereferenceable(64) @_ZL2c1)
+// SIMD-ONLY2-NEXT: call void @_ZN2SD3fooEv(ptr noundef nonnull align 4 dereferenceable(128) @d1)
+// SIMD-ONLY2-NEXT: call void @_ZN2SE3fooEv(ptr noundef nonnull align 4 dereferenceable(256) @e1)
+// SIMD-ONLY2-NEXT: call void @_ZN2STILi100EE3fooEv(ptr noundef nonnull align 4 dereferenceable(912) @t1)
+// SIMD-ONLY2-NEXT: call void @_ZN2STILi1000EE3fooEv(ptr noundef nonnull align 4 dereferenceable(4512) @t2)
+// SIMD-ONLY2-NEXT: store i32 0, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND:%.*]]
+// SIMD-ONLY2: for.cond:
+// SIMD-ONLY2-NEXT: [[TMP1:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10
+// SIMD-ONLY2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// SIMD-ONLY2: for.body:
+// SIMD-ONLY2-NEXT: [[TMP2:%.*]] = load i32, ptr [[R]], align 4
+// SIMD-ONLY2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1
+// SIMD-ONLY2-NEXT: store i32 [[INC]], ptr [[R]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_INC:%.*]]
+// SIMD-ONLY2: for.inc:
+// SIMD-ONLY2-NEXT: [[TMP3:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[INC1:%.*]] = add nsw i32 [[TMP3]], 1
+// SIMD-ONLY2-NEXT: store i32 [[INC1]], ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP2:![0-9]+]]
+// SIMD-ONLY2: for.end:
+// SIMD-ONLY2-NEXT: [[TMP4:%.*]] = load i32, ptr [[R]], align 4
+// SIMD-ONLY2-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// SIMD-ONLY2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], [[TMP6]]
+// SIMD-ONLY2-NEXT: ret i32 [[ADD]]
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SA3fooEv
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) #[[ATTR3]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// SIMD-ONLY2-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 1
+// SIMD-ONLY2-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SB3fooEv
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) #[[ATTR3]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: [[I:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// SIMD-ONLY2-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: store i32 0, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND:%.*]]
+// SIMD-ONLY2: for.cond:
+// SIMD-ONLY2-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP2]], 10
+// SIMD-ONLY2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// SIMD-ONLY2: for.body:
+// SIMD-ONLY2-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 4
+// SIMD-ONLY2-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_INC:%.*]]
+// SIMD-ONLY2: for.inc:
+// SIMD-ONLY2-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1
+// SIMD-ONLY2-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]]
+// SIMD-ONLY2: for.end:
+// SIMD-ONLY2-NEXT: [[TMP5:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP6:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: store i32 [[TMP5]], ptr [[TMP6]], align 4
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SC3fooEv
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) #[[ATTR3]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// SIMD-ONLY2-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 7
+// SIMD-ONLY2-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SD3fooEv
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) #[[ATTR3]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// SIMD-ONLY2-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 10
+// SIMD-ONLY2-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SE3fooEv
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) #[[ATTR3]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: [[I:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// SIMD-ONLY2-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: store i32 0, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND:%.*]]
+// SIMD-ONLY2: for.cond:
+// SIMD-ONLY2-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP2]], 10
+// SIMD-ONLY2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// SIMD-ONLY2: for.body:
+// SIMD-ONLY2-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 13
+// SIMD-ONLY2-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_INC:%.*]]
+// SIMD-ONLY2: for.inc:
+// SIMD-ONLY2-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1
+// SIMD-ONLY2-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
+// SIMD-ONLY2: for.end:
+// SIMD-ONLY2-NEXT: [[TMP5:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP6:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: store i32 [[TMP5]], ptr [[TMP6]], align 4
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2STILi100EE3fooEv
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) #[[ATTR3]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: [[I:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// SIMD-ONLY2-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: store i32 0, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND:%.*]]
+// SIMD-ONLY2: for.cond:
+// SIMD-ONLY2-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP2]], 10
+// SIMD-ONLY2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// SIMD-ONLY2: for.body:
+// SIMD-ONLY2-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 116
+// SIMD-ONLY2-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_INC:%.*]]
+// SIMD-ONLY2: for.inc:
+// SIMD-ONLY2-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1
+// SIMD-ONLY2-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP6:![0-9]+]]
+// SIMD-ONLY2: for.end:
+// SIMD-ONLY2-NEXT: [[TMP5:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP6:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: store i32 [[TMP5]], ptr [[TMP6]], align 4
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2STILi1000EE3fooEv
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) #[[ATTR3]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: [[I:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// SIMD-ONLY2-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: store i32 0, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND:%.*]]
+// SIMD-ONLY2: for.cond:
+// SIMD-ONLY2-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP2]], 10
+// SIMD-ONLY2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// SIMD-ONLY2: for.body:
+// SIMD-ONLY2-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1016
+// SIMD-ONLY2-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_INC:%.*]]
+// SIMD-ONLY2: for.inc:
+// SIMD-ONLY2-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1
+// SIMD-ONLY2-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
+// SIMD-ONLY2: for.end:
+// SIMD-ONLY2-NEXT: [[TMP5:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP6:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: store i32 [[TMP5]], ptr [[TMP6]], align 4
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SAC2Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// SIMD-ONLY2-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 2
+// SIMD-ONLY2-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SAD2Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// SIMD-ONLY2-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 3
+// SIMD-ONLY2-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SBC2Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// SIMD-ONLY2-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 5
+// SIMD-ONLY2-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SBD2Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// SIMD-ONLY2-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 6
+// SIMD-ONLY2-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SCC2Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: [[I:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// SIMD-ONLY2-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: store i32 0, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND:%.*]]
+// SIMD-ONLY2: for.cond:
+// SIMD-ONLY2-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP2]], 10
+// SIMD-ONLY2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// SIMD-ONLY2: for.body:
+// SIMD-ONLY2-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 8
+// SIMD-ONLY2-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_INC:%.*]]
+// SIMD-ONLY2: for.inc:
+// SIMD-ONLY2-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1
+// SIMD-ONLY2-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]]
+// SIMD-ONLY2: for.end:
+// SIMD-ONLY2-NEXT: [[TMP5:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP6:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: store i32 [[TMP5]], ptr [[TMP6]], align 4
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SCD2Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// SIMD-ONLY2-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 9
+// SIMD-ONLY2-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SDC2Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// SIMD-ONLY2-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 11
+// SIMD-ONLY2-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SDD2Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: [[I:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// SIMD-ONLY2-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: store i32 0, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND:%.*]]
+// SIMD-ONLY2: for.cond:
+// SIMD-ONLY2-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP2]], 10
+// SIMD-ONLY2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// SIMD-ONLY2: for.body:
+// SIMD-ONLY2-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 12
+// SIMD-ONLY2-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_INC:%.*]]
+// SIMD-ONLY2: for.inc:
+// SIMD-ONLY2-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1
+// SIMD-ONLY2-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]]
+// SIMD-ONLY2: for.end:
+// SIMD-ONLY2-NEXT: [[TMP5:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP6:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: store i32 [[TMP5]], ptr [[TMP6]], align 4
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SEC2Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: [[I:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// SIMD-ONLY2-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: store i32 0, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND:%.*]]
+// SIMD-ONLY2: for.cond:
+// SIMD-ONLY2-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP2]], 10
+// SIMD-ONLY2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// SIMD-ONLY2: for.body:
+// SIMD-ONLY2-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 14
+// SIMD-ONLY2-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_INC:%.*]]
+// SIMD-ONLY2: for.inc:
+// SIMD-ONLY2-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1
+// SIMD-ONLY2-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]]
+// SIMD-ONLY2: for.end:
+// SIMD-ONLY2-NEXT: [[TMP5:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP6:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: store i32 [[TMP5]], ptr [[TMP6]], align 4
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2SED2Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: [[I:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// SIMD-ONLY2-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: store i32 0, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND:%.*]]
+// SIMD-ONLY2: for.cond:
+// SIMD-ONLY2-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP2]], 10
+// SIMD-ONLY2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// SIMD-ONLY2: for.body:
+// SIMD-ONLY2-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 15
+// SIMD-ONLY2-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_INC:%.*]]
+// SIMD-ONLY2: for.inc:
+// SIMD-ONLY2-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1
+// SIMD-ONLY2-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP11:![0-9]+]]
+// SIMD-ONLY2: for.end:
+// SIMD-ONLY2-NEXT: [[TMP5:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP6:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: store i32 [[TMP5]], ptr [[TMP6]], align 4
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2STILi100EEC2Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: [[I:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// SIMD-ONLY2-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: store i32 0, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND:%.*]]
+// SIMD-ONLY2: for.cond:
+// SIMD-ONLY2-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP2]], 10
+// SIMD-ONLY2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// SIMD-ONLY2: for.body:
+// SIMD-ONLY2-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 117
+// SIMD-ONLY2-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_INC:%.*]]
+// SIMD-ONLY2: for.inc:
+// SIMD-ONLY2-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1
+// SIMD-ONLY2-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP12:![0-9]+]]
+// SIMD-ONLY2: for.end:
+// SIMD-ONLY2-NEXT: [[TMP5:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP6:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: store i32 [[TMP5]], ptr [[TMP6]], align 4
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2STILi100EED2Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: [[I:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// SIMD-ONLY2-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: store i32 0, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND:%.*]]
+// SIMD-ONLY2: for.cond:
+// SIMD-ONLY2-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP2]], 10
+// SIMD-ONLY2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// SIMD-ONLY2: for.body:
+// SIMD-ONLY2-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 118
+// SIMD-ONLY2-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_INC:%.*]]
+// SIMD-ONLY2: for.inc:
+// SIMD-ONLY2-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1
+// SIMD-ONLY2-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]]
+// SIMD-ONLY2: for.end:
+// SIMD-ONLY2-NEXT: [[TMP5:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP6:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: store i32 [[TMP5]], ptr [[TMP6]], align 4
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2STILi1000EEC2Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: [[I:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// SIMD-ONLY2-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: store i32 0, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND:%.*]]
+// SIMD-ONLY2: for.cond:
+// SIMD-ONLY2-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP2]], 10
+// SIMD-ONLY2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// SIMD-ONLY2: for.body:
+// SIMD-ONLY2-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1017
+// SIMD-ONLY2-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_INC:%.*]]
+// SIMD-ONLY2: for.inc:
+// SIMD-ONLY2-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1
+// SIMD-ONLY2-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]]
+// SIMD-ONLY2: for.end:
+// SIMD-ONLY2-NEXT: [[TMP5:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP6:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: store i32 [[TMP5]], ptr [[TMP6]], align 4
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_ZN2STILi1000EED2Ev
+// SIMD-ONLY2-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// SIMD-ONLY2-NEXT: [[A:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: [[I:%.*]] = alloca i32, align 4
+// SIMD-ONLY2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// SIMD-ONLY2-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// SIMD-ONLY2-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: store i32 0, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND:%.*]]
+// SIMD-ONLY2: for.cond:
+// SIMD-ONLY2-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP2]], 10
+// SIMD-ONLY2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// SIMD-ONLY2: for.body:
+// SIMD-ONLY2-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1018
+// SIMD-ONLY2-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_INC:%.*]]
+// SIMD-ONLY2: for.inc:
+// SIMD-ONLY2-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1
+// SIMD-ONLY2-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// SIMD-ONLY2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP15:![0-9]+]]
+// SIMD-ONLY2: for.end:
+// SIMD-ONLY2-NEXT: [[TMP5:%.*]] = load i32, ptr [[A]], align 4
+// SIMD-ONLY2-NEXT: [[TMP6:%.*]] = load ptr, ptr @R, align 8
+// SIMD-ONLY2-NEXT: store i32 [[TMP5]], ptr [[TMP6]], align 4
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_GLOBAL__I_000500
+// SIMD-ONLY2-SAME: () #[[ATTR0]] {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: call void @__cxx_global_var_init()
+// SIMD-ONLY2-NEXT: call void @__cxx_global_var_init.2()
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_GLOBAL__I_000501
+// SIMD-ONLY2-SAME: () #[[ATTR0]] {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: call void @__cxx_global_var_init.3()
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// SIMD-ONLY2-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_target_parallel_generic_loop_codegen_1.cpp
+// SIMD-ONLY2-SAME: () #[[ATTR0]] {
+// SIMD-ONLY2-NEXT: entry:
+// SIMD-ONLY2-NEXT: call void @__cxx_global_var_init.1()
+// SIMD-ONLY2-NEXT: call void @__cxx_global_var_init.4()
+// SIMD-ONLY2-NEXT: call void @__cxx_global_var_init.5()
+// SIMD-ONLY2-NEXT: call void @__cxx_global_var_init.6()
+// SIMD-ONLY2-NEXT: call void @__cxx_global_var_init.7()
+// SIMD-ONLY2-NEXT: call void @__cxx_global_var_init.8()
+// SIMD-ONLY2-NEXT: ret void
+//
+//
+// OMP-DEFAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init
+// OMP-DEFAULT-SAME: () #[[ATTR0:[0-9]+]] {
+// OMP-DEFAULT-NEXT: entry:
+// OMP-DEFAULT-NEXT: call void @_ZN2SAC1Ev(ptr noundef nonnull align 4 dereferenceable(16) @_ZL2a1)
+// OMP-DEFAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SAD1Ev, ptr @_ZL2a1, ptr @__dso_handle) #[[ATTR2:[0-9]+]]
+// OMP-DEFAULT-NEXT: ret void
+//
+//
+//
+//
+//
+//
+// OMP-DEFAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init.1
+// OMP-DEFAULT-SAME: () #[[ATTR0]] {
+// OMP-DEFAULT-NEXT: entry:
+// OMP-DEFAULT-NEXT: call void @_ZN2SAC1Ev(ptr noundef nonnull align 4 dereferenceable(16) @a2)
+// OMP-DEFAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SAD1Ev, ptr @a2, ptr @__dso_handle) #[[ATTR2]]
+// OMP-DEFAULT-NEXT: ret void
+//
+//
+// OMP-DEFAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init.2
+// OMP-DEFAULT-SAME: () #[[ATTR0]] {
+// OMP-DEFAULT-NEXT: entry:
+// OMP-DEFAULT-NEXT: call void @_ZN2SBC1Ev(ptr noundef nonnull align 4 dereferenceable(32) @b1)
+// OMP-DEFAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SBD1Ev, ptr @b1, ptr @__dso_handle) #[[ATTR2]]
+// OMP-DEFAULT-NEXT: ret void
+//
+//
+//
+//
+//
+//
+// OMP-DEFAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init.3
+// OMP-DEFAULT-SAME: () #[[ATTR0]] {
+// OMP-DEFAULT-NEXT: entry:
+// OMP-DEFAULT-NEXT: call void @_ZN2SBC1Ev(ptr noundef nonnull align 4 dereferenceable(32) @b2)
+// OMP-DEFAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SBD1Ev, ptr @b2, ptr @__dso_handle) #[[ATTR2]]
+// OMP-DEFAULT-NEXT: ret void
+//
+//
+// OMP-DEFAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init.4
+// OMP-DEFAULT-SAME: () #[[ATTR0]] {
+// OMP-DEFAULT-NEXT: entry:
+// OMP-DEFAULT-NEXT: call void @_ZN2SCC1Ev(ptr noundef nonnull align 4 dereferenceable(64) @_ZL2c1)
+// OMP-DEFAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SCD1Ev, ptr @_ZL2c1, ptr @__dso_handle) #[[ATTR2]]
+// OMP-DEFAULT-NEXT: ret void
+//
+//
+//
+//
+//
+//
+//
+//
+// OMP-DEFAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init.5
+// OMP-DEFAULT-SAME: () #[[ATTR0]] {
+// OMP-DEFAULT-NEXT: entry:
+// OMP-DEFAULT-NEXT: call void @_ZN2SDC1Ev(ptr noundef nonnull align 4 dereferenceable(128) @d1)
+// OMP-DEFAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SDD1Ev, ptr @d1, ptr @__dso_handle) #[[ATTR2]]
+// OMP-DEFAULT-NEXT: ret void
+//
+//
+//
+//
+//
+//
+//
+//
+// OMP-DEFAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init.8
+// OMP-DEFAULT-SAME: () #[[ATTR0]] {
+// OMP-DEFAULT-NEXT: entry:
+// OMP-DEFAULT-NEXT: call void @_ZN2SEC1Ev(ptr noundef nonnull align 4 dereferenceable(256) @e1)
+// OMP-DEFAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SED1Ev, ptr @e1, ptr @__dso_handle) #[[ATTR2]]
+// OMP-DEFAULT-NEXT: ret void
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+// OMP-DEFAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init.13
+// OMP-DEFAULT-SAME: () #[[ATTR0]] {
+// OMP-DEFAULT-NEXT: entry:
+// OMP-DEFAULT-NEXT: call void @_ZN2STILi100EEC1Ev(ptr noundef nonnull align 4 dereferenceable(912) @t1)
+// OMP-DEFAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2STILi100EED1Ev, ptr @t1, ptr @__dso_handle) #[[ATTR2]]
+// OMP-DEFAULT-NEXT: ret void
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+// OMP-DEFAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init.18
+// OMP-DEFAULT-SAME: () #[[ATTR0]] {
+// OMP-DEFAULT-NEXT: entry:
+// OMP-DEFAULT-NEXT: call void @_ZN2STILi1000EEC1Ev(ptr noundef nonnull align 4 dereferenceable(4512) @t2)
+// OMP-DEFAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2STILi1000EED1Ev, ptr @t2, ptr @__dso_handle) #[[ATTR2]]
+// OMP-DEFAULT-NEXT: ret void
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+// OMP-DEFAULT-LABEL: define {{[^@]+}}@_GLOBAL__I_000500
+// OMP-DEFAULT-SAME: () #[[ATTR0]] {
+// OMP-DEFAULT-NEXT: entry:
+// OMP-DEFAULT-NEXT: call void @__cxx_global_var_init()
+// OMP-DEFAULT-NEXT: call void @__cxx_global_var_init.2()
+// OMP-DEFAULT-NEXT: ret void
+//
+//
+// OMP-DEFAULT-LABEL: define {{[^@]+}}@_GLOBAL__I_000501
+// OMP-DEFAULT-SAME: () #[[ATTR0]] {
+// OMP-DEFAULT-NEXT: entry:
+// OMP-DEFAULT-NEXT: call void @__cxx_global_var_init.3()
+// OMP-DEFAULT-NEXT: ret void
+//
+//
+// OMP-DEFAULT-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_target_parallel_generic_loop_codegen_1.cpp
+// OMP-DEFAULT-SAME: () #[[ATTR0]] {
+// OMP-DEFAULT-NEXT: entry:
+// OMP-DEFAULT-NEXT: call void @__cxx_global_var_init.1()
+// OMP-DEFAULT-NEXT: call void @__cxx_global_var_init.4()
+// OMP-DEFAULT-NEXT: call void @__cxx_global_var_init.5()
+// OMP-DEFAULT-NEXT: call void @__cxx_global_var_init.8()
+// OMP-DEFAULT-NEXT: call void @__cxx_global_var_init.13()
+// OMP-DEFAULT-NEXT: call void @__cxx_global_var_init.18()
+// OMP-DEFAULT-NEXT: ret void
+//
+//
+// OMP-DEFAULT-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// OMP-DEFAULT-SAME: () #[[ATTR0]] {
+// OMP-DEFAULT-NEXT: entry:
+// OMP-DEFAULT-NEXT: call void @__tgt_register_requires(i64 1)
+// OMP-DEFAULT-NEXT: ret void
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init
+// OMP-DEfAULT-SAME: () #[[ATTR0:[0-9]+]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: call void @_ZN2SAC1Ev(ptr noundef nonnull align 4 dereferenceable(16) @_ZL2a1)
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SAD1Ev, ptr @_ZL2a1, ptr @__dso_handle) #[[ATTR2:[0-9]+]]
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SAC1Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: call void @_ZN2SAC2Ev(ptr noundef nonnull align 4 dereferenceable(16) [[THIS1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SAD1Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: call void @_ZN2SAD2Ev(ptr noundef nonnull align 4 dereferenceable(16) [[THIS1]]) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SAC2Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 2
+// OMP-DEfAULT-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SAD2Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 3
+// OMP-DEfAULT-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init.1
+// OMP-DEfAULT-SAME: () #[[ATTR0]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: call void @_ZN2SAC1Ev(ptr noundef nonnull align 4 dereferenceable(16) @a2)
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SAD1Ev, ptr @a2, ptr @__dso_handle) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init.2
+// OMP-DEfAULT-SAME: () #[[ATTR0]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: call void @_ZN2SBC1Ev(ptr noundef nonnull align 4 dereferenceable(32) @b1)
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SBD1Ev, ptr @b1, ptr @__dso_handle) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SBC1Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: call void @_ZN2SBC2Ev(ptr noundef nonnull align 4 dereferenceable(32) [[THIS1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SBD1Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: call void @_ZN2SBD2Ev(ptr noundef nonnull align 4 dereferenceable(32) [[THIS1]]) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SBC2Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 5
+// OMP-DEfAULT-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SBD2Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 6
+// OMP-DEfAULT-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init.3
+// OMP-DEfAULT-SAME: () #[[ATTR0]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: call void @_ZN2SBC1Ev(ptr noundef nonnull align 4 dereferenceable(32) @b2)
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SBD1Ev, ptr @b2, ptr @__dso_handle) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init.4
+// OMP-DEfAULT-SAME: () #[[ATTR0]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: call void @_ZN2SCC1Ev(ptr noundef nonnull align 4 dereferenceable(64) @_ZL2c1)
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SCD1Ev, ptr @_ZL2c1, ptr @__dso_handle) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SCC1Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: call void @_ZN2SCC2Ev(ptr noundef nonnull align 4 dereferenceable(64) [[THIS1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SCD1Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: call void @_ZN2SCD2Ev(ptr noundef nonnull align 4 dereferenceable(64) [[THIS1]]) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SCC2Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP5]], align 4
+// OMP-DEfAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP6]], align 4
+// OMP-DEfAULT-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// OMP-DEfAULT-NEXT: store i32 1, ptr [[TMP10]], align 4
+// OMP-DEfAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// OMP-DEfAULT-NEXT: store ptr [[TMP7]], ptr [[TMP11]], align 4
+// OMP-DEfAULT-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// OMP-DEfAULT-NEXT: store ptr [[TMP8]], ptr [[TMP12]], align 4
+// OMP-DEfAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// OMP-DEfAULT-NEXT: store ptr @.offload_sizes, ptr [[TMP13]], align 4
+// OMP-DEfAULT-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// OMP-DEfAULT-NEXT: store ptr @.offload_maptypes, ptr [[TMP14]], align 4
+// OMP-DEfAULT-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP15]], align 4
+// OMP-DEfAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP16]], align 4
+// OMP-DEfAULT-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// OMP-DEfAULT-NEXT: store i64 0, ptr [[TMP17]], align 8
+// OMP-DEfAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// OMP-DEfAULT-NEXT: store i64 0, ptr [[TMP18]], align 8
+// OMP-DEfAULT-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// OMP-DEfAULT-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP19]], align 4
+// OMP-DEfAULT-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// OMP-DEfAULT-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP20]], align 4
+// OMP-DEfAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[TMP21]], align 4
+// OMP-DEfAULT-NEXT: [[TMP22:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2:[0-9]+]], i64 -1, i32 1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SCC1Ev_l148.region_id, ptr [[KERNEL_ARGS]])
+// OMP-DEfAULT-NEXT: [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// OMP-DEfAULT-NEXT: br i1 [[TMP23]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// OMP-DEfAULT: omp_offload.failed:
+// OMP-DEfAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SCC1Ev_l148(i32 [[TMP3]]) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// OMP-DEfAULT: omp_offload.cont:
+// OMP-DEfAULT-NEXT: [[TMP24:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP25:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP24]], ptr [[TMP25]], align 4
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SCC1Ev_l148
+// OMP-DEfAULT-SAME: (i32 noundef [[A:%.*]]) #[[ATTR3:[0-9]+]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SCC1Ev_l148.omp_outlined, i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SCC1Ev_l148.omp_outlined
+// OMP-DEfAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]]) #[[ATTR4:[0-9]+]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// OMP-DEfAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// OMP-DEfAULT: cond.true:
+// OMP-DEfAULT-NEXT: br label [[COND_END:%.*]]
+// OMP-DEfAULT: cond.false:
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: br label [[COND_END]]
+// OMP-DEfAULT: cond.end:
+// OMP-DEfAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// OMP-DEfAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// OMP-DEfAULT: omp.inner.for.cond:
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// OMP-DEfAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// OMP-DEfAULT: omp.inner.for.body:
+// OMP-DEfAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// OMP-DEfAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// OMP-DEfAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// OMP-DEfAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 8
+// OMP-DEfAULT-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// OMP-DEfAULT: omp.body.continue:
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// OMP-DEfAULT: omp.inner.for.inc:
+// OMP-DEfAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// OMP-DEfAULT-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// OMP-DEfAULT: omp.inner.for.end:
+// OMP-DEfAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// OMP-DEfAULT: omp.loop.exit:
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SCD2Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 9
+// OMP-DEfAULT-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init.5
+// OMP-DEfAULT-SAME: () #[[ATTR0]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: call void @_ZN2SDC1Ev(ptr noundef nonnull align 4 dereferenceable(128) @d1)
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SDD1Ev, ptr @d1, ptr @__dso_handle) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SDC1Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: call void @_ZN2SDC2Ev(ptr noundef nonnull align 4 dereferenceable(128) [[THIS1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SDD1Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: call void @_ZN2SDD2Ev(ptr noundef nonnull align 4 dereferenceable(128) [[THIS1]]) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SDC2Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 11
+// OMP-DEfAULT-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SDD2Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP5]], align 4
+// OMP-DEfAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP6]], align 4
+// OMP-DEfAULT-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// OMP-DEfAULT-NEXT: store i32 1, ptr [[TMP10]], align 4
+// OMP-DEfAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// OMP-DEfAULT-NEXT: store ptr [[TMP7]], ptr [[TMP11]], align 4
+// OMP-DEfAULT-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// OMP-DEfAULT-NEXT: store ptr [[TMP8]], ptr [[TMP12]], align 4
+// OMP-DEfAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// OMP-DEfAULT-NEXT: store ptr @.offload_sizes.6, ptr [[TMP13]], align 4
+// OMP-DEfAULT-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// OMP-DEfAULT-NEXT: store ptr @.offload_maptypes.7, ptr [[TMP14]], align 4
+// OMP-DEfAULT-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP15]], align 4
+// OMP-DEfAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP16]], align 4
+// OMP-DEfAULT-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// OMP-DEfAULT-NEXT: store i64 0, ptr [[TMP17]], align 8
+// OMP-DEfAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// OMP-DEfAULT-NEXT: store i64 0, ptr [[TMP18]], align 8
+// OMP-DEfAULT-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// OMP-DEfAULT-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP19]], align 4
+// OMP-DEfAULT-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// OMP-DEfAULT-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP20]], align 4
+// OMP-DEfAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[TMP21]], align 4
+// OMP-DEfAULT-NEXT: [[TMP22:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2]], i64 -1, i32 1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SDD1Ev_l174.region_id, ptr [[KERNEL_ARGS]])
+// OMP-DEfAULT-NEXT: [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// OMP-DEfAULT-NEXT: br i1 [[TMP23]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// OMP-DEfAULT: omp_offload.failed:
+// OMP-DEfAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SDD1Ev_l174(i32 [[TMP3]]) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// OMP-DEfAULT: omp_offload.cont:
+// OMP-DEfAULT-NEXT: [[TMP24:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP25:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP24]], ptr [[TMP25]], align 4
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SDD1Ev_l174
+// OMP-DEfAULT-SAME: (i32 noundef [[A:%.*]]) #[[ATTR3]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SDD1Ev_l174.omp_outlined, i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SDD1Ev_l174.omp_outlined
+// OMP-DEfAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]]) #[[ATTR4]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// OMP-DEfAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// OMP-DEfAULT: cond.true:
+// OMP-DEfAULT-NEXT: br label [[COND_END:%.*]]
+// OMP-DEfAULT: cond.false:
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: br label [[COND_END]]
+// OMP-DEfAULT: cond.end:
+// OMP-DEfAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// OMP-DEfAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// OMP-DEfAULT: omp.inner.for.cond:
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// OMP-DEfAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// OMP-DEfAULT: omp.inner.for.body:
+// OMP-DEfAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// OMP-DEfAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// OMP-DEfAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// OMP-DEfAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 12
+// OMP-DEfAULT-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// OMP-DEfAULT: omp.body.continue:
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// OMP-DEfAULT: omp.inner.for.inc:
+// OMP-DEfAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// OMP-DEfAULT-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// OMP-DEfAULT: omp.inner.for.end:
+// OMP-DEfAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// OMP-DEfAULT: omp.loop.exit:
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init.8
+// OMP-DEfAULT-SAME: () #[[ATTR0]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: call void @_ZN2SEC1Ev(ptr noundef nonnull align 4 dereferenceable(256) @e1)
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SED1Ev, ptr @e1, ptr @__dso_handle) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SEC1Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: call void @_ZN2SEC2Ev(ptr noundef nonnull align 4 dereferenceable(256) [[THIS1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SED1Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: call void @_ZN2SED2Ev(ptr noundef nonnull align 4 dereferenceable(256) [[THIS1]]) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SEC2Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP5]], align 4
+// OMP-DEfAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP6]], align 4
+// OMP-DEfAULT-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// OMP-DEfAULT-NEXT: store i32 1, ptr [[TMP10]], align 4
+// OMP-DEfAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// OMP-DEfAULT-NEXT: store ptr [[TMP7]], ptr [[TMP11]], align 4
+// OMP-DEfAULT-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// OMP-DEfAULT-NEXT: store ptr [[TMP8]], ptr [[TMP12]], align 4
+// OMP-DEfAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// OMP-DEfAULT-NEXT: store ptr @.offload_sizes.9, ptr [[TMP13]], align 4
+// OMP-DEfAULT-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// OMP-DEfAULT-NEXT: store ptr @.offload_maptypes.10, ptr [[TMP14]], align 4
+// OMP-DEfAULT-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP15]], align 4
+// OMP-DEfAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP16]], align 4
+// OMP-DEfAULT-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// OMP-DEfAULT-NEXT: store i64 0, ptr [[TMP17]], align 8
+// OMP-DEfAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// OMP-DEfAULT-NEXT: store i64 0, ptr [[TMP18]], align 8
+// OMP-DEfAULT-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// OMP-DEfAULT-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP19]], align 4
+// OMP-DEfAULT-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// OMP-DEfAULT-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP20]], align 4
+// OMP-DEfAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[TMP21]], align 4
+// OMP-DEfAULT-NEXT: [[TMP22:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2]], i64 -1, i32 1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SEC1Ev_l192.region_id, ptr [[KERNEL_ARGS]])
+// OMP-DEfAULT-NEXT: [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// OMP-DEfAULT-NEXT: br i1 [[TMP23]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// OMP-DEfAULT: omp_offload.failed:
+// OMP-DEfAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SEC1Ev_l192(i32 [[TMP3]]) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// OMP-DEfAULT: omp_offload.cont:
+// OMP-DEfAULT-NEXT: [[TMP24:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP25:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP24]], ptr [[TMP25]], align 4
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SEC1Ev_l192
+// OMP-DEfAULT-SAME: (i32 noundef [[A:%.*]]) #[[ATTR3]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SEC1Ev_l192.omp_outlined, i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SEC1Ev_l192.omp_outlined
+// OMP-DEfAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]]) #[[ATTR4]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// OMP-DEfAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// OMP-DEfAULT: cond.true:
+// OMP-DEfAULT-NEXT: br label [[COND_END:%.*]]
+// OMP-DEfAULT: cond.false:
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: br label [[COND_END]]
+// OMP-DEfAULT: cond.end:
+// OMP-DEfAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// OMP-DEfAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// OMP-DEfAULT: omp.inner.for.cond:
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// OMP-DEfAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// OMP-DEfAULT: omp.inner.for.body:
+// OMP-DEfAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// OMP-DEfAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// OMP-DEfAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// OMP-DEfAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 14
+// OMP-DEfAULT-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// OMP-DEfAULT: omp.body.continue:
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// OMP-DEfAULT: omp.inner.for.inc:
+// OMP-DEfAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// OMP-DEfAULT-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// OMP-DEfAULT: omp.inner.for.end:
+// OMP-DEfAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// OMP-DEfAULT: omp.loop.exit:
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SED2Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP5]], align 4
+// OMP-DEfAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP6]], align 4
+// OMP-DEfAULT-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// OMP-DEfAULT-NEXT: store i32 1, ptr [[TMP10]], align 4
+// OMP-DEfAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// OMP-DEfAULT-NEXT: store ptr [[TMP7]], ptr [[TMP11]], align 4
+// OMP-DEfAULT-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// OMP-DEfAULT-NEXT: store ptr [[TMP8]], ptr [[TMP12]], align 4
+// OMP-DEfAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// OMP-DEfAULT-NEXT: store ptr @.offload_sizes.11, ptr [[TMP13]], align 4
+// OMP-DEfAULT-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// OMP-DEfAULT-NEXT: store ptr @.offload_maptypes.12, ptr [[TMP14]], align 4
+// OMP-DEfAULT-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP15]], align 4
+// OMP-DEfAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP16]], align 4
+// OMP-DEfAULT-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// OMP-DEfAULT-NEXT: store i64 0, ptr [[TMP17]], align 8
+// OMP-DEfAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// OMP-DEfAULT-NEXT: store i64 0, ptr [[TMP18]], align 8
+// OMP-DEfAULT-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// OMP-DEfAULT-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP19]], align 4
+// OMP-DEfAULT-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// OMP-DEfAULT-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP20]], align 4
+// OMP-DEfAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[TMP21]], align 4
+// OMP-DEfAULT-NEXT: [[TMP22:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2]], i64 -1, i32 1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SED1Ev_l199.region_id, ptr [[KERNEL_ARGS]])
+// OMP-DEfAULT-NEXT: [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// OMP-DEfAULT-NEXT: br i1 [[TMP23]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// OMP-DEfAULT: omp_offload.failed:
+// OMP-DEfAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SED1Ev_l199(i32 [[TMP3]]) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// OMP-DEfAULT: omp_offload.cont:
+// OMP-DEfAULT-NEXT: [[TMP24:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP25:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP24]], ptr [[TMP25]], align 4
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SED1Ev_l199
+// OMP-DEfAULT-SAME: (i32 noundef [[A:%.*]]) #[[ATTR3]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SED1Ev_l199.omp_outlined, i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SED1Ev_l199.omp_outlined
+// OMP-DEfAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]]) #[[ATTR4]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// OMP-DEfAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// OMP-DEfAULT: cond.true:
+// OMP-DEfAULT-NEXT: br label [[COND_END:%.*]]
+// OMP-DEfAULT: cond.false:
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: br label [[COND_END]]
+// OMP-DEfAULT: cond.end:
+// OMP-DEfAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// OMP-DEfAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// OMP-DEfAULT: omp.inner.for.cond:
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// OMP-DEfAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// OMP-DEfAULT: omp.inner.for.body:
+// OMP-DEfAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// OMP-DEfAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// OMP-DEfAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// OMP-DEfAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 15
+// OMP-DEfAULT-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// OMP-DEfAULT: omp.body.continue:
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// OMP-DEfAULT: omp.inner.for.inc:
+// OMP-DEfAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// OMP-DEfAULT-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// OMP-DEfAULT: omp.inner.for.end:
+// OMP-DEfAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// OMP-DEfAULT: omp.loop.exit:
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init.13
+// OMP-DEfAULT-SAME: () #[[ATTR0]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: call void @_ZN2STILi100EEC1Ev(ptr noundef nonnull align 4 dereferenceable(912) @t1)
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2STILi100EED1Ev, ptr @t1, ptr @__dso_handle) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2STILi100EEC1Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: call void @_ZN2STILi100EEC2Ev(ptr noundef nonnull align 4 dereferenceable(912) [[THIS1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2STILi100EED1Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: call void @_ZN2STILi100EED2Ev(ptr noundef nonnull align 4 dereferenceable(912) [[THIS1]]) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2STILi100EEC2Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP5]], align 4
+// OMP-DEfAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP6]], align 4
+// OMP-DEfAULT-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// OMP-DEfAULT-NEXT: store i32 1, ptr [[TMP10]], align 4
+// OMP-DEfAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// OMP-DEfAULT-NEXT: store ptr [[TMP7]], ptr [[TMP11]], align 4
+// OMP-DEfAULT-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// OMP-DEfAULT-NEXT: store ptr [[TMP8]], ptr [[TMP12]], align 4
+// OMP-DEfAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// OMP-DEfAULT-NEXT: store ptr @.offload_sizes.14, ptr [[TMP13]], align 4
+// OMP-DEfAULT-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// OMP-DEfAULT-NEXT: store ptr @.offload_maptypes.15, ptr [[TMP14]], align 4
+// OMP-DEfAULT-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP15]], align 4
+// OMP-DEfAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP16]], align 4
+// OMP-DEfAULT-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// OMP-DEfAULT-NEXT: store i64 0, ptr [[TMP17]], align 8
+// OMP-DEfAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// OMP-DEfAULT-NEXT: store i64 0, ptr [[TMP18]], align 8
+// OMP-DEfAULT-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// OMP-DEfAULT-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP19]], align 4
+// OMP-DEfAULT-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// OMP-DEfAULT-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP20]], align 4
+// OMP-DEfAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[TMP21]], align 4
+// OMP-DEfAULT-NEXT: [[TMP22:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2]], i64 -1, i32 1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EEC1Ev_l218.region_id, ptr [[KERNEL_ARGS]])
+// OMP-DEfAULT-NEXT: [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// OMP-DEfAULT-NEXT: br i1 [[TMP23]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// OMP-DEfAULT: omp_offload.failed:
+// OMP-DEfAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EEC1Ev_l218(i32 [[TMP3]]) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// OMP-DEfAULT: omp_offload.cont:
+// OMP-DEfAULT-NEXT: [[TMP24:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP25:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP24]], ptr [[TMP25]], align 4
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EEC1Ev_l218
+// OMP-DEfAULT-SAME: (i32 noundef [[A:%.*]]) #[[ATTR3]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EEC1Ev_l218.omp_outlined, i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EEC1Ev_l218.omp_outlined
+// OMP-DEfAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]]) #[[ATTR4]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// OMP-DEfAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// OMP-DEfAULT: cond.true:
+// OMP-DEfAULT-NEXT: br label [[COND_END:%.*]]
+// OMP-DEfAULT: cond.false:
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: br label [[COND_END]]
+// OMP-DEfAULT: cond.end:
+// OMP-DEfAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// OMP-DEfAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// OMP-DEfAULT: omp.inner.for.cond:
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// OMP-DEfAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// OMP-DEfAULT: omp.inner.for.body:
+// OMP-DEfAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// OMP-DEfAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// OMP-DEfAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// OMP-DEfAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 117
+// OMP-DEfAULT-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// OMP-DEfAULT: omp.body.continue:
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// OMP-DEfAULT: omp.inner.for.inc:
+// OMP-DEfAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// OMP-DEfAULT-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// OMP-DEfAULT: omp.inner.for.end:
+// OMP-DEfAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// OMP-DEfAULT: omp.loop.exit:
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2STILi100EED2Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP5]], align 4
+// OMP-DEfAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP6]], align 4
+// OMP-DEfAULT-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// OMP-DEfAULT-NEXT: store i32 1, ptr [[TMP10]], align 4
+// OMP-DEfAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// OMP-DEfAULT-NEXT: store ptr [[TMP7]], ptr [[TMP11]], align 4
+// OMP-DEfAULT-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// OMP-DEfAULT-NEXT: store ptr [[TMP8]], ptr [[TMP12]], align 4
+// OMP-DEfAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// OMP-DEfAULT-NEXT: store ptr @.offload_sizes.16, ptr [[TMP13]], align 4
+// OMP-DEfAULT-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// OMP-DEfAULT-NEXT: store ptr @.offload_maptypes.17, ptr [[TMP14]], align 4
+// OMP-DEfAULT-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP15]], align 4
+// OMP-DEfAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP16]], align 4
+// OMP-DEfAULT-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// OMP-DEfAULT-NEXT: store i64 0, ptr [[TMP17]], align 8
+// OMP-DEfAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// OMP-DEfAULT-NEXT: store i64 0, ptr [[TMP18]], align 8
+// OMP-DEfAULT-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// OMP-DEfAULT-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP19]], align 4
+// OMP-DEfAULT-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// OMP-DEfAULT-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP20]], align 4
+// OMP-DEfAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[TMP21]], align 4
+// OMP-DEfAULT-NEXT: [[TMP22:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2]], i64 -1, i32 1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EED1Ev_l225.region_id, ptr [[KERNEL_ARGS]])
+// OMP-DEfAULT-NEXT: [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// OMP-DEfAULT-NEXT: br i1 [[TMP23]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// OMP-DEfAULT: omp_offload.failed:
+// OMP-DEfAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EED1Ev_l225(i32 [[TMP3]]) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// OMP-DEfAULT: omp_offload.cont:
+// OMP-DEfAULT-NEXT: [[TMP24:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP25:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP24]], ptr [[TMP25]], align 4
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EED1Ev_l225
+// OMP-DEfAULT-SAME: (i32 noundef [[A:%.*]]) #[[ATTR3]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EED1Ev_l225.omp_outlined, i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EED1Ev_l225.omp_outlined
+// OMP-DEfAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]]) #[[ATTR4]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// OMP-DEfAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// OMP-DEfAULT: cond.true:
+// OMP-DEfAULT-NEXT: br label [[COND_END:%.*]]
+// OMP-DEfAULT: cond.false:
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: br label [[COND_END]]
+// OMP-DEfAULT: cond.end:
+// OMP-DEfAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// OMP-DEfAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// OMP-DEfAULT: omp.inner.for.cond:
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// OMP-DEfAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// OMP-DEfAULT: omp.inner.for.body:
+// OMP-DEfAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// OMP-DEfAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// OMP-DEfAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// OMP-DEfAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 118
+// OMP-DEfAULT-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// OMP-DEfAULT: omp.body.continue:
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// OMP-DEfAULT: omp.inner.for.inc:
+// OMP-DEfAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// OMP-DEfAULT-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// OMP-DEfAULT: omp.inner.for.end:
+// OMP-DEfAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// OMP-DEfAULT: omp.loop.exit:
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init.18
+// OMP-DEfAULT-SAME: () #[[ATTR0]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: call void @_ZN2STILi1000EEC1Ev(ptr noundef nonnull align 4 dereferenceable(4512) @t2)
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2STILi1000EED1Ev, ptr @t2, ptr @__dso_handle) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2STILi1000EEC1Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: call void @_ZN2STILi1000EEC2Ev(ptr noundef nonnull align 4 dereferenceable(4512) [[THIS1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2STILi1000EED1Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: call void @_ZN2STILi1000EED2Ev(ptr noundef nonnull align 4 dereferenceable(4512) [[THIS1]]) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2STILi1000EEC2Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP5]], align 4
+// OMP-DEfAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP6]], align 4
+// OMP-DEfAULT-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// OMP-DEfAULT-NEXT: store i32 1, ptr [[TMP10]], align 4
+// OMP-DEfAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// OMP-DEfAULT-NEXT: store ptr [[TMP7]], ptr [[TMP11]], align 4
+// OMP-DEfAULT-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// OMP-DEfAULT-NEXT: store ptr [[TMP8]], ptr [[TMP12]], align 4
+// OMP-DEfAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// OMP-DEfAULT-NEXT: store ptr @.offload_sizes.19, ptr [[TMP13]], align 4
+// OMP-DEfAULT-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// OMP-DEfAULT-NEXT: store ptr @.offload_maptypes.20, ptr [[TMP14]], align 4
+// OMP-DEfAULT-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP15]], align 4
+// OMP-DEfAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP16]], align 4
+// OMP-DEfAULT-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// OMP-DEfAULT-NEXT: store i64 0, ptr [[TMP17]], align 8
+// OMP-DEfAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// OMP-DEfAULT-NEXT: store i64 0, ptr [[TMP18]], align 8
+// OMP-DEfAULT-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// OMP-DEfAULT-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP19]], align 4
+// OMP-DEfAULT-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// OMP-DEfAULT-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP20]], align 4
+// OMP-DEfAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[TMP21]], align 4
+// OMP-DEfAULT-NEXT: [[TMP22:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2]], i64 -1, i32 1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EEC1Ev_l218.region_id, ptr [[KERNEL_ARGS]])
+// OMP-DEfAULT-NEXT: [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// OMP-DEfAULT-NEXT: br i1 [[TMP23]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// OMP-DEfAULT: omp_offload.failed:
+// OMP-DEfAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EEC1Ev_l218(i32 [[TMP3]]) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// OMP-DEfAULT: omp_offload.cont:
+// OMP-DEfAULT-NEXT: [[TMP24:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP25:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP24]], ptr [[TMP25]], align 4
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EEC1Ev_l218
+// OMP-DEfAULT-SAME: (i32 noundef [[A:%.*]]) #[[ATTR3]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EEC1Ev_l218.omp_outlined, i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EEC1Ev_l218.omp_outlined
+// OMP-DEfAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]]) #[[ATTR4]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// OMP-DEfAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// OMP-DEfAULT: cond.true:
+// OMP-DEfAULT-NEXT: br label [[COND_END:%.*]]
+// OMP-DEfAULT: cond.false:
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: br label [[COND_END]]
+// OMP-DEfAULT: cond.end:
+// OMP-DEfAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// OMP-DEfAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// OMP-DEfAULT: omp.inner.for.cond:
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// OMP-DEfAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// OMP-DEfAULT: omp.inner.for.body:
+// OMP-DEfAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// OMP-DEfAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// OMP-DEfAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// OMP-DEfAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 1017
+// OMP-DEfAULT-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// OMP-DEfAULT: omp.body.continue:
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// OMP-DEfAULT: omp.inner.for.inc:
+// OMP-DEfAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// OMP-DEfAULT-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// OMP-DEfAULT: omp.inner.for.end:
+// OMP-DEfAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// OMP-DEfAULT: omp.loop.exit:
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2STILi1000EED2Ev
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP5]], align 4
+// OMP-DEfAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP6]], align 4
+// OMP-DEfAULT-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// OMP-DEfAULT-NEXT: store i32 1, ptr [[TMP10]], align 4
+// OMP-DEfAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// OMP-DEfAULT-NEXT: store ptr [[TMP7]], ptr [[TMP11]], align 4
+// OMP-DEfAULT-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// OMP-DEfAULT-NEXT: store ptr [[TMP8]], ptr [[TMP12]], align 4
+// OMP-DEfAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// OMP-DEfAULT-NEXT: store ptr @.offload_sizes.21, ptr [[TMP13]], align 4
+// OMP-DEfAULT-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// OMP-DEfAULT-NEXT: store ptr @.offload_maptypes.22, ptr [[TMP14]], align 4
+// OMP-DEfAULT-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP15]], align 4
+// OMP-DEfAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP16]], align 4
+// OMP-DEfAULT-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// OMP-DEfAULT-NEXT: store i64 0, ptr [[TMP17]], align 8
+// OMP-DEfAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// OMP-DEfAULT-NEXT: store i64 0, ptr [[TMP18]], align 8
+// OMP-DEfAULT-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// OMP-DEfAULT-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP19]], align 4
+// OMP-DEfAULT-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// OMP-DEfAULT-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP20]], align 4
+// OMP-DEfAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[TMP21]], align 4
+// OMP-DEfAULT-NEXT: [[TMP22:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2]], i64 -1, i32 1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EED1Ev_l225.region_id, ptr [[KERNEL_ARGS]])
+// OMP-DEfAULT-NEXT: [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// OMP-DEfAULT-NEXT: br i1 [[TMP23]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// OMP-DEfAULT: omp_offload.failed:
+// OMP-DEfAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EED1Ev_l225(i32 [[TMP3]]) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// OMP-DEfAULT: omp_offload.cont:
+// OMP-DEfAULT-NEXT: [[TMP24:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP25:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP24]], ptr [[TMP25]], align 4
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EED1Ev_l225
+// OMP-DEfAULT-SAME: (i32 noundef [[A:%.*]]) #[[ATTR3]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EED1Ev_l225.omp_outlined, i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EED1Ev_l225.omp_outlined
+// OMP-DEfAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]]) #[[ATTR4]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// OMP-DEfAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// OMP-DEfAULT: cond.true:
+// OMP-DEfAULT-NEXT: br label [[COND_END:%.*]]
+// OMP-DEfAULT: cond.false:
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: br label [[COND_END]]
+// OMP-DEfAULT: cond.end:
+// OMP-DEfAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// OMP-DEfAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// OMP-DEfAULT: omp.inner.for.cond:
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// OMP-DEfAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// OMP-DEfAULT: omp.inner.for.body:
+// OMP-DEfAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// OMP-DEfAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// OMP-DEfAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// OMP-DEfAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 1018
+// OMP-DEfAULT-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// OMP-DEfAULT: omp.body.continue:
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// OMP-DEfAULT: omp.inner.for.inc:
+// OMP-DEfAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// OMP-DEfAULT-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// OMP-DEfAULT: omp.inner.for.end:
+// OMP-DEfAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// OMP-DEfAULT: omp.loop.exit:
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_Z3bari
+// OMP-DEfAULT-SAME: (i32 noundef [[A:%.*]]) #[[ATTR5:[0-9]+]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[R:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[R_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// OMP-DEfAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP0]], ptr [[R]], align 4
+// OMP-DEfAULT-NEXT: call void @_ZN2SA3fooEv(ptr noundef nonnull align 4 dereferenceable(16) @_ZL2a1)
+// OMP-DEfAULT-NEXT: call void @_ZN2SA3fooEv(ptr noundef nonnull align 4 dereferenceable(16) @a2)
+// OMP-DEfAULT-NEXT: call void @_ZN2SB3fooEv(ptr noundef nonnull align 4 dereferenceable(32) @b1)
+// OMP-DEfAULT-NEXT: call void @_ZN2SB3fooEv(ptr noundef nonnull align 4 dereferenceable(32) @b2)
+// OMP-DEfAULT-NEXT: call void @_ZN2SC3fooEv(ptr noundef nonnull align 4 dereferenceable(64) @_ZL2c1)
+// OMP-DEfAULT-NEXT: call void @_ZN2SD3fooEv(ptr noundef nonnull align 4 dereferenceable(128) @d1)
+// OMP-DEfAULT-NEXT: call void @_ZN2SE3fooEv(ptr noundef nonnull align 4 dereferenceable(256) @e1)
+// OMP-DEfAULT-NEXT: call void @_ZN2STILi100EE3fooEv(ptr noundef nonnull align 4 dereferenceable(912) @t1)
+// OMP-DEfAULT-NEXT: call void @_ZN2STILi1000EE3fooEv(ptr noundef nonnull align 4 dereferenceable(4512) @t2)
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[R]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP1]], ptr [[R_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[R_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 [[TMP2]], ptr [[TMP3]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 [[TMP2]], ptr [[TMP4]], align 4
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP5]], align 4
+// OMP-DEfAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 2, ptr [[TMP8]], align 4
+// OMP-DEfAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// OMP-DEfAULT-NEXT: store i32 1, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// OMP-DEfAULT-NEXT: store ptr [[TMP6]], ptr [[TMP10]], align 4
+// OMP-DEfAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// OMP-DEfAULT-NEXT: store ptr [[TMP7]], ptr [[TMP11]], align 4
+// OMP-DEfAULT-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// OMP-DEfAULT-NEXT: store ptr @.offload_sizes.23, ptr [[TMP12]], align 4
+// OMP-DEfAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// OMP-DEfAULT-NEXT: store ptr @.offload_maptypes.24, ptr [[TMP13]], align 4
+// OMP-DEfAULT-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP14]], align 4
+// OMP-DEfAULT-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP15]], align 4
+// OMP-DEfAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// OMP-DEfAULT-NEXT: store i64 0, ptr [[TMP16]], align 8
+// OMP-DEfAULT-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// OMP-DEfAULT-NEXT: store i64 0, ptr [[TMP17]], align 8
+// OMP-DEfAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// OMP-DEfAULT-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP18]], align 4
+// OMP-DEfAULT-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// OMP-DEfAULT-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP19]], align 4
+// OMP-DEfAULT-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[TMP20]], align 4
+// OMP-DEfAULT-NEXT: [[TMP21:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2]], i64 -1, i32 1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3bari_l267.region_id, ptr [[KERNEL_ARGS]])
+// OMP-DEfAULT-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
+// OMP-DEfAULT-NEXT: br i1 [[TMP22]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// OMP-DEfAULT: omp_offload.failed:
+// OMP-DEfAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3bari_l267(i32 [[TMP2]]) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// OMP-DEfAULT: omp_offload.cont:
+// OMP-DEfAULT-NEXT: [[TMP23:%.*]] = load i32, ptr [[R]], align 4
+// OMP-DEfAULT-NEXT: [[TMP24:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4
+// OMP-DEfAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], [[TMP25]]
+// OMP-DEfAULT-NEXT: ret i32 [[ADD]]
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SA3fooEv
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) #[[ATTR5]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 1
+// OMP-DEfAULT-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SB3fooEv
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) #[[ATTR5]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP5]], align 4
+// OMP-DEfAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP6]], align 4
+// OMP-DEfAULT-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// OMP-DEfAULT-NEXT: store i32 1, ptr [[TMP10]], align 4
+// OMP-DEfAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// OMP-DEfAULT-NEXT: store ptr [[TMP7]], ptr [[TMP11]], align 4
+// OMP-DEfAULT-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// OMP-DEfAULT-NEXT: store ptr [[TMP8]], ptr [[TMP12]], align 4
+// OMP-DEfAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// OMP-DEfAULT-NEXT: store ptr @.offload_sizes.25, ptr [[TMP13]], align 4
+// OMP-DEfAULT-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// OMP-DEfAULT-NEXT: store ptr @.offload_maptypes.26, ptr [[TMP14]], align 4
+// OMP-DEfAULT-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP15]], align 4
+// OMP-DEfAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP16]], align 4
+// OMP-DEfAULT-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// OMP-DEfAULT-NEXT: store i64 0, ptr [[TMP17]], align 8
+// OMP-DEfAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// OMP-DEfAULT-NEXT: store i64 0, ptr [[TMP18]], align 8
+// OMP-DEfAULT-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// OMP-DEfAULT-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP19]], align 4
+// OMP-DEfAULT-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// OMP-DEfAULT-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP20]], align 4
+// OMP-DEfAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[TMP21]], align 4
+// OMP-DEfAULT-NEXT: [[TMP22:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2]], i64 -1, i32 1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SB3fooEv_l122.region_id, ptr [[KERNEL_ARGS]])
+// OMP-DEfAULT-NEXT: [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// OMP-DEfAULT-NEXT: br i1 [[TMP23]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// OMP-DEfAULT: omp_offload.failed:
+// OMP-DEfAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SB3fooEv_l122(i32 [[TMP3]]) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// OMP-DEfAULT: omp_offload.cont:
+// OMP-DEfAULT-NEXT: [[TMP24:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP25:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP24]], ptr [[TMP25]], align 4
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SC3fooEv
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) #[[ATTR5]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 7
+// OMP-DEfAULT-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SD3fooEv
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) #[[ATTR5]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 10
+// OMP-DEfAULT-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2SE3fooEv
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) #[[ATTR5]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SE3fooEv_l185(i32 [[TMP3]]) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2STILi100EE3fooEv
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) #[[ATTR5]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP5]], align 4
+// OMP-DEfAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP6]], align 4
+// OMP-DEfAULT-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// OMP-DEfAULT-NEXT: store i32 1, ptr [[TMP10]], align 4
+// OMP-DEfAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// OMP-DEfAULT-NEXT: store ptr [[TMP7]], ptr [[TMP11]], align 4
+// OMP-DEfAULT-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// OMP-DEfAULT-NEXT: store ptr [[TMP8]], ptr [[TMP12]], align 4
+// OMP-DEfAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// OMP-DEfAULT-NEXT: store ptr @.offload_sizes.27, ptr [[TMP13]], align 4
+// OMP-DEfAULT-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// OMP-DEfAULT-NEXT: store ptr @.offload_maptypes.28, ptr [[TMP14]], align 4
+// OMP-DEfAULT-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP15]], align 4
+// OMP-DEfAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP16]], align 4
+// OMP-DEfAULT-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// OMP-DEfAULT-NEXT: store i64 0, ptr [[TMP17]], align 8
+// OMP-DEfAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// OMP-DEfAULT-NEXT: store i64 0, ptr [[TMP18]], align 8
+// OMP-DEfAULT-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// OMP-DEfAULT-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP19]], align 4
+// OMP-DEfAULT-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// OMP-DEfAULT-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP20]], align 4
+// OMP-DEfAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[TMP21]], align 4
+// OMP-DEfAULT-NEXT: [[TMP22:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2]], i64 -1, i32 1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EE3fooEv_l211.region_id, ptr [[KERNEL_ARGS]])
+// OMP-DEfAULT-NEXT: [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// OMP-DEfAULT-NEXT: br i1 [[TMP23]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// OMP-DEfAULT: omp_offload.failed:
+// OMP-DEfAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EE3fooEv_l211(i32 [[TMP3]]) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// OMP-DEfAULT: omp_offload.cont:
+// OMP-DEfAULT-NEXT: [[TMP24:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP25:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP24]], ptr [[TMP25]], align 4
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_ZN2STILi1000EE3fooEv
+// OMP-DEfAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) #[[ATTR5]] comdat align 2 {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
+// OMP-DEfAULT-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// OMP-DEfAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 [[TMP3]], ptr [[TMP5]], align 4
+// OMP-DEfAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP6]], align 4
+// OMP-DEfAULT-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// OMP-DEfAULT-NEXT: store i32 2, ptr [[TMP9]], align 4
+// OMP-DEfAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// OMP-DEfAULT-NEXT: store i32 1, ptr [[TMP10]], align 4
+// OMP-DEfAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// OMP-DEfAULT-NEXT: store ptr [[TMP7]], ptr [[TMP11]], align 4
+// OMP-DEfAULT-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// OMP-DEfAULT-NEXT: store ptr [[TMP8]], ptr [[TMP12]], align 4
+// OMP-DEfAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// OMP-DEfAULT-NEXT: store ptr @.offload_sizes.29, ptr [[TMP13]], align 4
+// OMP-DEfAULT-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// OMP-DEfAULT-NEXT: store ptr @.offload_maptypes.30, ptr [[TMP14]], align 4
+// OMP-DEfAULT-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP15]], align 4
+// OMP-DEfAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// OMP-DEfAULT-NEXT: store ptr null, ptr [[TMP16]], align 4
+// OMP-DEfAULT-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// OMP-DEfAULT-NEXT: store i64 0, ptr [[TMP17]], align 8
+// OMP-DEfAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// OMP-DEfAULT-NEXT: store i64 0, ptr [[TMP18]], align 8
+// OMP-DEfAULT-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// OMP-DEfAULT-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP19]], align 4
+// OMP-DEfAULT-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// OMP-DEfAULT-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP20]], align 4
+// OMP-DEfAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[TMP21]], align 4
+// OMP-DEfAULT-NEXT: [[TMP22:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2]], i64 -1, i32 1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EE3fooEv_l211.region_id, ptr [[KERNEL_ARGS]])
+// OMP-DEfAULT-NEXT: [[TMP23:%.*]] = icmp ne i32 [[TMP22]], 0
+// OMP-DEfAULT-NEXT: br i1 [[TMP23]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// OMP-DEfAULT: omp_offload.failed:
+// OMP-DEfAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EE3fooEv_l211(i32 [[TMP3]]) #[[ATTR2]]
+// OMP-DEfAULT-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// OMP-DEfAULT: omp_offload.cont:
+// OMP-DEfAULT-NEXT: [[TMP24:%.*]] = load i32, ptr [[A]], align 4
+// OMP-DEfAULT-NEXT: [[TMP25:%.*]] = load ptr, ptr @R, align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP24]], ptr [[TMP25]], align 4
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3bari_l267
+// OMP-DEfAULT-SAME: (i32 noundef [[R:%.*]]) #[[ATTR3]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[R_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[R_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store i32 [[R]], ptr [[R_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[R_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP0]], ptr [[R_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[R_CASTED]], align 4
+// OMP-DEfAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3bari_l267.omp_outlined, i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3bari_l267.omp_outlined
+// OMP-DEfAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[R:%.*]]) #[[ATTR4]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[R_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[R]], ptr [[R_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// OMP-DEfAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// OMP-DEfAULT: cond.true:
+// OMP-DEfAULT-NEXT: br label [[COND_END:%.*]]
+// OMP-DEfAULT: cond.false:
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: br label [[COND_END]]
+// OMP-DEfAULT: cond.end:
+// OMP-DEfAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// OMP-DEfAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// OMP-DEfAULT: omp.inner.for.cond:
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// OMP-DEfAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// OMP-DEfAULT: omp.inner.for.body:
+// OMP-DEfAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// OMP-DEfAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// OMP-DEfAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// OMP-DEfAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[R_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1
+// OMP-DEfAULT-NEXT: store i32 [[INC]], ptr [[R_ADDR]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// OMP-DEfAULT: omp.body.continue:
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// OMP-DEfAULT: omp.inner.for.inc:
+// OMP-DEfAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP9]], 1
+// OMP-DEfAULT-NEXT: store i32 [[ADD2]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// OMP-DEfAULT: omp.inner.for.end:
+// OMP-DEfAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// OMP-DEfAULT: omp.loop.exit:
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SB3fooEv_l122
+// OMP-DEfAULT-SAME: (i32 noundef [[A:%.*]]) #[[ATTR3]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SB3fooEv_l122.omp_outlined, i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SB3fooEv_l122.omp_outlined
+// OMP-DEfAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]]) #[[ATTR4]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// OMP-DEfAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// OMP-DEfAULT: cond.true:
+// OMP-DEfAULT-NEXT: br label [[COND_END:%.*]]
+// OMP-DEfAULT: cond.false:
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: br label [[COND_END]]
+// OMP-DEfAULT: cond.end:
+// OMP-DEfAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// OMP-DEfAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// OMP-DEfAULT: omp.inner.for.cond:
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// OMP-DEfAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// OMP-DEfAULT: omp.inner.for.body:
+// OMP-DEfAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// OMP-DEfAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// OMP-DEfAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// OMP-DEfAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 4
+// OMP-DEfAULT-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// OMP-DEfAULT: omp.body.continue:
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// OMP-DEfAULT: omp.inner.for.inc:
+// OMP-DEfAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// OMP-DEfAULT-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// OMP-DEfAULT: omp.inner.for.end:
+// OMP-DEfAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// OMP-DEfAULT: omp.loop.exit:
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SE3fooEv_l185
+// OMP-DEfAULT-SAME: (i32 noundef [[A:%.*]]) #[[ATTR4]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SE3fooEv_l185.omp_outlined, i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SE3fooEv_l185.omp_outlined
+// OMP-DEfAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]]) #[[ATTR4]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// OMP-DEfAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// OMP-DEfAULT: cond.true:
+// OMP-DEfAULT-NEXT: br label [[COND_END:%.*]]
+// OMP-DEfAULT: cond.false:
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: br label [[COND_END]]
+// OMP-DEfAULT: cond.end:
+// OMP-DEfAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// OMP-DEfAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// OMP-DEfAULT: omp.inner.for.cond:
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// OMP-DEfAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// OMP-DEfAULT: omp.inner.for.body:
+// OMP-DEfAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// OMP-DEfAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// OMP-DEfAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// OMP-DEfAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 13
+// OMP-DEfAULT-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// OMP-DEfAULT: omp.body.continue:
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// OMP-DEfAULT: omp.inner.for.inc:
+// OMP-DEfAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// OMP-DEfAULT-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// OMP-DEfAULT: omp.inner.for.end:
+// OMP-DEfAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// OMP-DEfAULT: omp.loop.exit:
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EE3fooEv_l211
+// OMP-DEfAULT-SAME: (i32 noundef [[A:%.*]]) #[[ATTR3]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EE3fooEv_l211.omp_outlined, i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EE3fooEv_l211.omp_outlined
+// OMP-DEfAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]]) #[[ATTR4]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// OMP-DEfAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// OMP-DEfAULT: cond.true:
+// OMP-DEfAULT-NEXT: br label [[COND_END:%.*]]
+// OMP-DEfAULT: cond.false:
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: br label [[COND_END]]
+// OMP-DEfAULT: cond.end:
+// OMP-DEfAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// OMP-DEfAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// OMP-DEfAULT: omp.inner.for.cond:
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// OMP-DEfAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// OMP-DEfAULT: omp.inner.for.body:
+// OMP-DEfAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// OMP-DEfAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// OMP-DEfAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// OMP-DEfAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 116
+// OMP-DEfAULT-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// OMP-DEfAULT: omp.body.continue:
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// OMP-DEfAULT: omp.inner.for.inc:
+// OMP-DEfAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// OMP-DEfAULT-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// OMP-DEfAULT: omp.inner.for.end:
+// OMP-DEfAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// OMP-DEfAULT: omp.loop.exit:
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EE3fooEv_l211
+// OMP-DEfAULT-SAME: (i32 noundef [[A:%.*]]) #[[ATTR3]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// OMP-DEfAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EE3fooEv_l211.omp_outlined, i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EE3fooEv_l211.omp_outlined
+// OMP-DEfAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]]) #[[ATTR4]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// OMP-DEfAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// OMP-DEfAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// OMP-DEfAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// OMP-DEfAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// OMP-DEfAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// OMP-DEfAULT: cond.true:
+// OMP-DEfAULT-NEXT: br label [[COND_END:%.*]]
+// OMP-DEfAULT: cond.false:
+// OMP-DEfAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: br label [[COND_END]]
+// OMP-DEfAULT: cond.end:
+// OMP-DEfAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// OMP-DEfAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// OMP-DEfAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// OMP-DEfAULT: omp.inner.for.cond:
+// OMP-DEfAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// OMP-DEfAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// OMP-DEfAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// OMP-DEfAULT: omp.inner.for.body:
+// OMP-DEfAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// OMP-DEfAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// OMP-DEfAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// OMP-DEfAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 1016
+// OMP-DEfAULT-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// OMP-DEfAULT: omp.body.continue:
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// OMP-DEfAULT: omp.inner.for.inc:
+// OMP-DEfAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// OMP-DEfAULT-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// OMP-DEfAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// OMP-DEfAULT: omp.inner.for.end:
+// OMP-DEfAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// OMP-DEfAULT: omp.loop.exit:
+// OMP-DEfAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_GLOBAL__I_000500
+// OMP-DEfAULT-SAME: () #[[ATTR0]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: call void @__cxx_global_var_init()
+// OMP-DEfAULT-NEXT: call void @__cxx_global_var_init.2()
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_GLOBAL__I_000501
+// OMP-DEfAULT-SAME: () #[[ATTR0]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: call void @__cxx_global_var_init.3()
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_target_parallel_generic_loop_codegen_1.cpp
+// OMP-DEfAULT-SAME: () #[[ATTR0]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: call void @__cxx_global_var_init.1()
+// OMP-DEfAULT-NEXT: call void @__cxx_global_var_init.4()
+// OMP-DEfAULT-NEXT: call void @__cxx_global_var_init.5()
+// OMP-DEfAULT-NEXT: call void @__cxx_global_var_init.8()
+// OMP-DEfAULT-NEXT: call void @__cxx_global_var_init.13()
+// OMP-DEfAULT-NEXT: call void @__cxx_global_var_init.18()
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+// OMP-DEfAULT-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// OMP-DEfAULT-SAME: () #[[ATTR0]] {
+// OMP-DEfAULT-NEXT: entry:
+// OMP-DEfAULT-NEXT: call void @__tgt_register_requires(i64 1)
+// OMP-DEfAULT-NEXT: ret void
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init
+// CHECK-NTARGET-OMP-DEFAULT-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2SAC1Ev(ptr noundef nonnull align 4 dereferenceable(16) @_ZL2a1)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SAD1Ev, ptr @_ZL2a1, ptr @__dso_handle) #[[ATTR2:[0-9]+]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SAC1Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2SAC2Ev(ptr noundef nonnull align 4 dereferenceable(16) [[THIS1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SAD1Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2SAD2Ev(ptr noundef nonnull align 4 dereferenceable(16) [[THIS1]]) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SAC2Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 2
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SAD2Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 3
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init.1
+// CHECK-NTARGET-OMP-DEFAULT-SAME: () #[[ATTR0]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2SAC1Ev(ptr noundef nonnull align 4 dereferenceable(16) @a2)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SAD1Ev, ptr @a2, ptr @__dso_handle) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init.2
+// CHECK-NTARGET-OMP-DEFAULT-SAME: () #[[ATTR0]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2SBC1Ev(ptr noundef nonnull align 4 dereferenceable(32) @b1)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SBD1Ev, ptr @b1, ptr @__dso_handle) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SBC1Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2SBC2Ev(ptr noundef nonnull align 4 dereferenceable(32) [[THIS1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SBD1Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2SBD2Ev(ptr noundef nonnull align 4 dereferenceable(32) [[THIS1]]) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SBC2Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 5
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SBD2Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 6
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init.3
+// CHECK-NTARGET-OMP-DEFAULT-SAME: () #[[ATTR0]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2SBC1Ev(ptr noundef nonnull align 4 dereferenceable(32) @b2)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SBD1Ev, ptr @b2, ptr @__dso_handle) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init.4
+// CHECK-NTARGET-OMP-DEFAULT-SAME: () #[[ATTR0]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2SCC1Ev(ptr noundef nonnull align 4 dereferenceable(64) @_ZL2c1)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SCD1Ev, ptr @_ZL2c1, ptr @__dso_handle) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SCC1Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2SCC2Ev(ptr noundef nonnull align 4 dereferenceable(64) [[THIS1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SCD1Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2SCD2Ev(ptr noundef nonnull align 4 dereferenceable(64) [[THIS1]]) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SCC2Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SCC1Ev_l148(i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SCC1Ev_l148
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (i64 noundef [[A:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2:[0-9]+]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SCC1Ev_l148.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SCC1Ev_l148.omp_outlined
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.true:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.false:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.cond:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.body:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.body.continue:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.inc:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.loop.exit:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SCD2Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 9
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init.5
+// CHECK-NTARGET-OMP-DEFAULT-SAME: () #[[ATTR0]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2SDC1Ev(ptr noundef nonnull align 4 dereferenceable(128) @d1)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SDD1Ev, ptr @d1, ptr @__dso_handle) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SDC1Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2SDC2Ev(ptr noundef nonnull align 4 dereferenceable(128) [[THIS1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SDD1Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2SDD2Ev(ptr noundef nonnull align 4 dereferenceable(128) [[THIS1]]) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SDC2Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 11
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SDD2Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SDD1Ev_l174(i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SDD1Ev_l174
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (i64 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SDD1Ev_l174.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SDD1Ev_l174.omp_outlined
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.true:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.false:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.cond:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.body:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 12
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.body.continue:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.inc:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.loop.exit:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init.6
+// CHECK-NTARGET-OMP-DEFAULT-SAME: () #[[ATTR0]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2SEC1Ev(ptr noundef nonnull align 4 dereferenceable(256) @e1)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2SED1Ev, ptr @e1, ptr @__dso_handle) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SEC1Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2SEC2Ev(ptr noundef nonnull align 4 dereferenceable(256) [[THIS1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SED1Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2SED2Ev(ptr noundef nonnull align 4 dereferenceable(256) [[THIS1]]) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SEC2Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SEC1Ev_l192(i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SEC1Ev_l192
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (i64 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SEC1Ev_l192.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SEC1Ev_l192.omp_outlined
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.true:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.false:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.cond:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.body:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 14
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.body.continue:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.inc:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.loop.exit:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SED2Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SED1Ev_l199(i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SED1Ev_l199
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (i64 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SED1Ev_l199.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SED1Ev_l199.omp_outlined
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.true:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.false:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.cond:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.body:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 15
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.body.continue:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.inc:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.loop.exit:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init.7
+// CHECK-NTARGET-OMP-DEFAULT-SAME: () #[[ATTR0]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2STILi100EEC1Ev(ptr noundef nonnull align 4 dereferenceable(912) @t1)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2STILi100EED1Ev, ptr @t1, ptr @__dso_handle) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2STILi100EEC1Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2STILi100EEC2Ev(ptr noundef nonnull align 4 dereferenceable(912) [[THIS1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2STILi100EED1Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2STILi100EED2Ev(ptr noundef nonnull align 4 dereferenceable(912) [[THIS1]]) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2STILi100EEC2Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EEC1Ev_l218(i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EEC1Ev_l218
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (i64 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EEC1Ev_l218.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EEC1Ev_l218.omp_outlined
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.true:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.false:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.cond:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.body:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 117
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.body.continue:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.inc:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.loop.exit:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2STILi100EED2Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EED1Ev_l225(i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EED1Ev_l225
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (i64 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EED1Ev_l225.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EED1Ev_l225.omp_outlined
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.true:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.false:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.cond:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.body:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 118
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.body.continue:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.inc:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.loop.exit:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@__cxx_global_var_init.8
+// CHECK-NTARGET-OMP-DEFAULT-SAME: () #[[ATTR0]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2STILi1000EEC1Ev(ptr noundef nonnull align 4 dereferenceable(4512) @t2)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN2STILi1000EED1Ev, ptr @t2, ptr @__dso_handle) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2STILi1000EEC1Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2STILi1000EEC2Ev(ptr noundef nonnull align 4 dereferenceable(4512) [[THIS1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2STILi1000EED1Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2STILi1000EED2Ev(ptr noundef nonnull align 4 dereferenceable(4512) [[THIS1]]) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2STILi1000EEC2Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EEC1Ev_l218(i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EEC1Ev_l218
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (i64 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EEC1Ev_l218.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EEC1Ev_l218.omp_outlined
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.true:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.false:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.cond:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.body:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 1017
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.body.continue:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.inc:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.loop.exit:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2STILi1000EED2Ev
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EED1Ev_l225(i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EED1Ev_l225
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (i64 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EED1Ev_l225.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EED1Ev_l225.omp_outlined
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.true:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.false:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.cond:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.body:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 1018
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.body.continue:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.inc:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.loop.exit:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_Z3bari
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (i32 noundef signext [[A:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[R:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[R_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP0]], ptr [[R]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2SA3fooEv(ptr noundef nonnull align 4 dereferenceable(16) @_ZL2a1)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2SA3fooEv(ptr noundef nonnull align 4 dereferenceable(16) @a2)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2SB3fooEv(ptr noundef nonnull align 4 dereferenceable(32) @b1)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2SB3fooEv(ptr noundef nonnull align 4 dereferenceable(32) @b2)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2SC3fooEv(ptr noundef nonnull align 4 dereferenceable(64) @_ZL2c1)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2SD3fooEv(ptr noundef nonnull align 4 dereferenceable(128) @d1)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2SE3fooEv(ptr noundef nonnull align 4 dereferenceable(256) @e1)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2STILi100EE3fooEv(ptr noundef nonnull align 4 dereferenceable(912) @t1)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @_ZN2STILi1000EE3fooEv(ptr noundef nonnull align 4 dereferenceable(4512) @t2)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[R]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP1]], ptr [[R_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i64, ptr [[R_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3bari_l267(i64 [[TMP2]]) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[R]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], [[TMP5]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret i32 [[ADD]]
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SA3fooEv
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(16) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SB3fooEv
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(32) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SB3fooEv_l122(i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SC3fooEv
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(64) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 7
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SD3fooEv
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(128) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 10
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2SE3fooEv
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SE3fooEv_l185(i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2STILi100EE3fooEv
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(912) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EE3fooEv_l211(i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_ZN2STILi1000EE3fooEv
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noundef nonnull align 4 dereferenceable(4512) [[THIS:%.*]]) #[[ATTR4]] comdat align 2 {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP1]], ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP2]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EE3fooEv_l211(i64 [[TMP3]]) #[[ATTR2]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load ptr, ptr @R, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3bari_l267
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (i64 noundef [[R:%.*]]) #[[ATTR3]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[R_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[R_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[R]], ptr [[R_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[R_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP0]], ptr [[R_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i64, ptr [[R_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3bari_l267.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3bari_l267.omp_outlined
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[R:%.*]]) #[[ATTR3]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[R_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[R]], ptr [[R_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.true:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.false:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.cond:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.body:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[R_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[INC]], ptr [[R_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.body.continue:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.inc:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD2]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.loop.exit:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SB3fooEv_l122
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (i64 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SB3fooEv_l122.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SB3fooEv_l122.omp_outlined
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.true:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.false:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.cond:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.body:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.body.continue:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.inc:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.loop.exit:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SE3fooEv_l185
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (i64 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SE3fooEv_l185.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SE3fooEv_l185.omp_outlined
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.true:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.false:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.cond:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.body:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 13
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.body.continue:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.inc:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.loop.exit:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EE3fooEv_l211
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (i64 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EE3fooEv_l211.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi100EE3fooEv_l211.omp_outlined
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.true:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.false:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.cond:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.body:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 116
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.body.continue:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.inc:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.loop.exit:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EE3fooEv_l211
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (i64 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EE3fooEv_l211.omp_outlined, i64 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2STILi1000EE3fooEv_l211.omp_outlined
+// CHECK-NTARGET-OMP-DEFAULT-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.true:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.false:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[COND_END]]
+// CHECK-NTARGET-OMP-DEFAULT: cond.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.cond:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.body:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 1016
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD2]], ptr [[A_ADDR]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.body.continue:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.inc:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.inner.for.end:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-NTARGET-OMP-DEFAULT: omp.loop.exit:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_GLOBAL__I_000500
+// CHECK-NTARGET-OMP-DEFAULT-SAME: () #[[ATTR0]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__cxx_global_var_init()
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__cxx_global_var_init.2()
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_GLOBAL__I_000501
+// CHECK-NTARGET-OMP-DEFAULT-SAME: () #[[ATTR0]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__cxx_global_var_init.3()
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//
+// CHECK-NTARGET-OMP-DEFAULT-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_target_parallel_generic_loop_codegen_1.cpp
+// CHECK-NTARGET-OMP-DEFAULT-SAME: () #[[ATTR0]] {
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: entry:
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__cxx_global_var_init.1()
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__cxx_global_var_init.4()
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__cxx_global_var_init.5()
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__cxx_global_var_init.6()
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__cxx_global_var_init.7()
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: call void @__cxx_global_var_init.8()
+// CHECK-NTARGET-OMP-DEFAULT-NEXT: ret void
+//
+//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+// TCHECK: {{.*}}
diff --git a/clang/test/OpenMP/target_parallel_generic_loop_codegen-2.cpp b/clang/test/OpenMP/target_parallel_generic_loop_codegen-2.cpp
new file mode 100644
index 00000000000000..b5041e480d87d1
--- /dev/null
+++ b/clang/test/OpenMP/target_parallel_generic_loop_codegen-2.cpp
@@ -0,0 +1,952 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 2
+// Test host codegen.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK-X86
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK-X86
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0-X86 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0-X86 %s
+
+// Test target parallel for codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -check-prefix=TCHECK-TARGET
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK-TARGET
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefix=TCHECK-TARGET-X86
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK-TARGET-X86
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1-TARGET %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1-TARGET %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1-TARGET-X86 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1-TARGET-X86 %s
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+
+int nested(int a){
+ #pragma omp target parallel loop
+ for (int i = 0; i < 10; ++i)
+ ++a;
+
+ auto F = [&](){
+ #pragma omp parallel
+ {
+ #pragma omp target parallel loop
+ for (int i = 0; i < 10; ++i)
+ ++a;
+ }
+ };
+
+ F();
+
+ return a;
+}
+
+
+
+
+
+
+// Check metadata is properly generated:
+
+#endif
+// CHECK-LABEL: define dso_local noundef signext i32 @_Z6nestedi
+// CHECK-SAME: (i32 noundef signext [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
+// CHECK-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK-NEXT: [[F:%.*]] = alloca [[CLASS_ANON:%.*]], align 8
+// CHECK-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK-NEXT: store i64 [[TMP1]], ptr [[TMP2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK-NEXT: store i64 [[TMP1]], ptr [[TMP3]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK-NEXT: store ptr null, ptr [[TMP4]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK-NEXT: store i32 2, ptr [[TMP7]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK-NEXT: store i32 1, ptr [[TMP8]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 8
+// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK-NEXT: store ptr [[TMP6]], ptr [[TMP10]], align 8
+// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK-NEXT: store ptr @.offload_sizes, ptr [[TMP11]], align 8
+// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK-NEXT: store ptr @.offload_maptypes, ptr [[TMP12]], align 8
+// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK-NEXT: store ptr null, ptr [[TMP13]], align 8
+// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK-NEXT: store ptr null, ptr [[TMP14]], align 8
+// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK-NEXT: store i64 0, ptr [[TMP15]], align 8
+// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK-NEXT: store i64 0, ptr [[TMP16]], align 8
+// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP17]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4
+// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK-NEXT: store i32 0, ptr [[TMP19]], align 4
+// CHECK-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2:[0-9]+]], i64 -1, i32 1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l42.region_id, ptr [[KERNEL_ARGS]])
+// CHECK-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
+// CHECK-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK: omp_offload.failed:
+// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l42(i64 [[TMP1]]) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK: omp_offload.cont:
+// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[F]], i32 0, i32 0
+// CHECK-NEXT: store ptr [[A_ADDR]], ptr [[TMP22]], align 8
+// CHECK-NEXT: call void @"_ZZ6nestediENK3$_0clEv"(ptr noundef nonnull align 8 dereferenceable(8) [[F]])
+// CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: ret i32 [[TMP23]]
+//
+//
+// CHECK-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l42
+// CHECK-SAME: (i64 noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l42.omp_outlined, i64 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l42.omp_outlined
+// CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK: cond.true:
+// CHECK-NEXT: br label [[COND_END:%.*]]
+// CHECK: cond.false:
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: br label [[COND_END]]
+// CHECK: cond.end:
+// CHECK-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK: omp.inner.for.cond:
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK: omp.inner.for.body:
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK-NEXT: store i32 [[INC]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK: omp.body.continue:
+// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK: omp.inner.for.inc:
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NEXT: store i32 [[ADD2]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK: omp.inner.for.end:
+// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK: omp.loop.exit:
+// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l49
+// CHECK-SAME: (i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l49.omp_outlined, i64 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l49.omp_outlined
+// CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK: cond.true:
+// CHECK-NEXT: br label [[COND_END:%.*]]
+// CHECK: cond.false:
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: br label [[COND_END]]
+// CHECK: cond.end:
+// CHECK-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK: omp.inner.for.cond:
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK: omp.inner.for.body:
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK-NEXT: store i32 [[INC]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK: omp.body.continue:
+// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK: omp.inner.for.inc:
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-NEXT: store i32 [[ADD2]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK: omp.inner.for.end:
+// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK: omp.loop.exit:
+// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define internal void @.omp_offloading.requires_reg
+// CHECK-SAME: () #[[ATTR4:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-X86-LABEL: define dso_local noundef i32 @_Z6nestedi
+// CHECK-X86-SAME: (i32 noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-X86-NEXT: entry:
+// CHECK-X86-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-X86-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// CHECK-X86-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
+// CHECK-X86-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
+// CHECK-X86-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
+// CHECK-X86-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK-X86-NEXT: [[F:%.*]] = alloca [[CLASS_ANON:%.*]], align 4
+// CHECK-X86-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-X86-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-X86-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-X86-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// CHECK-X86-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK-X86-NEXT: store i32 [[TMP1]], ptr [[TMP2]], align 4
+// CHECK-X86-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK-X86-NEXT: store i32 [[TMP1]], ptr [[TMP3]], align 4
+// CHECK-X86-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK-X86-NEXT: store ptr null, ptr [[TMP4]], align 4
+// CHECK-X86-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK-X86-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK-X86-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK-X86-NEXT: store i32 2, ptr [[TMP7]], align 4
+// CHECK-X86-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK-X86-NEXT: store i32 1, ptr [[TMP8]], align 4
+// CHECK-X86-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK-X86-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 4
+// CHECK-X86-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK-X86-NEXT: store ptr [[TMP6]], ptr [[TMP10]], align 4
+// CHECK-X86-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK-X86-NEXT: store ptr @.offload_sizes, ptr [[TMP11]], align 4
+// CHECK-X86-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK-X86-NEXT: store ptr @.offload_maptypes, ptr [[TMP12]], align 4
+// CHECK-X86-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK-X86-NEXT: store ptr null, ptr [[TMP13]], align 4
+// CHECK-X86-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK-X86-NEXT: store ptr null, ptr [[TMP14]], align 4
+// CHECK-X86-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK-X86-NEXT: store i64 0, ptr [[TMP15]], align 8
+// CHECK-X86-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK-X86-NEXT: store i64 0, ptr [[TMP16]], align 8
+// CHECK-X86-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK-X86-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP17]], align 4
+// CHECK-X86-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK-X86-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4
+// CHECK-X86-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK-X86-NEXT: store i32 0, ptr [[TMP19]], align 4
+// CHECK-X86-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB2:[0-9]+]], i64 -1, i32 1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l42.region_id, ptr [[KERNEL_ARGS]])
+// CHECK-X86-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
+// CHECK-X86-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK-X86: omp_offload.failed:
+// CHECK-X86-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l42(i32 [[TMP1]]) #[[ATTR3:[0-9]+]]
+// CHECK-X86-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK-X86: omp_offload.cont:
+// CHECK-X86-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[F]], i32 0, i32 0
+// CHECK-X86-NEXT: store ptr [[A_ADDR]], ptr [[TMP22]], align 4
+// CHECK-X86-NEXT: call void @"_ZZ6nestediENK3$_0clEv"(ptr noundef nonnull align 4 dereferenceable(4) [[F]])
+// CHECK-X86-NEXT: [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-X86-NEXT: ret i32 [[TMP23]]
+//
+//
+// CHECK-X86-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l42
+// CHECK-X86-SAME: (i32 noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-X86-NEXT: entry:
+// CHECK-X86-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-X86-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// CHECK-X86-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-X86-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-X86-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-X86-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// CHECK-X86-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l42.omp_outlined, i32 [[TMP1]])
+// CHECK-X86-NEXT: ret void
+//
+//
+// CHECK-X86-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l42.omp_outlined
+// CHECK-X86-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK-X86-NEXT: entry:
+// CHECK-X86-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-X86-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-X86-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-X86-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-X86-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-X86-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-X86-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-X86-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-X86-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-X86-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-X86-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-X86-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-X86-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-X86-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-X86-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-X86-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-X86-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-X86-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-X86-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-X86-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-X86-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-X86-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-X86: cond.true:
+// CHECK-X86-NEXT: br label [[COND_END:%.*]]
+// CHECK-X86: cond.false:
+// CHECK-X86-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-X86-NEXT: br label [[COND_END]]
+// CHECK-X86: cond.end:
+// CHECK-X86-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-X86-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-X86-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-X86-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-X86-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-X86: omp.inner.for.cond:
+// CHECK-X86-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-X86-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-X86-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-X86-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-X86: omp.inner.for.body:
+// CHECK-X86-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-X86-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-X86-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-X86-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-X86-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-X86-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK-X86-NEXT: store i32 [[INC]], ptr [[A_ADDR]], align 4
+// CHECK-X86-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-X86: omp.body.continue:
+// CHECK-X86-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-X86: omp.inner.for.inc:
+// CHECK-X86-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-X86-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-X86-NEXT: store i32 [[ADD2]], ptr [[DOTOMP_IV]], align 4
+// CHECK-X86-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-X86: omp.inner.for.end:
+// CHECK-X86-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-X86: omp.loop.exit:
+// CHECK-X86-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-X86-NEXT: ret void
+//
+//
+// CHECK-X86-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l49
+// CHECK-X86-SAME: (i32 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-X86-NEXT: entry:
+// CHECK-X86-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-X86-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// CHECK-X86-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-X86-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-X86-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// CHECK-X86-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// CHECK-X86-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l49.omp_outlined, i32 [[TMP1]])
+// CHECK-X86-NEXT: ret void
+//
+//
+// CHECK-X86-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l49.omp_outlined
+// CHECK-X86-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]]) #[[ATTR2]] {
+// CHECK-X86-NEXT: entry:
+// CHECK-X86-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-X86-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK-X86-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-X86-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-X86-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-X86-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-X86-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-X86-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-X86-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-X86-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-X86-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-X86-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-X86-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-X86-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-X86-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-X86-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-X86-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-X86-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-X86-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-X86-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-X86-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-X86-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-X86: cond.true:
+// CHECK-X86-NEXT: br label [[COND_END:%.*]]
+// CHECK-X86: cond.false:
+// CHECK-X86-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-X86-NEXT: br label [[COND_END]]
+// CHECK-X86: cond.end:
+// CHECK-X86-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-X86-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-X86-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-X86-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-X86-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-X86: omp.inner.for.cond:
+// CHECK-X86-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-X86-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-X86-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-X86-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-X86: omp.inner.for.body:
+// CHECK-X86-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-X86-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-X86-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-X86-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-X86-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-X86-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK-X86-NEXT: store i32 [[INC]], ptr [[A_ADDR]], align 4
+// CHECK-X86-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-X86: omp.body.continue:
+// CHECK-X86-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-X86: omp.inner.for.inc:
+// CHECK-X86-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-X86-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK-X86-NEXT: store i32 [[ADD2]], ptr [[DOTOMP_IV]], align 4
+// CHECK-X86-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-X86: omp.inner.for.end:
+// CHECK-X86-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-X86: omp.loop.exit:
+// CHECK-X86-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// CHECK-X86-NEXT: ret void
+//
+//
+// CHECK-X86-LABEL: define internal void @.omp_offloading.requires_reg
+// CHECK-X86-SAME: () #[[ATTR4:[0-9]+]] {
+// CHECK-X86-NEXT: entry:
+// CHECK-X86-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK-X86-NEXT: ret void
+//
+//
+// SIMD-ONLY0-LABEL: define dso_local noundef signext i32 @_Z6nestedi
+// SIMD-ONLY0-SAME: (i32 noundef signext [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// SIMD-ONLY0-NEXT: entry:
+// SIMD-ONLY0-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// SIMD-ONLY0-NEXT: [[I:%.*]] = alloca i32, align 4
+// SIMD-ONLY0-NEXT: [[F:%.*]] = alloca [[CLASS_ANON:%.*]], align 8
+// SIMD-ONLY0-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// SIMD-ONLY0-NEXT: store i32 0, ptr [[I]], align 4
+// SIMD-ONLY0-NEXT: br label [[FOR_COND:%.*]]
+// SIMD-ONLY0: for.cond:
+// SIMD-ONLY0-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY0-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 10
+// SIMD-ONLY0-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// SIMD-ONLY0: for.body:
+// SIMD-ONLY0-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// SIMD-ONLY0-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1
+// SIMD-ONLY0-NEXT: store i32 [[INC]], ptr [[A_ADDR]], align 4
+// SIMD-ONLY0-NEXT: br label [[FOR_INC:%.*]]
+// SIMD-ONLY0: for.inc:
+// SIMD-ONLY0-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY0-NEXT: [[INC1:%.*]] = add nsw i32 [[TMP2]], 1
+// SIMD-ONLY0-NEXT: store i32 [[INC1]], ptr [[I]], align 4
+// SIMD-ONLY0-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP2:![0-9]+]]
+// SIMD-ONLY0: for.end:
+// SIMD-ONLY0-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[F]], i32 0, i32 0
+// SIMD-ONLY0-NEXT: store ptr [[A_ADDR]], ptr [[TMP3]], align 8
+// SIMD-ONLY0-NEXT: call void @"_ZZ6nestediENK3$_0clEv"(ptr noundef nonnull align 8 dereferenceable(8) [[F]])
+// SIMD-ONLY0-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// SIMD-ONLY0-NEXT: ret i32 [[TMP4]]
+//
+//
+// SIMD-ONLY0-X86-LABEL: define dso_local noundef i32 @_Z6nestedi
+// SIMD-ONLY0-X86-SAME: (i32 noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// SIMD-ONLY0-X86-NEXT: entry:
+// SIMD-ONLY0-X86-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// SIMD-ONLY0-X86-NEXT: [[I:%.*]] = alloca i32, align 4
+// SIMD-ONLY0-X86-NEXT: [[F:%.*]] = alloca [[CLASS_ANON:%.*]], align 4
+// SIMD-ONLY0-X86-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// SIMD-ONLY0-X86-NEXT: store i32 0, ptr [[I]], align 4
+// SIMD-ONLY0-X86-NEXT: br label [[FOR_COND:%.*]]
+// SIMD-ONLY0-X86: for.cond:
+// SIMD-ONLY0-X86-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY0-X86-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 10
+// SIMD-ONLY0-X86-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// SIMD-ONLY0-X86: for.body:
+// SIMD-ONLY0-X86-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// SIMD-ONLY0-X86-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1
+// SIMD-ONLY0-X86-NEXT: store i32 [[INC]], ptr [[A_ADDR]], align 4
+// SIMD-ONLY0-X86-NEXT: br label [[FOR_INC:%.*]]
+// SIMD-ONLY0-X86: for.inc:
+// SIMD-ONLY0-X86-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY0-X86-NEXT: [[INC1:%.*]] = add nsw i32 [[TMP2]], 1
+// SIMD-ONLY0-X86-NEXT: store i32 [[INC1]], ptr [[I]], align 4
+// SIMD-ONLY0-X86-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
+// SIMD-ONLY0-X86: for.end:
+// SIMD-ONLY0-X86-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[F]], i32 0, i32 0
+// SIMD-ONLY0-X86-NEXT: store ptr [[A_ADDR]], ptr [[TMP3]], align 4
+// SIMD-ONLY0-X86-NEXT: call void @"_ZZ6nestediENK3$_0clEv"(ptr noundef nonnull align 4 dereferenceable(4) [[F]])
+// SIMD-ONLY0-X86-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// SIMD-ONLY0-X86-NEXT: ret i32 [[TMP4]]
+//
+//
+// TCHECK-TARGET-LABEL: define weak_odr protected void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l42
+// TCHECK-TARGET-SAME: (i64 noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// TCHECK-TARGET-NEXT: entry:
+// TCHECK-TARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// TCHECK-TARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// TCHECK-TARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// TCHECK-TARGET-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// TCHECK-TARGET-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// TCHECK-TARGET-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// TCHECK-TARGET-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2:[0-9]+]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l42.omp_outlined, i64 [[TMP1]])
+// TCHECK-TARGET-NEXT: ret void
+//
+//
+// TCHECK-TARGET-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l42.omp_outlined
+// TCHECK-TARGET-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// TCHECK-TARGET-NEXT: entry:
+// TCHECK-TARGET-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// TCHECK-TARGET-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// TCHECK-TARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// TCHECK-TARGET-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-NEXT: [[I:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// TCHECK-TARGET-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// TCHECK-TARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// TCHECK-TARGET-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// TCHECK-TARGET-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// TCHECK-TARGET-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// TCHECK-TARGET-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// TCHECK-TARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// TCHECK-TARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// TCHECK-TARGET-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// TCHECK-TARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// TCHECK-TARGET-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// TCHECK-TARGET-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// TCHECK-TARGET: cond.true:
+// TCHECK-TARGET-NEXT: br label [[COND_END:%.*]]
+// TCHECK-TARGET: cond.false:
+// TCHECK-TARGET-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// TCHECK-TARGET-NEXT: br label [[COND_END]]
+// TCHECK-TARGET: cond.end:
+// TCHECK-TARGET-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// TCHECK-TARGET-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// TCHECK-TARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// TCHECK-TARGET-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// TCHECK-TARGET-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// TCHECK-TARGET: omp.inner.for.cond:
+// TCHECK-TARGET-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// TCHECK-TARGET-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// TCHECK-TARGET-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// TCHECK-TARGET-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// TCHECK-TARGET: omp.inner.for.body:
+// TCHECK-TARGET-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// TCHECK-TARGET-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// TCHECK-TARGET-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// TCHECK-TARGET-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// TCHECK-TARGET-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// TCHECK-TARGET-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1
+// TCHECK-TARGET-NEXT: store i32 [[INC]], ptr [[A_ADDR]], align 4
+// TCHECK-TARGET-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// TCHECK-TARGET: omp.body.continue:
+// TCHECK-TARGET-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// TCHECK-TARGET: omp.inner.for.inc:
+// TCHECK-TARGET-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// TCHECK-TARGET-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP9]], 1
+// TCHECK-TARGET-NEXT: store i32 [[ADD2]], ptr [[DOTOMP_IV]], align 4
+// TCHECK-TARGET-NEXT: br label [[OMP_INNER_FOR_COND]]
+// TCHECK-TARGET: omp.inner.for.end:
+// TCHECK-TARGET-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// TCHECK-TARGET: omp.loop.exit:
+// TCHECK-TARGET-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// TCHECK-TARGET-NEXT: ret void
+//
+//
+// TCHECK-TARGET-LABEL: define weak_odr protected void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l49
+// TCHECK-TARGET-SAME: (i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// TCHECK-TARGET-NEXT: entry:
+// TCHECK-TARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// TCHECK-TARGET-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// TCHECK-TARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// TCHECK-TARGET-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// TCHECK-TARGET-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// TCHECK-TARGET-NEXT: [[TMP1:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// TCHECK-TARGET-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l49.omp_outlined, i64 [[TMP1]])
+// TCHECK-TARGET-NEXT: ret void
+//
+//
+// TCHECK-TARGET-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l49.omp_outlined
+// TCHECK-TARGET-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]]) #[[ATTR1]] {
+// TCHECK-TARGET-NEXT: entry:
+// TCHECK-TARGET-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// TCHECK-TARGET-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// TCHECK-TARGET-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// TCHECK-TARGET-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-NEXT: [[I:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// TCHECK-TARGET-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// TCHECK-TARGET-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// TCHECK-TARGET-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// TCHECK-TARGET-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// TCHECK-TARGET-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// TCHECK-TARGET-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// TCHECK-TARGET-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// TCHECK-TARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// TCHECK-TARGET-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// TCHECK-TARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// TCHECK-TARGET-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// TCHECK-TARGET-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// TCHECK-TARGET: cond.true:
+// TCHECK-TARGET-NEXT: br label [[COND_END:%.*]]
+// TCHECK-TARGET: cond.false:
+// TCHECK-TARGET-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// TCHECK-TARGET-NEXT: br label [[COND_END]]
+// TCHECK-TARGET: cond.end:
+// TCHECK-TARGET-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// TCHECK-TARGET-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// TCHECK-TARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// TCHECK-TARGET-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// TCHECK-TARGET-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// TCHECK-TARGET: omp.inner.for.cond:
+// TCHECK-TARGET-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// TCHECK-TARGET-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// TCHECK-TARGET-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// TCHECK-TARGET-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// TCHECK-TARGET: omp.inner.for.body:
+// TCHECK-TARGET-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// TCHECK-TARGET-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// TCHECK-TARGET-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// TCHECK-TARGET-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// TCHECK-TARGET-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// TCHECK-TARGET-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1
+// TCHECK-TARGET-NEXT: store i32 [[INC]], ptr [[A_ADDR]], align 4
+// TCHECK-TARGET-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// TCHECK-TARGET: omp.body.continue:
+// TCHECK-TARGET-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// TCHECK-TARGET: omp.inner.for.inc:
+// TCHECK-TARGET-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// TCHECK-TARGET-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP9]], 1
+// TCHECK-TARGET-NEXT: store i32 [[ADD2]], ptr [[DOTOMP_IV]], align 4
+// TCHECK-TARGET-NEXT: br label [[OMP_INNER_FOR_COND]]
+// TCHECK-TARGET: omp.inner.for.end:
+// TCHECK-TARGET-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// TCHECK-TARGET: omp.loop.exit:
+// TCHECK-TARGET-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// TCHECK-TARGET-NEXT: ret void
+//
+//
+// TCHECK-TARGET-X86-LABEL: define weak_odr protected void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l42
+// TCHECK-TARGET-X86-SAME: (i32 noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// TCHECK-TARGET-X86-NEXT: entry:
+// TCHECK-TARGET-X86-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-X86-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-X86-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// TCHECK-TARGET-X86-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// TCHECK-TARGET-X86-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// TCHECK-TARGET-X86-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// TCHECK-TARGET-X86-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2:[0-9]+]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l42.omp_outlined, i32 [[TMP1]])
+// TCHECK-TARGET-X86-NEXT: ret void
+//
+//
+// TCHECK-TARGET-X86-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l42.omp_outlined
+// TCHECK-TARGET-X86-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// TCHECK-TARGET-X86-NEXT: entry:
+// TCHECK-TARGET-X86-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// TCHECK-TARGET-X86-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// TCHECK-TARGET-X86-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-X86-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-X86-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-X86-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-X86-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-X86-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-X86-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-X86-NEXT: [[I:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-X86-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// TCHECK-TARGET-X86-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// TCHECK-TARGET-X86-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// TCHECK-TARGET-X86-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// TCHECK-TARGET-X86-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// TCHECK-TARGET-X86-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// TCHECK-TARGET-X86-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// TCHECK-TARGET-X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// TCHECK-TARGET-X86-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// TCHECK-TARGET-X86-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// TCHECK-TARGET-X86-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// TCHECK-TARGET-X86-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// TCHECK-TARGET-X86-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// TCHECK-TARGET-X86: cond.true:
+// TCHECK-TARGET-X86-NEXT: br label [[COND_END:%.*]]
+// TCHECK-TARGET-X86: cond.false:
+// TCHECK-TARGET-X86-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// TCHECK-TARGET-X86-NEXT: br label [[COND_END]]
+// TCHECK-TARGET-X86: cond.end:
+// TCHECK-TARGET-X86-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// TCHECK-TARGET-X86-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// TCHECK-TARGET-X86-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// TCHECK-TARGET-X86-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// TCHECK-TARGET-X86-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// TCHECK-TARGET-X86: omp.inner.for.cond:
+// TCHECK-TARGET-X86-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// TCHECK-TARGET-X86-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// TCHECK-TARGET-X86-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// TCHECK-TARGET-X86-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// TCHECK-TARGET-X86: omp.inner.for.body:
+// TCHECK-TARGET-X86-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// TCHECK-TARGET-X86-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// TCHECK-TARGET-X86-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// TCHECK-TARGET-X86-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// TCHECK-TARGET-X86-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// TCHECK-TARGET-X86-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1
+// TCHECK-TARGET-X86-NEXT: store i32 [[INC]], ptr [[A_ADDR]], align 4
+// TCHECK-TARGET-X86-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// TCHECK-TARGET-X86: omp.body.continue:
+// TCHECK-TARGET-X86-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// TCHECK-TARGET-X86: omp.inner.for.inc:
+// TCHECK-TARGET-X86-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// TCHECK-TARGET-X86-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP9]], 1
+// TCHECK-TARGET-X86-NEXT: store i32 [[ADD2]], ptr [[DOTOMP_IV]], align 4
+// TCHECK-TARGET-X86-NEXT: br label [[OMP_INNER_FOR_COND]]
+// TCHECK-TARGET-X86: omp.inner.for.end:
+// TCHECK-TARGET-X86-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// TCHECK-TARGET-X86: omp.loop.exit:
+// TCHECK-TARGET-X86-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// TCHECK-TARGET-X86-NEXT: ret void
+//
+//
+// TCHECK-TARGET-X86-LABEL: define weak_odr protected void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l49
+// TCHECK-TARGET-X86-SAME: (i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// TCHECK-TARGET-X86-NEXT: entry:
+// TCHECK-TARGET-X86-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-X86-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-X86-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// TCHECK-TARGET-X86-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// TCHECK-TARGET-X86-NEXT: store i32 [[TMP0]], ptr [[A_CASTED]], align 4
+// TCHECK-TARGET-X86-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// TCHECK-TARGET-X86-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l49.omp_outlined, i32 [[TMP1]])
+// TCHECK-TARGET-X86-NEXT: ret void
+//
+//
+// TCHECK-TARGET-X86-LABEL: define internal void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z6nestedi_l49.omp_outlined
+// TCHECK-TARGET-X86-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]]) #[[ATTR1]] {
+// TCHECK-TARGET-X86-NEXT: entry:
+// TCHECK-TARGET-X86-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// TCHECK-TARGET-X86-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// TCHECK-TARGET-X86-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-X86-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-X86-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-X86-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-X86-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-X86-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-X86-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-X86-NEXT: [[I:%.*]] = alloca i32, align 4
+// TCHECK-TARGET-X86-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// TCHECK-TARGET-X86-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// TCHECK-TARGET-X86-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// TCHECK-TARGET-X86-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// TCHECK-TARGET-X86-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// TCHECK-TARGET-X86-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// TCHECK-TARGET-X86-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// TCHECK-TARGET-X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// TCHECK-TARGET-X86-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// TCHECK-TARGET-X86-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// TCHECK-TARGET-X86-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// TCHECK-TARGET-X86-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// TCHECK-TARGET-X86-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// TCHECK-TARGET-X86: cond.true:
+// TCHECK-TARGET-X86-NEXT: br label [[COND_END:%.*]]
+// TCHECK-TARGET-X86: cond.false:
+// TCHECK-TARGET-X86-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// TCHECK-TARGET-X86-NEXT: br label [[COND_END]]
+// TCHECK-TARGET-X86: cond.end:
+// TCHECK-TARGET-X86-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// TCHECK-TARGET-X86-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// TCHECK-TARGET-X86-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// TCHECK-TARGET-X86-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// TCHECK-TARGET-X86-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// TCHECK-TARGET-X86: omp.inner.for.cond:
+// TCHECK-TARGET-X86-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// TCHECK-TARGET-X86-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// TCHECK-TARGET-X86-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// TCHECK-TARGET-X86-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// TCHECK-TARGET-X86: omp.inner.for.body:
+// TCHECK-TARGET-X86-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// TCHECK-TARGET-X86-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// TCHECK-TARGET-X86-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// TCHECK-TARGET-X86-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// TCHECK-TARGET-X86-NEXT: [[TMP8:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// TCHECK-TARGET-X86-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1
+// TCHECK-TARGET-X86-NEXT: store i32 [[INC]], ptr [[A_ADDR]], align 4
+// TCHECK-TARGET-X86-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// TCHECK-TARGET-X86: omp.body.continue:
+// TCHECK-TARGET-X86-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// TCHECK-TARGET-X86: omp.inner.for.inc:
+// TCHECK-TARGET-X86-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// TCHECK-TARGET-X86-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP9]], 1
+// TCHECK-TARGET-X86-NEXT: store i32 [[ADD2]], ptr [[DOTOMP_IV]], align 4
+// TCHECK-TARGET-X86-NEXT: br label [[OMP_INNER_FOR_COND]]
+// TCHECK-TARGET-X86: omp.inner.for.end:
+// TCHECK-TARGET-X86-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// TCHECK-TARGET-X86: omp.loop.exit:
+// TCHECK-TARGET-X86-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP1]])
+// TCHECK-TARGET-X86-NEXT: ret void
+//
+//
+// SIMD-ONLY1-TARGET-LABEL: define dso_local noundef signext i32 @_Z6nestedi
+// SIMD-ONLY1-TARGET-SAME: (i32 noundef signext [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// SIMD-ONLY1-TARGET-NEXT: entry:
+// SIMD-ONLY1-TARGET-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// SIMD-ONLY1-TARGET-NEXT: [[I:%.*]] = alloca i32, align 4
+// SIMD-ONLY1-TARGET-NEXT: [[F:%.*]] = alloca [[CLASS_ANON:%.*]], align 8
+// SIMD-ONLY1-TARGET-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// SIMD-ONLY1-TARGET-NEXT: store i32 0, ptr [[I]], align 4
+// SIMD-ONLY1-TARGET-NEXT: br label [[FOR_COND:%.*]]
+// SIMD-ONLY1-TARGET: for.cond:
+// SIMD-ONLY1-TARGET-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY1-TARGET-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 10
+// SIMD-ONLY1-TARGET-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// SIMD-ONLY1-TARGET: for.body:
+// SIMD-ONLY1-TARGET-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// SIMD-ONLY1-TARGET-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1
+// SIMD-ONLY1-TARGET-NEXT: store i32 [[INC]], ptr [[A_ADDR]], align 4
+// SIMD-ONLY1-TARGET-NEXT: br label [[FOR_INC:%.*]]
+// SIMD-ONLY1-TARGET: for.inc:
+// SIMD-ONLY1-TARGET-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY1-TARGET-NEXT: [[INC1:%.*]] = add nsw i32 [[TMP2]], 1
+// SIMD-ONLY1-TARGET-NEXT: store i32 [[INC1]], ptr [[I]], align 4
+// SIMD-ONLY1-TARGET-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP2:![0-9]+]]
+// SIMD-ONLY1-TARGET: for.end:
+// SIMD-ONLY1-TARGET-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[F]], i32 0, i32 0
+// SIMD-ONLY1-TARGET-NEXT: store ptr [[A_ADDR]], ptr [[TMP3]], align 8
+// SIMD-ONLY1-TARGET-NEXT: call void @"_ZZ6nestediENK3$_0clEv"(ptr noundef nonnull align 8 dereferenceable(8) [[F]])
+// SIMD-ONLY1-TARGET-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// SIMD-ONLY1-TARGET-NEXT: ret i32 [[TMP4]]
+//
+//
+// SIMD-ONLY1-TARGET-X86-LABEL: define dso_local noundef i32 @_Z6nestedi
+// SIMD-ONLY1-TARGET-X86-SAME: (i32 noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// SIMD-ONLY1-TARGET-X86-NEXT: entry:
+// SIMD-ONLY1-TARGET-X86-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// SIMD-ONLY1-TARGET-X86-NEXT: [[I:%.*]] = alloca i32, align 4
+// SIMD-ONLY1-TARGET-X86-NEXT: [[F:%.*]] = alloca [[CLASS_ANON:%.*]], align 4
+// SIMD-ONLY1-TARGET-X86-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// SIMD-ONLY1-TARGET-X86-NEXT: store i32 0, ptr [[I]], align 4
+// SIMD-ONLY1-TARGET-X86-NEXT: br label [[FOR_COND:%.*]]
+// SIMD-ONLY1-TARGET-X86: for.cond:
+// SIMD-ONLY1-TARGET-X86-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY1-TARGET-X86-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 10
+// SIMD-ONLY1-TARGET-X86-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// SIMD-ONLY1-TARGET-X86: for.body:
+// SIMD-ONLY1-TARGET-X86-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// SIMD-ONLY1-TARGET-X86-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1
+// SIMD-ONLY1-TARGET-X86-NEXT: store i32 [[INC]], ptr [[A_ADDR]], align 4
+// SIMD-ONLY1-TARGET-X86-NEXT: br label [[FOR_INC:%.*]]
+// SIMD-ONLY1-TARGET-X86: for.inc:
+// SIMD-ONLY1-TARGET-X86-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// SIMD-ONLY1-TARGET-X86-NEXT: [[INC1:%.*]] = add nsw i32 [[TMP2]], 1
+// SIMD-ONLY1-TARGET-X86-NEXT: store i32 [[INC1]], ptr [[I]], align 4
+// SIMD-ONLY1-TARGET-X86-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
+// SIMD-ONLY1-TARGET-X86: for.end:
+// SIMD-ONLY1-TARGET-X86-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[F]], i32 0, i32 0
+// SIMD-ONLY1-TARGET-X86-NEXT: store ptr [[A_ADDR]], ptr [[TMP3]], align 4
+// SIMD-ONLY1-TARGET-X86-NEXT: call void @"_ZZ6nestediENK3$_0clEv"(ptr noundef nonnull align 4 dereferenceable(4) [[F]])
+// SIMD-ONLY1-TARGET-X86-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// SIMD-ONLY1-TARGET-X86-NEXT: ret i32 [[TMP4]]
+//
diff --git a/clang/test/OpenMP/target_parallel_generic_loop_codegen-3.cpp b/clang/test/OpenMP/target_parallel_generic_loop_codegen-3.cpp
new file mode 100644
index 00000000000000..30004cabea84a9
--- /dev/null
+++ b/clang/test/OpenMP/target_parallel_generic_loop_codegen-3.cpp
@@ -0,0 +1,921 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp-cuda-mode -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp-cuda-mode -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -debug-info-kind=limited | FileCheck %s --check-prefix=CHECK1
+// expected-no-diagnostics
+
+int main() {
+ /* int(*b)[a]; */
+ /* int *(**c)[a]; */
+ bool bb;
+ int a;
+ int b[10][10];
+ int c[10][10][10];
+#pragma omp target parallel loop firstprivate(a, b) map(tofrom \
+ : c) map(tofrom \
+ : bb) if (a)
+ for (int i = 0; i < 10; ++i) {
+ int &f = c[1][1][1];
+ int &g = a;
+ int &h = b[1][1];
+ int d = 15;
+ a = 5;
+ b[0][a] = 10;
+ c[0][0][a] = 11;
+ b[0][a] = c[0][0][a];
+ bb |= b[0][a];
+ }
+#pragma omp target parallel loop firstprivate(a) map(tofrom \
+ : c, b) map(to \
+ : bb)
+ for (int i = 0; i < 10; ++i) {
+ int &f = c[1][1][1];
+ int &g = a;
+ int &h = b[1][1];
+ int d = 15;
+ a = 5;
+ b[0][a] = 10;
+ c[0][0][a] = 11;
+ b[0][a] = c[0][0][a];
+ d = bb;
+ }
+#pragma omp target parallel loop map(tofrom \
+ : a, c, b) map(from \
+ : bb)
+ for (int i = 0; i < 10; ++i) {
+ int &f = c[1][1][1];
+ int &g = a;
+ int &h = b[1][1];
+ int d = 15;
+ a = 5;
+ b[0][a] = 10;
+ c[0][0][a] = 11;
+ b[0][a] = c[0][0][a];
+ bb = b[0][a];
+ }
+ return 0;
+}
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug__
+// CHECK1-SAME: (ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]], i1 noundef zeroext [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG13:![0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr addrspace(1), align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i8, align 1
+// CHECK1-NEXT: [[TMP:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[_TMP2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8
+// CHECK1-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META31:![0-9]+]], metadata !DIExpression()), !dbg [[DBG32:![0-9]+]]
+// CHECK1-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META33:![0-9]+]], metadata !DIExpression()), !dbg [[DBG34:![0-9]+]]
+// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META35:![0-9]+]], metadata !DIExpression()), !dbg [[DBG36:![0-9]+]]
+// CHECK1-NEXT: store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META37:![0-9]+]], metadata !DIExpression()), !dbg [[DBG38:![0-9]+]]
+// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[DOTCAPTURE_EXPR_]] to i8
+// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 1
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTCAPTURE_EXPR__ADDR]], metadata [[META39:![0-9]+]], metadata !DIExpression()), !dbg [[DBG40:![0-9]+]]
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG41:![0-9]+]]
+// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG41]]
+// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG41]]
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG41]]
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG41]]
+// CHECK1-NEXT: store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG41]]
+// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG41]]
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG41]]
+// CHECK1-NEXT: [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG41]]
+// CHECK1-NEXT: store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG41]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG41]]
+// CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false), !dbg [[DBG41]]
+// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP8]], -1, !dbg [[DBG41]]
+// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG41]]
+// CHECK1: user_code.entry:
+// CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB6:[0-9]+]])
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG42:![0-9]+]]
+// CHECK1-NEXT: store i32 [[TMP10]], ptr [[A_CASTED]], align 4, !dbg [[DBG42]]
+// CHECK1-NEXT: [[TMP11:%.*]] = load i64, ptr [[A_CASTED]], align 8, !dbg [[DBG42]]
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG42]]
+// CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP12]], align 8, !dbg [[DBG42]]
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG42]]
+// CHECK1-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG42]]
+// CHECK1-NEXT: store ptr [[TMP14]], ptr [[TMP13]], align 8, !dbg [[DBG42]]
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG42]]
+// CHECK1-NEXT: store ptr [[TMP4]], ptr [[TMP15]], align 8, !dbg [[DBG42]]
+// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG42]]
+// CHECK1-NEXT: store ptr [[TMP7]], ptr [[TMP16]], align 8, !dbg [[DBG42]]
+// CHECK1-NEXT: [[TMP17:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !dbg [[DBG43:![0-9]+]]
+// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP17]] to i1, !dbg [[DBG43]]
+// CHECK1-NEXT: [[TMP18:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG42]]
+// CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB6]], i32 [[TMP9]], i32 [[TMP18]], i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG42]]
+// CHECK1-NEXT: call void @__kmpc_target_deinit(ptr @[[GLOB8:[0-9]+]], i8 2), !dbg [[DBG45:![0-9]+]]
+// CHECK1-NEXT: ret void, !dbg [[DBG46:![0-9]+]]
+// CHECK1: worker.exit:
+// CHECK1-NEXT: ret void, !dbg [[DBG41]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined_debug__
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG47:![0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr addrspace(1), align 8
+// CHECK1-NEXT: [[TMP:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[_TMP2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP3:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[B4:%.*]] = alloca [10 x [10 x i32]], align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[F:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[G:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[H:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[D:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META54:![0-9]+]], metadata !DIExpression()), !dbg [[DBG55:![0-9]+]]
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META56:![0-9]+]], metadata !DIExpression()), !dbg [[DBG55]]
+// CHECK1-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META57:![0-9]+]], metadata !DIExpression()), !dbg [[DBG58:![0-9]+]]
+// CHECK1-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META59:![0-9]+]], metadata !DIExpression()), !dbg [[DBG60:![0-9]+]]
+// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META61:![0-9]+]], metadata !DIExpression()), !dbg [[DBG62:![0-9]+]]
+// CHECK1-NEXT: store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META63:![0-9]+]], metadata !DIExpression()), !dbg [[DBG64:![0-9]+]]
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG65:![0-9]+]]
+// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG65]]
+// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG65]]
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG65]]
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG65]]
+// CHECK1-NEXT: store ptr [[TMP3]], ptr [[_TMP1]], align 8, !dbg [[DBG65]]
+// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG65]]
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG65]]
+// CHECK1-NEXT: [[TMP6:%.*]] = addrspacecast ptr addrspace(1) [[TMP5]] to ptr, !dbg [[DBG65]]
+// CHECK1-NEXT: store ptr [[TMP6]], ptr [[_TMP2]], align 8, !dbg [[DBG65]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG65]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META66:![0-9]+]], metadata !DIExpression()), !dbg [[DBG55]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META67:![0-9]+]], metadata !DIExpression()), !dbg [[DBG55]]
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG68:![0-9]+]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META69:![0-9]+]], metadata !DIExpression()), !dbg [[DBG55]]
+// CHECK1-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG68]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata [[META70:![0-9]+]], metadata !DIExpression()), !dbg [[DBG55]]
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG68]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META71:![0-9]+]], metadata !DIExpression()), !dbg [[DBG55]]
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG68]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B4]], metadata [[META72:![0-9]+]], metadata !DIExpression()), !dbg [[DBG55]]
+// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[B4]], ptr align 4 [[TMP4]], i64 400, i1 false), !dbg [[DBG65]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META73:![0-9]+]], metadata !DIExpression()), !dbg [[DBG55]]
+// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG65]]
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !dbg [[DBG65]]
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP9]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG74:![0-9]+]]
+// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG65]]
+// CHECK1: omp.dispatch.cond:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG68]]
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP10]], 9, !dbg [[DBG68]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG68]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]], !dbg [[DBG68]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG68]]
+// CHECK1-NEXT: br label [[COND_END]], !dbg [[DBG68]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ], !dbg [[DBG68]]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG68]]
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG68]]
+// CHECK1-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG68]]
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG68]]
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG68]]
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]], !dbg [[DBG65]]
+// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG65]]
+// CHECK1: omp.dispatch.body:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG65]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG68]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG68]]
+// CHECK1-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]], !dbg [[DBG65]]
+// CHECK1-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG65]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG68]]
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP17]], 1, !dbg [[DBG75:![0-9]+]]
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG75]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG75]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META76:![0-9]+]], metadata !DIExpression()), !dbg [[DBG79:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG80:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG80]]
+// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG80]]
+// CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[F]], align 8, !dbg [[DBG79]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META81:![0-9]+]], metadata !DIExpression()), !dbg [[DBG82:![0-9]+]]
+// CHECK1-NEXT: store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[DBG82]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META83:![0-9]+]], metadata !DIExpression()), !dbg [[DBG84:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 1, !dbg [[DBG85:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX9]], i64 0, i64 1, !dbg [[DBG85]]
+// CHECK1-NEXT: store ptr [[ARRAYIDX10]], ptr [[H]], align 8, !dbg [[DBG84]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META86:![0-9]+]], metadata !DIExpression()), !dbg [[DBG87:![0-9]+]]
+// CHECK1-NEXT: store i32 15, ptr [[D]], align 4, !dbg [[DBG87]]
+// CHECK1-NEXT: store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG88:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG89:![0-9]+]]
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG90:![0-9]+]]
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP18]] to i64, !dbg [[DBG89]]
+// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM]], !dbg [[DBG89]]
+// CHECK1-NEXT: store i32 10, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG91:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG92:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG92]]
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG93:![0-9]+]]
+// CHECK1-NEXT: [[IDXPROM15:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG92]]
+// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG92]]
+// CHECK1-NEXT: store i32 11, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG94:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG95:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX17]], i64 0, i64 0, !dbg [[DBG95]]
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG96:![0-9]+]]
+// CHECK1-NEXT: [[IDXPROM19:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG95]]
+// CHECK1-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG95]]
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4, !dbg [[DBG95]]
+// CHECK1-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG97:![0-9]+]]
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG98:![0-9]+]]
+// CHECK1-NEXT: [[IDXPROM22:%.*]] = sext i32 [[TMP22]] to i64, !dbg [[DBG97]]
+// CHECK1-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG97]]
+// CHECK1-NEXT: store i32 [[TMP21]], ptr [[ARRAYIDX23]], align 4, !dbg [[DBG99:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[B4]], i64 0, i64 0, !dbg [[DBG100:![0-9]+]]
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG101:![0-9]+]]
+// CHECK1-NEXT: [[IDXPROM25:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG100]]
+// CHECK1-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX24]], i64 0, i64 [[IDXPROM25]], !dbg [[DBG100]]
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4, !dbg [[DBG100]]
+// CHECK1-NEXT: [[TMP25:%.*]] = load i8, ptr [[TMP7]], align 1, !dbg [[DBG102:![0-9]+]]
+// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP25]] to i1, !dbg [[DBG102]]
+// CHECK1-NEXT: [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG102]]
+// CHECK1-NEXT: [[OR:%.*]] = or i32 [[CONV]], [[TMP24]], !dbg [[DBG102]]
+// CHECK1-NEXT: [[TOBOOL27:%.*]] = icmp ne i32 [[OR]], 0, !dbg [[DBG102]]
+// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL27]] to i8, !dbg [[DBG102]]
+// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[TMP7]], align 1, !dbg [[DBG102]]
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG103:![0-9]+]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG74]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG68]]
+// CHECK1-NEXT: [[ADD28:%.*]] = add nsw i32 [[TMP26]], 1, !dbg [[DBG65]]
+// CHECK1-NEXT: store i32 [[ADD28]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG65]]
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !dbg [[DBG74]], !llvm.loop [[LOOP104:![0-9]+]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG74]]
+// CHECK1: omp.dispatch.inc:
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG68]]
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG68]]
+// CHECK1-NEXT: [[ADD29:%.*]] = add nsw i32 [[TMP27]], [[TMP28]], !dbg [[DBG65]]
+// CHECK1-NEXT: store i32 [[ADD29]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG65]]
+// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG68]]
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG68]]
+// CHECK1-NEXT: [[ADD30:%.*]] = add nsw i32 [[TMP29]], [[TMP30]], !dbg [[DBG65]]
+// CHECK1-NEXT: store i32 [[ADD30]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG65]]
+// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]], !dbg [[DBG74]], !llvm.loop [[LOOP106:![0-9]+]]
+// CHECK1: omp.dispatch.end:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB5:[0-9]+]], i32 [[TMP9]]), !dbg [[DBG105:![0-9]+]]
+// CHECK1-NEXT: ret void, !dbg [[DBG107:![0-9]+]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG108:![0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META115:![0-9]+]], metadata !DIExpression()), !dbg [[DBG116:![0-9]+]]
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META117:![0-9]+]], metadata !DIExpression()), !dbg [[DBG116]]
+// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META118:![0-9]+]], metadata !DIExpression()), !dbg [[DBG116]]
+// CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META119:![0-9]+]], metadata !DIExpression()), !dbg [[DBG116]]
+// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META120:![0-9]+]], metadata !DIExpression()), !dbg [[DBG116]]
+// CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META121:![0-9]+]], metadata !DIExpression()), !dbg [[DBG116]]
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG122:![0-9]+]]
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG122]]
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG122]]
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG122]]
+// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG122]]
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG122]]
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG122]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG122]]
+// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG122]]
+// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG122]]
+// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG122]]
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr [[TMP7]], ptr addrspace(1) [[TMP10]]) #[[ATTR3:[0-9]+]], !dbg [[DBG122]]
+// CHECK1-NEXT: ret void, !dbg [[DBG122]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR5:[0-9]+]] !dbg [[DBG123:![0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META126:![0-9]+]], metadata !DIExpression()), !dbg [[DBG127:![0-9]+]]
+// CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META128:![0-9]+]], metadata !DIExpression()), !dbg [[DBG127]]
+// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META129:![0-9]+]], metadata !DIExpression()), !dbg [[DBG127]]
+// CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META130:![0-9]+]], metadata !DIExpression()), !dbg [[DBG127]]
+// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTCAPTURE_EXPR__ADDR]], metadata [[META131:![0-9]+]], metadata !DIExpression()), !dbg [[DBG127]]
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG132:![0-9]+]]
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG132]]
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG132]]
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG132]]
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG132]]
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG132]]
+// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG132]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1, !dbg [[DBG132]]
+// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP7]] to i1, !dbg [[DBG132]]
+// CHECK1-NEXT: [[TMP8:%.*]] = addrspacecast ptr [[TMP3]] to ptr addrspace(1), !dbg [[DBG132]]
+// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG132]]
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l13_debug__(ptr addrspace(1) [[TMP8]], i32 [[TMP4]], ptr [[TMP5]], ptr addrspace(1) [[TMP9]], i1 [[TOBOOL]]) #[[ATTR3]], !dbg [[DBG132]]
+// CHECK1-NEXT: ret void, !dbg [[DBG132]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug__
+// CHECK1-SAME: (ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG133:![0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr addrspace(1), align 8
+// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr addrspace(1), align 8
+// CHECK1-NEXT: [[TMP:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[_TMP2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8
+// CHECK1-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META138:![0-9]+]], metadata !DIExpression()), !dbg [[DBG139:![0-9]+]]
+// CHECK1-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META140:![0-9]+]], metadata !DIExpression()), !dbg [[DBG141:![0-9]+]]
+// CHECK1-NEXT: store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META142:![0-9]+]], metadata !DIExpression()), !dbg [[DBG143:![0-9]+]]
+// CHECK1-NEXT: store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META144:![0-9]+]], metadata !DIExpression()), !dbg [[DBG145:![0-9]+]]
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG146:![0-9]+]]
+// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG146]]
+// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG146]]
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG146]]
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG146]]
+// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG146]]
+// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG146]]
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG146]]
+// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG146]]
+// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG146]]
+// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG146]]
+// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG146]]
+// CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB10:[0-9]+]], i8 2, i1 false), !dbg [[DBG146]]
+// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP9]], -1, !dbg [[DBG146]]
+// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG146]]
+// CHECK1: user_code.entry:
+// CHECK1-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB15:[0-9]+]])
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG147:![0-9]+]]
+// CHECK1-NEXT: store i32 [[TMP11]], ptr [[A_CASTED]], align 4, !dbg [[DBG147]]
+// CHECK1-NEXT: [[TMP12:%.*]] = load i64, ptr [[A_CASTED]], align 8, !dbg [[DBG147]]
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG147]]
+// CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP13]], align 8, !dbg [[DBG147]]
+// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG147]]
+// CHECK1-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP12]] to ptr, !dbg [[DBG147]]
+// CHECK1-NEXT: store ptr [[TMP15]], ptr [[TMP14]], align 8, !dbg [[DBG147]]
+// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG147]]
+// CHECK1-NEXT: store ptr [[TMP5]], ptr [[TMP16]], align 8, !dbg [[DBG147]]
+// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG147]]
+// CHECK1-NEXT: store ptr [[TMP8]], ptr [[TMP17]], align 8, !dbg [[DBG147]]
+// CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB15]], i32 [[TMP10]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG147]]
+// CHECK1-NEXT: call void @__kmpc_target_deinit(ptr @[[GLOB17:[0-9]+]], i8 2), !dbg [[DBG148:![0-9]+]]
+// CHECK1-NEXT: ret void, !dbg [[DBG150:![0-9]+]]
+// CHECK1: worker.exit:
+// CHECK1-NEXT: ret void, !dbg [[DBG146]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined_debug__
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], i32 noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG151:![0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr addrspace(1), align 8
+// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr addrspace(1), align 8
+// CHECK1-NEXT: [[TMP:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[_TMP2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP3:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[F:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[G:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[H:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[D:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META154:![0-9]+]], metadata !DIExpression()), !dbg [[DBG155:![0-9]+]]
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META156:![0-9]+]], metadata !DIExpression()), !dbg [[DBG155]]
+// CHECK1-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META157:![0-9]+]], metadata !DIExpression()), !dbg [[DBG158:![0-9]+]]
+// CHECK1-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META159:![0-9]+]], metadata !DIExpression()), !dbg [[DBG160:![0-9]+]]
+// CHECK1-NEXT: store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META161:![0-9]+]], metadata !DIExpression()), !dbg [[DBG162:![0-9]+]]
+// CHECK1-NEXT: store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META163:![0-9]+]], metadata !DIExpression()), !dbg [[DBG164:![0-9]+]]
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG165:![0-9]+]]
+// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG165]]
+// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG165]]
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG165]]
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG165]]
+// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG165]]
+// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG165]]
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG165]]
+// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG165]]
+// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG165]]
+// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG165]]
+// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG165]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META166:![0-9]+]], metadata !DIExpression()), !dbg [[DBG155]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META167:![0-9]+]], metadata !DIExpression()), !dbg [[DBG155]]
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG168:![0-9]+]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META169:![0-9]+]], metadata !DIExpression()), !dbg [[DBG155]]
+// CHECK1-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG168]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata [[META170:![0-9]+]], metadata !DIExpression()), !dbg [[DBG155]]
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG168]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META171:![0-9]+]], metadata !DIExpression()), !dbg [[DBG155]]
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG168]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META172:![0-9]+]], metadata !DIExpression()), !dbg [[DBG155]]
+// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG165]]
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !dbg [[DBG165]]
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB12:[0-9]+]], i32 [[TMP10]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG173:![0-9]+]]
+// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG165]]
+// CHECK1: omp.dispatch.cond:
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG168]]
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP11]], 9, !dbg [[DBG168]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG168]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]], !dbg [[DBG168]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG168]]
+// CHECK1-NEXT: br label [[COND_END]], !dbg [[DBG168]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ], !dbg [[DBG168]]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG168]]
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG168]]
+// CHECK1-NEXT: store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG168]]
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG168]]
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG168]]
+// CHECK1-NEXT: [[CMP4:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]], !dbg [[DBG165]]
+// CHECK1-NEXT: br i1 [[CMP4]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG165]]
+// CHECK1: omp.dispatch.body:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG165]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG168]]
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG168]]
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]], !dbg [[DBG165]]
+// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG165]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG168]]
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1, !dbg [[DBG174:![0-9]+]]
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG174]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG174]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META175:![0-9]+]], metadata !DIExpression()), !dbg [[DBG177:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG178:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG178]]
+// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX6]], i64 0, i64 1, !dbg [[DBG178]]
+// CHECK1-NEXT: store ptr [[ARRAYIDX7]], ptr [[F]], align 8, !dbg [[DBG177]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META179:![0-9]+]], metadata !DIExpression()), !dbg [[DBG180:![0-9]+]]
+// CHECK1-NEXT: store ptr [[A_ADDR]], ptr [[G]], align 8, !dbg [[DBG180]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META181:![0-9]+]], metadata !DIExpression()), !dbg [[DBG182:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 1, !dbg [[DBG183:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX8]], i64 0, i64 1, !dbg [[DBG183]]
+// CHECK1-NEXT: store ptr [[ARRAYIDX9]], ptr [[H]], align 8, !dbg [[DBG182]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META184:![0-9]+]], metadata !DIExpression()), !dbg [[DBG185:![0-9]+]]
+// CHECK1-NEXT: store i32 15, ptr [[D]], align 4, !dbg [[DBG185]]
+// CHECK1-NEXT: store i32 5, ptr [[A_ADDR]], align 4, !dbg [[DBG186:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG187:![0-9]+]]
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG188:![0-9]+]]
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64, !dbg [[DBG187]]
+// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX10]], i64 0, i64 [[IDXPROM]], !dbg [[DBG187]]
+// CHECK1-NEXT: store i32 10, ptr [[ARRAYIDX11]], align 4, !dbg [[DBG189:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG190:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX12]], i64 0, i64 0, !dbg [[DBG190]]
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG191:![0-9]+]]
+// CHECK1-NEXT: [[IDXPROM14:%.*]] = sext i32 [[TMP20]] to i64, !dbg [[DBG190]]
+// CHECK1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX13]], i64 0, i64 [[IDXPROM14]], !dbg [[DBG190]]
+// CHECK1-NEXT: store i32 11, ptr [[ARRAYIDX15]], align 4, !dbg [[DBG192:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG193:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX16]], i64 0, i64 0, !dbg [[DBG193]]
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG194:![0-9]+]]
+// CHECK1-NEXT: [[IDXPROM18:%.*]] = sext i32 [[TMP21]] to i64, !dbg [[DBG193]]
+// CHECK1-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX17]], i64 0, i64 [[IDXPROM18]], !dbg [[DBG193]]
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX19]], align 4, !dbg [[DBG193]]
+// CHECK1-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP5]], i64 0, i64 0, !dbg [[DBG195:![0-9]+]]
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG196:![0-9]+]]
+// CHECK1-NEXT: [[IDXPROM21:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG195]]
+// CHECK1-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX20]], i64 0, i64 [[IDXPROM21]], !dbg [[DBG195]]
+// CHECK1-NEXT: store i32 [[TMP22]], ptr [[ARRAYIDX22]], align 4, !dbg [[DBG197:![0-9]+]]
+// CHECK1-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP8]], align 1, !dbg [[DBG198:![0-9]+]]
+// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP24]] to i1, !dbg [[DBG198]]
+// CHECK1-NEXT: [[CONV:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG198]]
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[D]], align 4, !dbg [[DBG199:![0-9]+]]
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG200:![0-9]+]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG173]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG168]]
+// CHECK1-NEXT: [[ADD23:%.*]] = add nsw i32 [[TMP25]], 1, !dbg [[DBG165]]
+// CHECK1-NEXT: store i32 [[ADD23]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG165]]
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !dbg [[DBG173]], !llvm.loop [[LOOP201:![0-9]+]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG173]]
+// CHECK1: omp.dispatch.inc:
+// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG168]]
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG168]]
+// CHECK1-NEXT: [[ADD24:%.*]] = add nsw i32 [[TMP26]], [[TMP27]], !dbg [[DBG165]]
+// CHECK1-NEXT: store i32 [[ADD24]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG165]]
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG168]]
+// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG168]]
+// CHECK1-NEXT: [[ADD25:%.*]] = add nsw i32 [[TMP28]], [[TMP29]], !dbg [[DBG165]]
+// CHECK1-NEXT: store i32 [[ADD25]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG165]]
+// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]], !dbg [[DBG173]], !llvm.loop [[LOOP203:![0-9]+]]
+// CHECK1: omp.dispatch.end:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB14:[0-9]+]], i32 [[TMP10]]), !dbg [[DBG202:![0-9]+]]
+// CHECK1-NEXT: ret void, !dbg [[DBG204:![0-9]+]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG205:![0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META206:![0-9]+]], metadata !DIExpression()), !dbg [[DBG207:![0-9]+]]
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META208:![0-9]+]], metadata !DIExpression()), !dbg [[DBG207]]
+// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META209:![0-9]+]], metadata !DIExpression()), !dbg [[DBG207]]
+// CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META210:![0-9]+]], metadata !DIExpression()), !dbg [[DBG207]]
+// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META211:![0-9]+]], metadata !DIExpression()), !dbg [[DBG207]]
+// CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META212:![0-9]+]], metadata !DIExpression()), !dbg [[DBG207]]
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG213:![0-9]+]]
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG213]]
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG213]]
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG213]]
+// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG213]]
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG213]]
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG213]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG213]]
+// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG213]]
+// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG213]]
+// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG213]]
+// CHECK1-NEXT: [[TMP11:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG213]]
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug___omp_outlined_debug__(ptr [[TMP3]], ptr [[TMP4]], ptr addrspace(1) [[TMP9]], i32 [[TMP6]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]]) #[[ATTR3]], !dbg [[DBG213]]
+// CHECK1-NEXT: ret void, !dbg [[DBG213]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], i64 noundef [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR5]] !dbg [[DBG214:![0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META217:![0-9]+]], metadata !DIExpression()), !dbg [[DBG218:![0-9]+]]
+// CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META219:![0-9]+]], metadata !DIExpression()), !dbg [[DBG218]]
+// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META220:![0-9]+]], metadata !DIExpression()), !dbg [[DBG218]]
+// CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META221:![0-9]+]], metadata !DIExpression()), !dbg [[DBG218]]
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG222:![0-9]+]]
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG222]]
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG222]]
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG222]]
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_ADDR]], align 4, !dbg [[DBG222]]
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG222]]
+// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG222]]
+// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr [[TMP3]] to ptr addrspace(1), !dbg [[DBG222]]
+// CHECK1-NEXT: [[TMP8:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG222]]
+// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG222]]
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l27_debug__(ptr addrspace(1) [[TMP7]], i32 [[TMP4]], ptr addrspace(1) [[TMP8]], ptr addrspace(1) [[TMP9]]) #[[ATTR3]], !dbg [[DBG222]]
+// CHECK1-NEXT: ret void, !dbg [[DBG222]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug__
+// CHECK1-SAME: (ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG223:![0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr addrspace(1), align 8
+// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr addrspace(1), align 8
+// CHECK1-NEXT: [[TMP:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[_TMP2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[_TMP3:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8
+// CHECK1-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META228:![0-9]+]], metadata !DIExpression()), !dbg [[DBG229:![0-9]+]]
+// CHECK1-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META230:![0-9]+]], metadata !DIExpression()), !dbg [[DBG231:![0-9]+]]
+// CHECK1-NEXT: store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META232:![0-9]+]], metadata !DIExpression()), !dbg [[DBG233:![0-9]+]]
+// CHECK1-NEXT: store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META234:![0-9]+]], metadata !DIExpression()), !dbg [[DBG235:![0-9]+]]
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG236:![0-9]+]]
+// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG236]]
+// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG236]]
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG236]]
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG236]]
+// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG236]]
+// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG236]]
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG236]]
+// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG236]]
+// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG236]]
+// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG236]]
+// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG236]]
+// CHECK1-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG236]]
+// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG236]]
+// CHECK1-NEXT: store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG236]]
+// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG236]]
+// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB19:[0-9]+]], i8 2, i1 false), !dbg [[DBG236]]
+// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP12]], -1, !dbg [[DBG236]]
+// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG236]]
+// CHECK1: user_code.entry:
+// CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB24:[0-9]+]])
+// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0, !dbg [[DBG237:![0-9]+]]
+// CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP14]], align 8, !dbg [[DBG237]]
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1, !dbg [[DBG237]]
+// CHECK1-NEXT: store ptr [[TMP5]], ptr [[TMP15]], align 8, !dbg [[DBG237]]
+// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2, !dbg [[DBG237]]
+// CHECK1-NEXT: store ptr [[TMP8]], ptr [[TMP16]], align 8, !dbg [[DBG237]]
+// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 3, !dbg [[DBG237]]
+// CHECK1-NEXT: store ptr [[TMP11]], ptr [[TMP17]], align 8, !dbg [[DBG237]]
+// CHECK1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB24]], i32 [[TMP13]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS]], i64 4), !dbg [[DBG237]]
+// CHECK1-NEXT: call void @__kmpc_target_deinit(ptr @[[GLOB26:[0-9]+]], i8 2), !dbg [[DBG238:![0-9]+]]
+// CHECK1-NEXT: ret void, !dbg [[DBG240:![0-9]+]]
+// CHECK1: worker.exit:
+// CHECK1-NEXT: ret void, !dbg [[DBG236]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined_debug__
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr addrspace(1) noalias noundef [[C:%.*]], ptr addrspace(1) noalias noundef [[A:%.*]], ptr addrspace(1) noalias noundef [[B:%.*]], ptr addrspace(1) noalias noundef [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG241:![0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr addrspace(1), align 8
+// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr addrspace(1), align 8
+// CHECK1-NEXT: [[TMP:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[_TMP2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[_TMP3:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP4:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[F:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[G:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[H:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[D:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META244:![0-9]+]], metadata !DIExpression()), !dbg [[DBG245:![0-9]+]]
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META246:![0-9]+]], metadata !DIExpression()), !dbg [[DBG245]]
+// CHECK1-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META247:![0-9]+]], metadata !DIExpression()), !dbg [[DBG248:![0-9]+]]
+// CHECK1-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META249:![0-9]+]], metadata !DIExpression()), !dbg [[DBG250:![0-9]+]]
+// CHECK1-NEXT: store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META251:![0-9]+]], metadata !DIExpression()), !dbg [[DBG252:![0-9]+]]
+// CHECK1-NEXT: store ptr addrspace(1) [[BB]], ptr [[BB_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META253:![0-9]+]], metadata !DIExpression()), !dbg [[DBG254:![0-9]+]]
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[C_ADDR]], align 8, !dbg [[DBG255:![0-9]+]]
+// CHECK1-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr, !dbg [[DBG255]]
+// CHECK1-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[TMP3]] to ptr, !dbg [[DBG255]]
+// CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(1) [[TMP6]] to ptr, !dbg [[DBG255]]
+// CHECK1-NEXT: store ptr [[TMP7]], ptr [[_TMP2]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP2]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[BB_ADDR]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP9]] to ptr, !dbg [[DBG255]]
+// CHECK1-NEXT: store ptr [[TMP10]], ptr [[_TMP3]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[_TMP3]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IV]], metadata [[META256:![0-9]+]], metadata !DIExpression()), !dbg [[DBG245]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_LB]], metadata [[META257:![0-9]+]], metadata !DIExpression()), !dbg [[DBG245]]
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG258:![0-9]+]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_UB]], metadata [[META259:![0-9]+]], metadata !DIExpression()), !dbg [[DBG245]]
+// CHECK1-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG258]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_STRIDE]], metadata [[META260:![0-9]+]], metadata !DIExpression()), !dbg [[DBG245]]
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG258]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTOMP_IS_LAST]], metadata [[META261:![0-9]+]], metadata !DIExpression()), !dbg [[DBG245]]
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !dbg [[DBG258]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[I]], metadata [[META262:![0-9]+]], metadata !DIExpression()), !dbg [[DBG245]]
+// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG255]]
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !dbg [[DBG255]]
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB21:[0-9]+]], i32 [[TMP13]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1), !dbg [[DBG263:![0-9]+]]
+// CHECK1-NEXT: br label [[OMP_DISPATCH_COND:%.*]], !dbg [[DBG255]]
+// CHECK1: omp.dispatch.cond:
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG258]]
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP14]], 9, !dbg [[DBG258]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]], !dbg [[DBG258]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]], !dbg [[DBG258]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG258]]
+// CHECK1-NEXT: br label [[COND_END]], !dbg [[DBG258]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ], !dbg [[DBG258]]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG258]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG258]]
+// CHECK1-NEXT: store i32 [[TMP16]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG258]]
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG258]]
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG258]]
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP17]], [[TMP18]], !dbg [[DBG255]]
+// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]], !dbg [[DBG255]]
+// CHECK1: omp.dispatch.body:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]], !dbg [[DBG255]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG258]]
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG258]]
+// CHECK1-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP19]], [[TMP20]], !dbg [[DBG255]]
+// CHECK1-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]], !dbg [[DBG255]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG258]]
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP21]], 1, !dbg [[DBG264:![0-9]+]]
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]], !dbg [[DBG264]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !dbg [[DBG264]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[F]], metadata [[META265:![0-9]+]], metadata !DIExpression()), !dbg [[DBG267:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 1, !dbg [[DBG268:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX]], i64 0, i64 1, !dbg [[DBG268]]
+// CHECK1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX7]], i64 0, i64 1, !dbg [[DBG268]]
+// CHECK1-NEXT: store ptr [[ARRAYIDX8]], ptr [[F]], align 8, !dbg [[DBG267]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[G]], metadata [[META269:![0-9]+]], metadata !DIExpression()), !dbg [[DBG270:![0-9]+]]
+// CHECK1-NEXT: store ptr [[TMP5]], ptr [[G]], align 8, !dbg [[DBG270]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[H]], metadata [[META271:![0-9]+]], metadata !DIExpression()), !dbg [[DBG272:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 1, !dbg [[DBG273:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX9]], i64 0, i64 1, !dbg [[DBG273]]
+// CHECK1-NEXT: store ptr [[ARRAYIDX10]], ptr [[H]], align 8, !dbg [[DBG272]]
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[D]], metadata [[META274:![0-9]+]], metadata !DIExpression()), !dbg [[DBG275:![0-9]+]]
+// CHECK1-NEXT: store i32 15, ptr [[D]], align 4, !dbg [[DBG275]]
+// CHECK1-NEXT: store i32 5, ptr [[TMP5]], align 4, !dbg [[DBG276:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG277:![0-9]+]]
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG278:![0-9]+]]
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP22]] to i64, !dbg [[DBG277]]
+// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[IDXPROM]], !dbg [[DBG277]]
+// CHECK1-NEXT: store i32 10, ptr [[ARRAYIDX12]], align 4, !dbg [[DBG279:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG280:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX13]], i64 0, i64 0, !dbg [[DBG280]]
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG281:![0-9]+]]
+// CHECK1-NEXT: [[IDXPROM15:%.*]] = sext i32 [[TMP23]] to i64, !dbg [[DBG280]]
+// CHECK1-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[IDXPROM15]], !dbg [[DBG280]]
+// CHECK1-NEXT: store i32 11, ptr [[ARRAYIDX16]], align 4, !dbg [[DBG282:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [10 x [10 x [10 x i32]]], ptr [[TMP2]], i64 0, i64 0, !dbg [[DBG283:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[ARRAYIDX17]], i64 0, i64 0, !dbg [[DBG283]]
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG284:![0-9]+]]
+// CHECK1-NEXT: [[IDXPROM19:%.*]] = sext i32 [[TMP24]] to i64, !dbg [[DBG283]]
+// CHECK1-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX18]], i64 0, i64 [[IDXPROM19]], !dbg [[DBG283]]
+// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4, !dbg [[DBG283]]
+// CHECK1-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG285:![0-9]+]]
+// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG286:![0-9]+]]
+// CHECK1-NEXT: [[IDXPROM22:%.*]] = sext i32 [[TMP26]] to i64, !dbg [[DBG285]]
+// CHECK1-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX21]], i64 0, i64 [[IDXPROM22]], !dbg [[DBG285]]
+// CHECK1-NEXT: store i32 [[TMP25]], ptr [[ARRAYIDX23]], align 4, !dbg [[DBG287:![0-9]+]]
+// CHECK1-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[TMP8]], i64 0, i64 0, !dbg [[DBG288:![0-9]+]]
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG289:![0-9]+]]
+// CHECK1-NEXT: [[IDXPROM25:%.*]] = sext i32 [[TMP27]] to i64, !dbg [[DBG288]]
+// CHECK1-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX24]], i64 0, i64 [[IDXPROM25]], !dbg [[DBG288]]
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX26]], align 4, !dbg [[DBG288]]
+// CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP28]], 0, !dbg [[DBG288]]
+// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8, !dbg [[DBG290:![0-9]+]]
+// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[TMP11]], align 1, !dbg [[DBG290]]
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]], !dbg [[DBG291:![0-9]+]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]], !dbg [[DBG263]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !dbg [[DBG258]]
+// CHECK1-NEXT: [[ADD27:%.*]] = add nsw i32 [[TMP29]], 1, !dbg [[DBG255]]
+// CHECK1-NEXT: store i32 [[ADD27]], ptr [[DOTOMP_IV]], align 4, !dbg [[DBG255]]
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !dbg [[DBG263]], !llvm.loop [[LOOP292:![0-9]+]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_DISPATCH_INC:%.*]], !dbg [[DBG263]]
+// CHECK1: omp.dispatch.inc:
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !dbg [[DBG258]]
+// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG258]]
+// CHECK1-NEXT: [[ADD28:%.*]] = add nsw i32 [[TMP30]], [[TMP31]], !dbg [[DBG255]]
+// CHECK1-NEXT: store i32 [[ADD28]], ptr [[DOTOMP_LB]], align 4, !dbg [[DBG255]]
+// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !dbg [[DBG258]]
+// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !dbg [[DBG258]]
+// CHECK1-NEXT: [[ADD29:%.*]] = add nsw i32 [[TMP32]], [[TMP33]], !dbg [[DBG255]]
+// CHECK1-NEXT: store i32 [[ADD29]], ptr [[DOTOMP_UB]], align 4, !dbg [[DBG255]]
+// CHECK1-NEXT: br label [[OMP_DISPATCH_COND]], !dbg [[DBG263]], !llvm.loop [[LOOP294:![0-9]+]]
+// CHECK1: omp.dispatch.end:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB23:[0-9]+]], i32 [[TMP13]]), !dbg [[DBG293:![0-9]+]]
+// CHECK1-NEXT: ret void, !dbg [[DBG295:![0-9]+]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR0]] !dbg [[DBG296:![0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTGLOBAL_TID__ADDR]], metadata [[META299:![0-9]+]], metadata !DIExpression()), !dbg [[DBG300:![0-9]+]]
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[DOTBOUND_TID__ADDR]], metadata [[META301:![0-9]+]], metadata !DIExpression()), !dbg [[DBG300]]
+// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META302:![0-9]+]], metadata !DIExpression()), !dbg [[DBG300]]
+// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META303:![0-9]+]], metadata !DIExpression()), !dbg [[DBG300]]
+// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META304:![0-9]+]], metadata !DIExpression()), !dbg [[DBG300]]
+// CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META305:![0-9]+]], metadata !DIExpression()), !dbg [[DBG300]]
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG306:![0-9]+]]
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG306]]
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG306]]
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG306]]
+// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8, !dbg [[DBG306]]
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTBOUND_TID__ADDR]], align 8, !dbg [[DBG306]]
+// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG306]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG306]]
+// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG306]]
+// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG306]]
+// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG306]]
+// CHECK1-NEXT: [[TMP11:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG306]]
+// CHECK1-NEXT: [[TMP12:%.*]] = addrspacecast ptr [[TMP8]] to ptr addrspace(1), !dbg [[DBG306]]
+// CHECK1-NEXT: [[TMP13:%.*]] = addrspacecast ptr [[TMP9]] to ptr addrspace(1), !dbg [[DBG306]]
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug___omp_outlined_debug__(ptr [[TMP4]], ptr [[TMP5]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]], ptr addrspace(1) [[TMP12]], ptr addrspace(1) [[TMP13]]) #[[ATTR3]], !dbg [[DBG306]]
+// CHECK1-NEXT: ret void, !dbg [[DBG306]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4000) [[C:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[B:%.*]], ptr noundef nonnull align 1 dereferenceable(1) [[BB:%.*]]) #[[ATTR5]] !dbg [[DBG307:![0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[BB_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[C_ADDR]], metadata [[META310:![0-9]+]], metadata !DIExpression()), !dbg [[DBG311:![0-9]+]]
+// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[A_ADDR]], metadata [[META312:![0-9]+]], metadata !DIExpression()), !dbg [[DBG311]]
+// CHECK1-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[B_ADDR]], metadata [[META313:![0-9]+]], metadata !DIExpression()), !dbg [[DBG311]]
+// CHECK1-NEXT: store ptr [[BB]], ptr [[BB_ADDR]], align 8
+// CHECK1-NEXT: call void @llvm.dbg.declare(metadata ptr [[BB_ADDR]], metadata [[META314:![0-9]+]], metadata !DIExpression()), !dbg [[DBG311]]
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG315:![0-9]+]]
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG315]]
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG315]]
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG315]]
+// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[C_ADDR]], align 8, !dbg [[DBG315]]
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG315]]
+// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[B_ADDR]], align 8, !dbg [[DBG315]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[BB_ADDR]], align 8, !dbg [[DBG315]]
+// CHECK1-NEXT: [[TMP8:%.*]] = addrspacecast ptr [[TMP4]] to ptr addrspace(1), !dbg [[DBG315]]
+// CHECK1-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[TMP5]] to ptr addrspace(1), !dbg [[DBG315]]
+// CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast ptr [[TMP6]] to ptr addrspace(1), !dbg [[DBG315]]
+// CHECK1-NEXT: [[TMP11:%.*]] = addrspacecast ptr [[TMP7]] to ptr addrspace(1), !dbg [[DBG315]]
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l41_debug__(ptr addrspace(1) [[TMP8]], ptr addrspace(1) [[TMP9]], ptr addrspace(1) [[TMP10]], ptr addrspace(1) [[TMP11]]) #[[ATTR3]], !dbg [[DBG315]]
+// CHECK1-NEXT: ret void, !dbg [[DBG315]]
+//
diff --git a/clang/test/OpenMP/target_parallel_generic_loop_codegen.cpp b/clang/test/OpenMP/target_parallel_generic_loop_codegen.cpp
new file mode 100644
index 00000000000000..43285f17fd87ac
--- /dev/null
+++ b/clang/test/OpenMP/target_parallel_generic_loop_codegen.cpp
@@ -0,0 +1,387 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=IR-GPU
+
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -emit-llvm %s -o - | FileCheck %s --check-prefix=IR
+
+// Check same results after serialization round-trip
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -emit-pch -o %t %s
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR-PCH
+
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+typedef void **omp_allocator_handle_t;
+extern const omp_allocator_handle_t omp_null_allocator;
+extern const omp_allocator_handle_t omp_default_mem_alloc;
+extern const omp_allocator_handle_t omp_large_cap_mem_alloc;
+extern const omp_allocator_handle_t omp_const_mem_alloc;
+extern const omp_allocator_handle_t omp_high_bw_mem_alloc;
+extern const omp_allocator_handle_t omp_low_lat_mem_alloc;
+extern const omp_allocator_handle_t omp_cgroup_mem_alloc;
+extern const omp_allocator_handle_t omp_pteam_mem_alloc;
+extern const omp_allocator_handle_t omp_thread_mem_alloc;
+
+extern int omp_get_thread_num(void);
+
+#define N 64
+
+int main() {
+ int x = 0;
+ int device_result[N] = {0};
+
+ #pragma omp target parallel loop num_threads(N) uses_allocators(omp_pteam_mem_alloc) allocate(omp_pteam_mem_alloc: x) private(x) map(from: device_result)
+ for (int i = 0; i < N; i++) {
+ x = omp_get_thread_num();
+ device_result[i] = i + x;
+ }
+}
+#endif
+// IR-GPU-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37
+// IR-GPU-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[DEVICE_RESULT:%.*]], ptr noundef [[OMP_PTEAM_MEM_ALLOC:%.*]]) #[[ATTR0:[0-9]+]] {
+// IR-GPU-NEXT: entry:
+// IR-GPU-NEXT: [[DEVICE_RESULT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// IR-GPU-NEXT: [[OMP_PTEAM_MEM_ALLOC_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// IR-GPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x ptr], align 8, addrspace(5)
+// IR-GPU-NEXT: [[DEVICE_RESULT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DEVICE_RESULT_ADDR]] to ptr
+// IR-GPU-NEXT: [[OMP_PTEAM_MEM_ALLOC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OMP_PTEAM_MEM_ALLOC_ADDR]] to ptr
+// IR-GPU-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
+// IR-GPU-NEXT: store ptr [[DEVICE_RESULT]], ptr [[DEVICE_RESULT_ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: store ptr [[OMP_PTEAM_MEM_ALLOC]], ptr [[OMP_PTEAM_MEM_ALLOC_ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DEVICE_RESULT_ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), i8 2, i1 false)
+// IR-GPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// IR-GPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// IR-GPU: user_code.entry:
+// IR-GPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
+// IR-GPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[OMP_PTEAM_MEM_ALLOC_ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0
+// IR-GPU-NEXT: store ptr [[TMP0]], ptr [[TMP4]], align 8
+// IR-GPU-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 1
+// IR-GPU-NEXT: store ptr [[TMP3]], ptr [[TMP5]], align 8
+// IR-GPU-NEXT: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP2]], i32 1, i32 64, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 2)
+// IR-GPU-NEXT: call void @__kmpc_target_deinit(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 2)
+// IR-GPU-NEXT: ret void
+// IR-GPU: worker.exit:
+// IR-GPU-NEXT: ret void
+//
+//
+// IR-GPU-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37_omp_outlined
+// IR-GPU-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(256) [[DEVICE_RESULT:%.*]], ptr noundef [[OMP_PTEAM_MEM_ALLOC:%.*]]) #[[ATTR1:[0-9]+]] {
+// IR-GPU-NEXT: entry:
+// IR-GPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// IR-GPU-NEXT: [[DEVICE_RESULT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// IR-GPU-NEXT: [[OMP_PTEAM_MEM_ALLOC_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[TMP:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
+// IR-GPU-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
+// IR-GPU-NEXT: [[DEVICE_RESULT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DEVICE_RESULT_ADDR]] to ptr
+// IR-GPU-NEXT: [[OMP_PTEAM_MEM_ALLOC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OMP_PTEAM_MEM_ALLOC_ADDR]] to ptr
+// IR-GPU-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr
+// IR-GPU-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
+// IR-GPU-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr
+// IR-GPU-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr
+// IR-GPU-NEXT: [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr
+// IR-GPU-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr
+// IR-GPU-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
+// IR-GPU-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: store ptr [[DEVICE_RESULT]], ptr [[DEVICE_RESULT_ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: store ptr [[OMP_PTEAM_MEM_ALLOC]], ptr [[OMP_PTEAM_MEM_ALLOC_ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DEVICE_RESULT_ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4
+// IR-GPU-NEXT: store i32 63, ptr [[DOTOMP_UB_ASCAST]], align 4
+// IR-GPU-NEXT: store i32 1, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
+// IR-GPU-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// IR-GPU-NEXT: call void @__kmpc_for_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), i32 [[TMP2]], i32 33, ptr [[DOTOMP_IS_LAST_ASCAST]], ptr [[DOTOMP_LB_ASCAST]], ptr [[DOTOMP_UB_ASCAST]], ptr [[DOTOMP_STRIDE_ASCAST]], i32 1, i32 1)
+// IR-GPU-NEXT: br label [[OMP_DISPATCH_COND:%.*]]
+// IR-GPU: omp.dispatch.cond:
+// IR-GPU-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4
+// IR-GPU-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 63
+// IR-GPU-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// IR-GPU: cond.true:
+// IR-GPU-NEXT: br label [[COND_END:%.*]]
+// IR-GPU: cond.false:
+// IR-GPU-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4
+// IR-GPU-NEXT: br label [[COND_END]]
+// IR-GPU: cond.end:
+// IR-GPU-NEXT: [[COND:%.*]] = phi i32 [ 63, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// IR-GPU-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4
+// IR-GPU-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4
+// IR-GPU-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// IR-GPU-NEXT: br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
+// IR-GPU: omp.dispatch.body:
+// IR-GPU-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// IR-GPU: omp.inner.for.cond:
+// IR-GPU-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4
+// IR-GPU-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// IR-GPU-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// IR-GPU: omp.inner.for.body:
+// IR-GPU-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4
+// IR-GPU-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// IR-GPU-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// IR-GPU-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4
+// IR-GPU-NEXT: [[CALL:%.*]] = call noundef i32 @_Z18omp_get_thread_numv() #[[ATTR5:[0-9]+]]
+// IR-GPU-NEXT: store i32 [[CALL]], ptr addrspacecast (ptr addrspace(3) @x to ptr), align 4
+// IR-GPU-NEXT: [[TMP11:%.*]] = load i32, ptr [[I_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP12:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @x to ptr), align 4
+// IR-GPU-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// IR-GPU-NEXT: [[TMP13:%.*]] = load i32, ptr [[I_ASCAST]], align 4
+// IR-GPU-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
+// IR-GPU-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [64 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// IR-GPU-NEXT: store i32 [[ADD3]], ptr [[ARRAYIDX]], align 4
+// IR-GPU-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// IR-GPU: omp.body.continue:
+// IR-GPU-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// IR-GPU: omp.inner.for.inc:
+// IR-GPU-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4
+// IR-GPU-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP14]], 1
+// IR-GPU-NEXT: store i32 [[ADD4]], ptr [[DOTOMP_IV_ASCAST]], align 4
+// IR-GPU-NEXT: br label [[OMP_INNER_FOR_COND]]
+// IR-GPU: omp.inner.for.end:
+// IR-GPU-NEXT: br label [[OMP_DISPATCH_INC:%.*]]
+// IR-GPU: omp.dispatch.inc:
+// IR-GPU-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
+// IR-GPU-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// IR-GPU-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_LB_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
+// IR-GPU-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// IR-GPU-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_UB_ASCAST]], align 4
+// IR-GPU-NEXT: br label [[OMP_DISPATCH_COND]]
+// IR-GPU: omp.dispatch.end:
+// IR-GPU-NEXT: call void @__kmpc_for_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB2]] to ptr), i32 [[TMP2]])
+// IR-GPU-NEXT: ret void
+//
+//
+// IR-LABEL: define {{[^@]+}}@main
+// IR-SAME: () #[[ATTR0:[0-9]+]] {
+// IR-NEXT: entry:
+// IR-NEXT: [[X:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DEVICE_RESULT:%.*]] = alloca [64 x i32], align 16
+// IR-NEXT: store i32 0, ptr [[X]], align 4
+// IR-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[DEVICE_RESULT]], i8 0, i64 256, i1 false)
+// IR-NEXT: [[TMP0:%.*]] = load ptr, ptr @omp_pteam_mem_alloc, align 8
+// IR-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37(ptr [[DEVICE_RESULT]], ptr [[TMP0]]) #[[ATTR3:[0-9]+]]
+// IR-NEXT: ret i32 0
+//
+//
+// IR-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37
+// IR-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[DEVICE_RESULT:%.*]], ptr noundef [[OMP_PTEAM_MEM_ALLOC:%.*]]) #[[ATTR2:[0-9]+]] {
+// IR-NEXT: entry:
+// IR-NEXT: [[DEVICE_RESULT_ADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[OMP_PTEAM_MEM_ALLOC_ADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2:[0-9]+]])
+// IR-NEXT: store ptr [[DEVICE_RESULT]], ptr [[DEVICE_RESULT_ADDR]], align 8
+// IR-NEXT: store ptr [[OMP_PTEAM_MEM_ALLOC]], ptr [[OMP_PTEAM_MEM_ALLOC_ADDR]], align 8
+// IR-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DEVICE_RESULT_ADDR]], align 8
+// IR-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB2]], i32 [[TMP0]], i32 64)
+// IR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OMP_PTEAM_MEM_ALLOC_ADDR]], align 8
+// IR-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37.omp_outlined, ptr [[TMP1]], ptr [[TMP2]])
+// IR-NEXT: ret void
+//
+//
+// IR-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37.omp_outlined
+// IR-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(256) [[DEVICE_RESULT:%.*]], ptr noundef [[OMP_PTEAM_MEM_ALLOC:%.*]]) #[[ATTR2]] {
+// IR-NEXT: entry:
+// IR-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[DEVICE_RESULT_ADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[OMP_PTEAM_MEM_ALLOC_ADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// IR-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// IR-NEXT: [[I:%.*]] = alloca i32, align 4
+// IR-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// IR-NEXT: store ptr [[DEVICE_RESULT]], ptr [[DEVICE_RESULT_ADDR]], align 8
+// IR-NEXT: store ptr [[OMP_PTEAM_MEM_ALLOC]], ptr [[OMP_PTEAM_MEM_ALLOC_ADDR]], align 8
+// IR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DEVICE_RESULT_ADDR]], align 8
+// IR-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// IR-NEXT: store i32 63, ptr [[DOTOMP_UB]], align 4
+// IR-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// IR-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// IR-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// IR-NEXT: [[TMP3:%.*]] = load ptr, ptr @omp_pteam_mem_alloc, align 8
+// IR-NEXT: [[DOTX__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP2]], i64 4, ptr [[TMP3]])
+// IR-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP2]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// IR-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// IR-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 63
+// IR-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// IR: cond.true:
+// IR-NEXT: br label [[COND_END:%.*]]
+// IR: cond.false:
+// IR-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// IR-NEXT: br label [[COND_END]]
+// IR: cond.end:
+// IR-NEXT: [[COND:%.*]] = phi i32 [ 63, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// IR-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// IR-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// IR-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// IR-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// IR: omp.inner.for.cond:
+// IR-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// IR-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// IR-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// IR: omp.inner.for.cond.cleanup:
+// IR-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// IR: omp.inner.for.body:
+// IR-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// IR-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// IR-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// IR-NEXT: [[CALL:%.*]] = call noundef i32 @_Z18omp_get_thread_numv()
+// IR-NEXT: store i32 [[CALL]], ptr [[DOTX__VOID_ADDR]], align 4
+// IR-NEXT: [[TMP10:%.*]] = load i32, ptr [[I]], align 4
+// IR-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTX__VOID_ADDR]], align 4
+// IR-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP10]], [[TMP11]]
+// IR-NEXT: [[TMP12:%.*]] = load i32, ptr [[I]], align 4
+// IR-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP12]] to i64
+// IR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [64 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// IR-NEXT: store i32 [[ADD2]], ptr [[ARRAYIDX]], align 4
+// IR-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// IR: omp.body.continue:
+// IR-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// IR: omp.inner.for.inc:
+// IR-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP13]], 1
+// IR-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// IR-NEXT: br label [[OMP_INNER_FOR_COND]]
+// IR: omp.inner.for.end:
+// IR-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// IR: omp.loop.exit:
+// IR-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]])
+// IR-NEXT: [[TMP14:%.*]] = load ptr, ptr @omp_pteam_mem_alloc, align 8
+// IR-NEXT: call void @__kmpc_free(i32 [[TMP2]], ptr [[DOTX__VOID_ADDR]], ptr [[TMP14]])
+// IR-NEXT: ret void
+//
+//
+// IR-PCH-LABEL: define {{[^@]+}}@main
+// IR-PCH-SAME: () #[[ATTR0:[0-9]+]] {
+// IR-PCH-NEXT: entry:
+// IR-PCH-NEXT: [[X:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DEVICE_RESULT:%.*]] = alloca [64 x i32], align 16
+// IR-PCH-NEXT: store i32 0, ptr [[X]], align 4
+// IR-PCH-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[DEVICE_RESULT]], i8 0, i64 256, i1 false)
+// IR-PCH-NEXT: [[TMP0:%.*]] = load ptr, ptr @omp_pteam_mem_alloc, align 8
+// IR-PCH-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37(ptr [[DEVICE_RESULT]], ptr [[TMP0]]) #[[ATTR3:[0-9]+]]
+// IR-PCH-NEXT: ret i32 0
+//
+//
+// IR-PCH-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37
+// IR-PCH-SAME: (ptr noundef nonnull align 4 dereferenceable(256) [[DEVICE_RESULT:%.*]], ptr noundef [[OMP_PTEAM_MEM_ALLOC:%.*]]) #[[ATTR2:[0-9]+]] {
+// IR-PCH-NEXT: entry:
+// IR-PCH-NEXT: [[DEVICE_RESULT_ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[OMP_PTEAM_MEM_ALLOC_ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2:[0-9]+]])
+// IR-PCH-NEXT: store ptr [[DEVICE_RESULT]], ptr [[DEVICE_RESULT_ADDR]], align 8
+// IR-PCH-NEXT: store ptr [[OMP_PTEAM_MEM_ALLOC]], ptr [[OMP_PTEAM_MEM_ALLOC_ADDR]], align 8
+// IR-PCH-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DEVICE_RESULT_ADDR]], align 8
+// IR-PCH-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB2]], i32 [[TMP0]], i32 64)
+// IR-PCH-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OMP_PTEAM_MEM_ALLOC_ADDR]], align 8
+// IR-PCH-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37.omp_outlined, ptr [[TMP1]], ptr [[TMP2]])
+// IR-PCH-NEXT: ret void
+//
+//
+// IR-PCH-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l37.omp_outlined
+// IR-PCH-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(256) [[DEVICE_RESULT:%.*]], ptr noundef [[OMP_PTEAM_MEM_ALLOC:%.*]]) #[[ATTR2]] {
+// IR-PCH-NEXT: entry:
+// IR-PCH-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[DEVICE_RESULT_ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[OMP_PTEAM_MEM_ALLOC_ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[I:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// IR-PCH-NEXT: store ptr [[DEVICE_RESULT]], ptr [[DEVICE_RESULT_ADDR]], align 8
+// IR-PCH-NEXT: store ptr [[OMP_PTEAM_MEM_ALLOC]], ptr [[OMP_PTEAM_MEM_ALLOC_ADDR]], align 8
+// IR-PCH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DEVICE_RESULT_ADDR]], align 8
+// IR-PCH-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// IR-PCH-NEXT: store i32 63, ptr [[DOTOMP_UB]], align 4
+// IR-PCH-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// IR-PCH-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// IR-PCH-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// IR-PCH-NEXT: [[TMP3:%.*]] = load ptr, ptr @omp_pteam_mem_alloc, align 8
+// IR-PCH-NEXT: [[DOTX__VOID_ADDR:%.*]] = call ptr @__kmpc_alloc(i32 [[TMP2]], i64 4, ptr [[TMP3]])
+// IR-PCH-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP2]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// IR-PCH-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// IR-PCH-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 63
+// IR-PCH-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// IR-PCH: cond.true:
+// IR-PCH-NEXT: br label [[COND_END:%.*]]
+// IR-PCH: cond.false:
+// IR-PCH-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// IR-PCH-NEXT: br label [[COND_END]]
+// IR-PCH: cond.end:
+// IR-PCH-NEXT: [[COND:%.*]] = phi i32 [ 63, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// IR-PCH-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// IR-PCH-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// IR-PCH-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// IR-PCH-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// IR-PCH: omp.inner.for.cond:
+// IR-PCH-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-PCH-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// IR-PCH-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// IR-PCH-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// IR-PCH: omp.inner.for.cond.cleanup:
+// IR-PCH-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// IR-PCH: omp.inner.for.body:
+// IR-PCH-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-PCH-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// IR-PCH-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// IR-PCH-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// IR-PCH-NEXT: [[CALL:%.*]] = call noundef i32 @_Z18omp_get_thread_numv()
+// IR-PCH-NEXT: store i32 [[CALL]], ptr [[DOTX__VOID_ADDR]], align 4
+// IR-PCH-NEXT: [[TMP10:%.*]] = load i32, ptr [[I]], align 4
+// IR-PCH-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTX__VOID_ADDR]], align 4
+// IR-PCH-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP10]], [[TMP11]]
+// IR-PCH-NEXT: [[TMP12:%.*]] = load i32, ptr [[I]], align 4
+// IR-PCH-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP12]] to i64
+// IR-PCH-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [64 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// IR-PCH-NEXT: store i32 [[ADD2]], ptr [[ARRAYIDX]], align 4
+// IR-PCH-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// IR-PCH: omp.body.continue:
+// IR-PCH-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// IR-PCH: omp.inner.for.inc:
+// IR-PCH-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-PCH-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP13]], 1
+// IR-PCH-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// IR-PCH-NEXT: br label [[OMP_INNER_FOR_COND]]
+// IR-PCH: omp.inner.for.end:
+// IR-PCH-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// IR-PCH: omp.loop.exit:
+// IR-PCH-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]])
+// IR-PCH-NEXT: [[TMP14:%.*]] = load ptr, ptr @omp_pteam_mem_alloc, align 8
+// IR-PCH-NEXT: call void @__kmpc_free(i32 [[TMP2]], ptr [[DOTX__VOID_ADDR]], ptr [[TMP14]])
+// IR-PCH-NEXT: ret void
+//
diff --git a/clang/test/OpenMP/target_parallel_generic_loop_depend_codegen.cpp b/clang/test/OpenMP/target_parallel_generic_loop_depend_codegen.cpp
new file mode 100644
index 00000000000000..2ebee51e516d68
--- /dev/null
+++ b/clang/test/OpenMP/target_parallel_generic_loop_depend_codegen.cpp
@@ -0,0 +1,210 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// Test host codegen.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+
+
+
+
+// Check target registration is registered as a Ctor.
+
+
+template<typename tx, typename ty>
+struct TT{
+ tx X;
+ ty Y;
+};
+
+int global;
+extern int global;
+
+int foo(int n) {
+ int a = 0;
+ short aa = 0;
+ float b[10];
+ float bn[n];
+ double c[5][10];
+ double cn[5][n];
+ TT<long long, char> d;
+ static long *plocal;
+
+ #pragma omp target parallel loop device(global + a) depend(in: global) depend(out: a, b, cn[4])
+ for (int i = 0; i < 10; ++i) {
+ }
+
+
+
+
+
+
+
+ #pragma omp target parallel loop device(global + a) nowait depend(inout: global, a, bn) if(target:a)
+ for (int i = 0; i < *plocal; ++i) {
+ static int local1;
+ *plocal = global;
+ local1 = global;
+ }
+
+ #pragma omp target parallel loop if(0) firstprivate(global) depend(out:global)
+ for (int i = 0; i < global; ++i) {
+ global += 1;
+ }
+
+ return a;
+}
+
+// Check that the offloading functions are emitted and that the arguments are
+// correct and loaded correctly for the target regions in foo().
+
+
+
+
+
+// CHECK-64: [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64: store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32: store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+
+// CHECK-64: [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64: store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32: store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+
+// Create stack storage and store argument in there.
+// CHECK-64: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i32*
+// CHECK-64: load i32, i32* [[AA_CADDR]], align
+// CHECK-32: load i32, i32* [[AA_ADDR]], align
+
+// CHECK-64: [[BP1_CAST:%.+]] = bitcast i[[SZ]]* [[BP1_PTR:%.+]] to i32*
+// CHECK-64: store i32 [[BP1_I32]], i32* [[BP1_CAST]],
+// CHECK-32: store i32 [[BP1_I32]], i32* [[BP1_PTR:%.+]],
+
+
+#endif
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l66
+// CHECK-SAME: () #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2:[0-9]+]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l66.omp_outlined)
+// CHECK-NEXT: ret void
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK-SAME: () #[[ATTR8:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK-NEXT: ret void
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+// TCHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l66
+// TCHECK-SAME: () #[[ATTR0:[0-9]+]] {
+// TCHECK-NEXT: entry:
+// TCHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2:[0-9]+]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l66.omp_outlined)
+// TCHECK-NEXT: ret void
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+// SIMD-ONLY0: {{.*}}
+// SIMD-ONLY1: {{.*}}
diff --git a/clang/test/OpenMP/target_parallel_generic_loop_uses_allocators_codegen.cpp b/clang/test/OpenMP/target_parallel_generic_loop_uses_allocators_codegen.cpp
new file mode 100644
index 00000000000000..364797e5a91aa3
--- /dev/null
+++ b/clang/test/OpenMP/target_parallel_generic_loop_uses_allocators_codegen.cpp
@@ -0,0 +1,211 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// Test host codegen.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+enum omp_allocator_handle_t {
+ omp_null_allocator = 0,
+ omp_default_mem_alloc = 1,
+ omp_large_cap_mem_alloc = 2,
+ omp_const_mem_alloc = 3,
+ omp_high_bw_mem_alloc = 4,
+ omp_low_lat_mem_alloc = 5,
+ omp_cgroup_mem_alloc = 6,
+ omp_pteam_mem_alloc = 7,
+ omp_thread_mem_alloc = 8,
+ KMP_ALLOCATOR_MAX_HANDLE = __UINTPTR_MAX__
+};
+
+typedef enum omp_alloctrait_key_t { omp_atk_sync_hint = 1,
+ omp_atk_alignment = 2,
+ omp_atk_access = 3,
+ omp_atk_pool_size = 4,
+ omp_atk_fallback = 5,
+ omp_atk_fb_data = 6,
+ omp_atk_pinned = 7,
+ omp_atk_partition = 8
+} omp_alloctrait_key_t;
+typedef enum omp_alloctrait_value_t {
+ omp_atv_false = 0,
+ omp_atv_true = 1,
+ omp_atv_default = 2,
+ omp_atv_contended = 3,
+ omp_atv_uncontended = 4,
+ omp_atv_sequential = 5,
+ omp_atv_private = 6,
+ omp_atv_all = 7,
+ omp_atv_thread = 8,
+ omp_atv_pteam = 9,
+ omp_atv_cgroup = 10,
+ omp_atv_default_mem_fb = 11,
+ omp_atv_null_fb = 12,
+ omp_atv_abort_fb = 13,
+ omp_atv_allocator_fb = 14,
+ omp_atv_environment = 15,
+ omp_atv_nearest = 16,
+ omp_atv_blocked = 17,
+ omp_atv_interleaved = 18
+} omp_alloctrait_value_t;
+
+typedef struct omp_alloctrait_t {
+ omp_alloctrait_key_t key;
+ __UINTPTR_TYPE__ value;
+} omp_alloctrait_t;
+
+// Just map the traits variable as a firstprivate variable.
+
+void foo() {
+ omp_alloctrait_t traits[10];
+ omp_allocator_handle_t my_allocator;
+
+#pragma omp target parallel loop uses_allocators(omp_null_allocator, omp_thread_mem_alloc, my_allocator(traits))
+ for (int i = 0; i < 10; ++i)
+ ;
+}
+
+
+// Destroy allocator upon exit from the region.
+
+#endif
+// CHECK-LABEL: define {{[^@]+}}@_Z3foov
+// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TRAITS:%.*]] = alloca [10 x %struct.omp_alloctrait_t], align 8
+// CHECK-NEXT: [[MY_ALLOCATOR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
+// CHECK-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK-NEXT: store ptr [[TRAITS]], ptr [[TMP0]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK-NEXT: store ptr [[TRAITS]], ptr [[TMP1]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK-NEXT: store ptr null, ptr [[TMP2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK-NEXT: store i32 2, ptr [[TMP5]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK-NEXT: store i32 1, ptr [[TMP6]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 8
+// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8
+// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK-NEXT: store ptr @.offload_sizes, ptr [[TMP9]], align 8
+// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK-NEXT: store ptr @.offload_maptypes, ptr [[TMP10]], align 8
+// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK-NEXT: store ptr null, ptr [[TMP11]], align 8
+// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK-NEXT: store ptr null, ptr [[TMP12]], align 8
+// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK-NEXT: store i64 0, ptr [[TMP13]], align 8
+// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK-NEXT: store i64 0, ptr [[TMP14]], align 8
+// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP15]], align 4
+// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4
+// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK-NEXT: store i32 0, ptr [[TMP17]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l66.region_id, ptr [[KERNEL_ARGS]])
+// CHECK-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK: omp_offload.failed:
+// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l66(ptr [[TRAITS]]) #[[ATTR2:[0-9]+]]
+// CHECK-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK: omp_offload.cont:
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l66
+// CHECK-SAME: (ptr noundef nonnull align 8 dereferenceable(160) [[TRAITS:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TRAITS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[MY_ALLOCATOR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-NEXT: store ptr [[TRAITS]], ptr [[TRAITS_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TRAITS_ADDR]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_init_allocator(i32 [[TMP0]], ptr null, i32 10, ptr [[TMP1]])
+// CHECK-NEXT: [[CONV:%.*]] = ptrtoint ptr [[TMP2]] to i64
+// CHECK-NEXT: store i64 [[CONV]], ptr [[MY_ALLOCATOR]], align 8
+// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l66.omp_outlined)
+// CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[MY_ALLOCATOR]], align 8
+// CHECK-NEXT: [[CONV1:%.*]] = inttoptr i64 [[TMP3]] to ptr
+// CHECK-NEXT: call void @__kmpc_destroy_allocator(i32 [[TMP0]], ptr [[CONV1]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l66.omp_outlined
+// CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP1]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK: cond.true:
+// CHECK-NEXT: br label [[COND_END:%.*]]
+// CHECK: cond.false:
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: br label [[COND_END]]
+// CHECK: cond.end:
+// CHECK-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK: omp.inner.for.cond:
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK: omp.inner.for.body:
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK: omp.body.continue:
+// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK: omp.inner.for.inc:
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP8]], 1
+// CHECK-NEXT: store i32 [[ADD2]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK: omp.inner.for.end:
+// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK: omp.loop.exit:
+// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK-SAME: () #[[ATTR4:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK-NEXT: ret void
+//
diff --git a/clang/test/OpenMP/target_teams_generic_loop_codegen-1.cpp b/clang/test/OpenMP/target_teams_generic_loop_codegen-1.cpp
new file mode 100644
index 00000000000000..d7cb906134a41c
--- /dev/null
+++ b/clang/test/OpenMP/target_teams_generic_loop_codegen-1.cpp
@@ -0,0 +1,2664 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+// Test host codegen.
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK2
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK2
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK4
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK4
+
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+
+// Test target codegen - host bc file has to be created first. (no significant
+// differences with host version of target region)
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK10
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK10
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK12
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK12
+
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+
+#ifdef CK1
+
+
+int target_teams_fun(int *g){
+ int n = 1000;
+ int a[1000];
+ int te = n / 128;
+ int th = 128;
+ // discard n_addr
+ // discard capture expressions for te and th
+
+ #pragma omp target teams loop num_teams(te), thread_limit(th)
+ for(int i = 0; i < n; i++) {
+ a[i] = 0;
+ }
+
+ {{{
+ #pragma omp target teams loop is_device_ptr(g)
+ for(int i = 0; i < n; i++) {
+ a[i] = g[0];
+ }
+ }}}
+
+ // outlined target regions
+
+
+
+
+ return a[0];
+}
+
+#endif // CK1
+#endif // HEADER
+// CHECK1-LABEL: define {{[^@]+}}@_Z16target_teams_funPi
+// CHECK1-SAME: (ptr noundef [[G:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[N:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[A:%.*]] = alloca [1000 x i32], align 4
+// CHECK1-NEXT: [[TE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TH:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[N_CASTED3:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8
+// CHECK1-NEXT: store i32 1000, ptr [[N]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[N]], align 4
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP0]], 128
+// CHECK1-NEXT: store i32 [[DIV]], ptr [[TE]], align 4
+// CHECK1-NEXT: store i32 128, ptr [[TH]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TE]], align 4
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[TH]], align 4
+// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[N]], align 4
+// CHECK1-NEXT: store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 [[TMP7]], ptr [[DOTCAPTURE_EXPR__CASTED2]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED2]], align 8
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51(i64 [[TMP4]], ptr [[A]], i64 [[TMP6]], i64 [[TMP8]]) #[[ATTR2:[0-9]+]]
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[N]], align 4
+// CHECK1-NEXT: store i32 [[TMP9]], ptr [[N_CASTED3]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = load i64, ptr [[N_CASTED3]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57(i64 [[TMP10]], ptr [[A]], ptr [[TMP11]]) #[[ATTR2]]
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[A]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK1-NEXT: ret i32 [[TMP12]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51
+// CHECK1-SAME: (i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]])
+// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR2]], align 4
+// CHECK1-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP2]], i32 [[TMP3]])
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[N_CASTED]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51.omp_outlined, i64 [[TMP5]], ptr [[TMP1]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP18]], ptr [[N_CASTED]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51.omp_outlined.omp_outlined, i64 [[TMP15]], i64 [[TMP17]], i64 [[TMP19]], ptr [[TMP0]])
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP22:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP23]])
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK1-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP8]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+// CHECK1-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK1-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I4]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[I4]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP17]] to i64
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK1-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP18]], 1
+// CHECK1-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP20]])
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57
+// CHECK1-SAME: (i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], ptr noundef [[G:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[N_CASTED]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57.omp_outlined, i64 [[TMP2]], ptr [[TMP0]], ptr [[TMP3]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], ptr noundef [[G:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP6]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP18]], ptr [[N_CASTED]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK1-NEXT: [[TMP20:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 5, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57.omp_outlined.omp_outlined, i64 [[TMP15]], i64 [[TMP17]], i64 [[TMP19]], ptr [[TMP0]], ptr [[TMP20]])
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP23:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP24]])
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], ptr noundef [[G:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK1-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP8]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+// CHECK1-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK1-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I4]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 0
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[I4]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64
+// CHECK1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK1-NEXT: store i32 [[TMP18]], ptr [[ARRAYIDX7]], align 4
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP20]], 1
+// CHECK1-NEXT: store i32 [[ADD8]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP22]])
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@_Z16target_teams_funPi
+// CHECK2-SAME: (ptr noundef [[G:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[N:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[A:%.*]] = alloca [1000 x i32], align 4
+// CHECK2-NEXT: [[TE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TH:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR__CASTED2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [4 x ptr], align 8
+// CHECK2-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [4 x ptr], align 8
+// CHECK2-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [4 x ptr], align 8
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK2-NEXT: [[N_CASTED7:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOFFLOAD_BASEPTRS8:%.*]] = alloca [3 x ptr], align 8
+// CHECK2-NEXT: [[DOTOFFLOAD_PTRS9:%.*]] = alloca [3 x ptr], align 8
+// CHECK2-NEXT: [[DOTOFFLOAD_MAPPERS10:%.*]] = alloca [3 x ptr], align 8
+// CHECK2-NEXT: [[_TMP11:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_12:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_13:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[KERNEL_ARGS18:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
+// CHECK2-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8
+// CHECK2-NEXT: store i32 1000, ptr [[N]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[N]], align 4
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP0]], 128
+// CHECK2-NEXT: store i32 [[DIV]], ptr [[TE]], align 4
+// CHECK2-NEXT: store i32 128, ptr [[TH]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[TE]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[TH]], align 4
+// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[N]], align 4
+// CHECK2-NEXT: store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 [[TMP7]], ptr [[DOTCAPTURE_EXPR__CASTED2]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED2]], align 8
+// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK2-NEXT: store i64 [[TMP4]], ptr [[TMP9]], align 8
+// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK2-NEXT: store i64 [[TMP4]], ptr [[TMP10]], align 8
+// CHECK2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK2-NEXT: store ptr null, ptr [[TMP11]], align 8
+// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK2-NEXT: store ptr [[A]], ptr [[TMP12]], align 8
+// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK2-NEXT: store ptr [[A]], ptr [[TMP13]], align 8
+// CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
+// CHECK2-NEXT: store ptr null, ptr [[TMP14]], align 8
+// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK2-NEXT: store i64 [[TMP6]], ptr [[TMP15]], align 8
+// CHECK2-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK2-NEXT: store i64 [[TMP6]], ptr [[TMP16]], align 8
+// CHECK2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
+// CHECK2-NEXT: store ptr null, ptr [[TMP17]], align 8
+// CHECK2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 3
+// CHECK2-NEXT: store i64 [[TMP8]], ptr [[TMP18]], align 8
+// CHECK2-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 3
+// CHECK2-NEXT: store i64 [[TMP8]], ptr [[TMP19]], align 8
+// CHECK2-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 3
+// CHECK2-NEXT: store ptr null, ptr [[TMP20]], align 8
+// CHECK2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[TMP24:%.*]] = load i32, ptr [[N]], align 4
+// CHECK2-NEXT: store i32 [[TMP24]], ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK2-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP25]], 0
+// CHECK2-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[SUB6:%.*]] = sub nsw i32 [[DIV5]], 1
+// CHECK2-NEXT: store i32 [[SUB6]], ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK2-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP26]], 1
+// CHECK2-NEXT: [[TMP27:%.*]] = zext i32 [[ADD]] to i64
+// CHECK2-NEXT: [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP23]], 0
+// CHECK2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK2-NEXT: store i32 2, ptr [[TMP29]], align 4
+// CHECK2-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK2-NEXT: store i32 4, ptr [[TMP30]], align 4
+// CHECK2-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK2-NEXT: store ptr [[TMP21]], ptr [[TMP31]], align 8
+// CHECK2-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK2-NEXT: store ptr [[TMP22]], ptr [[TMP32]], align 8
+// CHECK2-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK2-NEXT: store ptr @.offload_sizes, ptr [[TMP33]], align 8
+// CHECK2-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK2-NEXT: store ptr @.offload_maptypes, ptr [[TMP34]], align 8
+// CHECK2-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK2-NEXT: store ptr null, ptr [[TMP35]], align 8
+// CHECK2-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK2-NEXT: store ptr null, ptr [[TMP36]], align 8
+// CHECK2-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK2-NEXT: store i64 [[TMP27]], ptr [[TMP37]], align 8
+// CHECK2-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK2-NEXT: store i64 0, ptr [[TMP38]], align 8
+// CHECK2-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK2-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP39]], align 4
+// CHECK2-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK2-NEXT: store [3 x i32] [[TMP28]], ptr [[TMP40]], align 4
+// CHECK2-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK2-NEXT: store i32 0, ptr [[TMP41]], align 4
+// CHECK2-NEXT: [[TMP42:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3:[0-9]+]], i64 -1, i32 -1, i32 [[TMP23]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51.region_id, ptr [[KERNEL_ARGS]])
+// CHECK2-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0
+// CHECK2-NEXT: br i1 [[TMP43]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK2: omp_offload.failed:
+// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51(i64 [[TMP4]], ptr [[A]], i64 [[TMP6]], i64 [[TMP8]]) #[[ATTR2:[0-9]+]]
+// CHECK2-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK2: omp_offload.cont:
+// CHECK2-NEXT: [[TMP44:%.*]] = load i32, ptr [[N]], align 4
+// CHECK2-NEXT: store i32 [[TMP44]], ptr [[N_CASTED7]], align 4
+// CHECK2-NEXT: [[TMP45:%.*]] = load i64, ptr [[N_CASTED7]], align 8
+// CHECK2-NEXT: [[TMP46:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// CHECK2-NEXT: [[TMP47:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
+// CHECK2-NEXT: store i64 [[TMP45]], ptr [[TMP47]], align 8
+// CHECK2-NEXT: [[TMP48:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
+// CHECK2-NEXT: store i64 [[TMP45]], ptr [[TMP48]], align 8
+// CHECK2-NEXT: [[TMP49:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS10]], i64 0, i64 0
+// CHECK2-NEXT: store ptr null, ptr [[TMP49]], align 8
+// CHECK2-NEXT: [[TMP50:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 1
+// CHECK2-NEXT: store ptr [[A]], ptr [[TMP50]], align 8
+// CHECK2-NEXT: [[TMP51:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 1
+// CHECK2-NEXT: store ptr [[A]], ptr [[TMP51]], align 8
+// CHECK2-NEXT: [[TMP52:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS10]], i64 0, i64 1
+// CHECK2-NEXT: store ptr null, ptr [[TMP52]], align 8
+// CHECK2-NEXT: [[TMP53:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 2
+// CHECK2-NEXT: store ptr [[TMP46]], ptr [[TMP53]], align 8
+// CHECK2-NEXT: [[TMP54:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 2
+// CHECK2-NEXT: store ptr [[TMP46]], ptr [[TMP54]], align 8
+// CHECK2-NEXT: [[TMP55:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS10]], i64 0, i64 2
+// CHECK2-NEXT: store ptr null, ptr [[TMP55]], align 8
+// CHECK2-NEXT: [[TMP56:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP57:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
+// CHECK2-NEXT: [[TMP58:%.*]] = load i32, ptr [[N]], align 4
+// CHECK2-NEXT: store i32 [[TMP58]], ptr [[DOTCAPTURE_EXPR_12]], align 4
+// CHECK2-NEXT: [[TMP59:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_12]], align 4
+// CHECK2-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP59]], 0
+// CHECK2-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1
+// CHECK2-NEXT: [[SUB16:%.*]] = sub nsw i32 [[DIV15]], 1
+// CHECK2-NEXT: store i32 [[SUB16]], ptr [[DOTCAPTURE_EXPR_13]], align 4
+// CHECK2-NEXT: [[TMP60:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_13]], align 4
+// CHECK2-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP60]], 1
+// CHECK2-NEXT: [[TMP61:%.*]] = zext i32 [[ADD17]] to i64
+// CHECK2-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 0
+// CHECK2-NEXT: store i32 2, ptr [[TMP62]], align 4
+// CHECK2-NEXT: [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 1
+// CHECK2-NEXT: store i32 3, ptr [[TMP63]], align 4
+// CHECK2-NEXT: [[TMP64:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 2
+// CHECK2-NEXT: store ptr [[TMP56]], ptr [[TMP64]], align 8
+// CHECK2-NEXT: [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 3
+// CHECK2-NEXT: store ptr [[TMP57]], ptr [[TMP65]], align 8
+// CHECK2-NEXT: [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 4
+// CHECK2-NEXT: store ptr @.offload_sizes.1, ptr [[TMP66]], align 8
+// CHECK2-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 5
+// CHECK2-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP67]], align 8
+// CHECK2-NEXT: [[TMP68:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 6
+// CHECK2-NEXT: store ptr null, ptr [[TMP68]], align 8
+// CHECK2-NEXT: [[TMP69:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 7
+// CHECK2-NEXT: store ptr null, ptr [[TMP69]], align 8
+// CHECK2-NEXT: [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 8
+// CHECK2-NEXT: store i64 [[TMP61]], ptr [[TMP70]], align 8
+// CHECK2-NEXT: [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 9
+// CHECK2-NEXT: store i64 0, ptr [[TMP71]], align 8
+// CHECK2-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 10
+// CHECK2-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP72]], align 4
+// CHECK2-NEXT: [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 11
+// CHECK2-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP73]], align 4
+// CHECK2-NEXT: [[TMP74:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 12
+// CHECK2-NEXT: store i32 0, ptr [[TMP74]], align 4
+// CHECK2-NEXT: [[TMP75:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57.region_id, ptr [[KERNEL_ARGS18]])
+// CHECK2-NEXT: [[TMP76:%.*]] = icmp ne i32 [[TMP75]], 0
+// CHECK2-NEXT: br i1 [[TMP76]], label [[OMP_OFFLOAD_FAILED19:%.*]], label [[OMP_OFFLOAD_CONT20:%.*]]
+// CHECK2: omp_offload.failed19:
+// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57(i64 [[TMP45]], ptr [[A]], ptr [[TMP46]]) #[[ATTR2]]
+// CHECK2-NEXT: br label [[OMP_OFFLOAD_CONT20]]
+// CHECK2: omp_offload.cont20:
+// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[A]], i64 0, i64 0
+// CHECK2-NEXT: [[TMP77:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK2-NEXT: ret i32 [[TMP77]]
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51
+// CHECK2-SAME: (i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
+// CHECK2-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK2-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK2-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR2]], align 4
+// CHECK2-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP2]], i32 [[TMP3]])
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP4]], ptr [[N_CASTED]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK2-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51.omp_outlined, i64 [[TMP5]], ptr [[TMP1]])
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51.omp_outlined
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK2-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK2: omp.precond.then:
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2: cond.true:
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: br label [[COND_END:%.*]]
+// CHECK2: cond.false:
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END]]
+// CHECK2: cond.end:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// CHECK2-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP18]], ptr [[N_CASTED]], align 4
+// CHECK2-NEXT: [[TMP19:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK2-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51.omp_outlined.omp_outlined, i64 [[TMP15]], i64 [[TMP17]], i64 [[TMP19]], ptr [[TMP0]])
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK2-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: [[TMP22:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP23]])
+// CHECK2-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK2: omp.precond.end:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51.omp_outlined.omp_outlined
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK2-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK2: omp.precond.then:
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK2-NEXT: [[CONV:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK2-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK2-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP8]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+// CHECK2-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2: cond.true:
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: br label [[COND_END:%.*]]
+// CHECK2: cond.false:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END]]
+// CHECK2: cond.end:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK2-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK2-NEXT: store i32 [[ADD]], ptr [[I4]], align 4
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[I4]], align 4
+// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP17]] to i64
+// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK2-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2: omp.body.continue:
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP18]], 1
+// CHECK2-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP20]])
+// CHECK2-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK2: omp.precond.end:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57
+// CHECK2-SAME: (i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], ptr noundef [[G:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK2-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK2-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], ptr [[N_CASTED]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// CHECK2-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57.omp_outlined, i64 [[TMP2]], ptr [[TMP0]], ptr [[TMP3]])
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57.omp_outlined
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], ptr noundef [[G:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK2-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK2-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK2: omp.precond.then:
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP6]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2: cond.true:
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: br label [[COND_END:%.*]]
+// CHECK2: cond.false:
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END]]
+// CHECK2: cond.end:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// CHECK2-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK2-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK2-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP18]], ptr [[N_CASTED]], align 4
+// CHECK2-NEXT: [[TMP19:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK2-NEXT: [[TMP20:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// CHECK2-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 5, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57.omp_outlined.omp_outlined, i64 [[TMP15]], i64 [[TMP17]], i64 [[TMP19]], ptr [[TMP0]], ptr [[TMP20]])
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK2-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: [[TMP23:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP24]])
+// CHECK2-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK2: omp.precond.end:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57.omp_outlined.omp_outlined
+// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], ptr noundef [[G:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK2-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK2-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK2: omp.precond.then:
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK2-NEXT: [[CONV:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK2-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK2-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK2-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP8]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+// CHECK2-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2: cond.true:
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: br label [[COND_END:%.*]]
+// CHECK2: cond.false:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: br label [[COND_END]]
+// CHECK2: cond.end:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK2-NEXT: store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK2: omp.inner.for.cond:
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK2-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK2-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK2: omp.inner.for.body:
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK2-NEXT: store i32 [[ADD]], ptr [[I4]], align 4
+// CHECK2-NEXT: [[TMP17:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 0
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK2-NEXT: [[TMP19:%.*]] = load i32, ptr [[I4]], align 4
+// CHECK2-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64
+// CHECK2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK2-NEXT: store i32 [[TMP18]], ptr [[ARRAYIDX7]], align 4
+// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK2: omp.body.continue:
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK2: omp.inner.for.inc:
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP20]], 1
+// CHECK2-NEXT: store i32 [[ADD8]], ptr [[DOTOMP_IV]], align 4
+// CHECK2-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK2: omp.inner.for.end:
+// CHECK2-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK2: omp.loop.exit:
+// CHECK2-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
+// CHECK2-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP22]])
+// CHECK2-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK2: omp.precond.end:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK2-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@_Z16target_teams_funPi
+// CHECK4-SAME: (ptr noundef [[G:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 4
+// CHECK4-NEXT: [[N:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[A:%.*]] = alloca [1000 x i32], align 4
+// CHECK4-NEXT: [[TE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TH:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR__CASTED2:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [4 x ptr], align 4
+// CHECK4-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [4 x ptr], align 4
+// CHECK4-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [4 x ptr], align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK4-NEXT: [[N_CASTED7:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOFFLOAD_BASEPTRS8:%.*]] = alloca [3 x ptr], align 4
+// CHECK4-NEXT: [[DOTOFFLOAD_PTRS9:%.*]] = alloca [3 x ptr], align 4
+// CHECK4-NEXT: [[DOTOFFLOAD_MAPPERS10:%.*]] = alloca [3 x ptr], align 4
+// CHECK4-NEXT: [[_TMP11:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_12:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_13:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[KERNEL_ARGS18:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
+// CHECK4-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 4
+// CHECK4-NEXT: store i32 1000, ptr [[N]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load i32, ptr [[N]], align 4
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP0]], 128
+// CHECK4-NEXT: store i32 [[DIV]], ptr [[TE]], align 4
+// CHECK4-NEXT: store i32 128, ptr [[TH]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, ptr [[TE]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, ptr [[TH]], align 4
+// CHECK4-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, ptr [[N]], align 4
+// CHECK4-NEXT: store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 [[TMP7]], ptr [[DOTCAPTURE_EXPR__CASTED2]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED2]], align 4
+// CHECK4-NEXT: [[TMP9:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK4-NEXT: store i32 [[TMP4]], ptr [[TMP9]], align 4
+// CHECK4-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK4-NEXT: store i32 [[TMP4]], ptr [[TMP10]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK4-NEXT: store ptr null, ptr [[TMP11]], align 4
+// CHECK4-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK4-NEXT: store ptr [[A]], ptr [[TMP12]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK4-NEXT: store ptr [[A]], ptr [[TMP13]], align 4
+// CHECK4-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
+// CHECK4-NEXT: store ptr null, ptr [[TMP14]], align 4
+// CHECK4-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK4-NEXT: store i32 [[TMP6]], ptr [[TMP15]], align 4
+// CHECK4-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK4-NEXT: store i32 [[TMP6]], ptr [[TMP16]], align 4
+// CHECK4-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
+// CHECK4-NEXT: store ptr null, ptr [[TMP17]], align 4
+// CHECK4-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 3
+// CHECK4-NEXT: store i32 [[TMP8]], ptr [[TMP18]], align 4
+// CHECK4-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 3
+// CHECK4-NEXT: store i32 [[TMP8]], ptr [[TMP19]], align 4
+// CHECK4-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 3
+// CHECK4-NEXT: store ptr null, ptr [[TMP20]], align 4
+// CHECK4-NEXT: [[TMP21:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: [[TMP24:%.*]] = load i32, ptr [[N]], align 4
+// CHECK4-NEXT: store i32 [[TMP24]], ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK4-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP25]], 0
+// CHECK4-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK4-NEXT: [[SUB6:%.*]] = sub nsw i32 [[DIV5]], 1
+// CHECK4-NEXT: store i32 [[SUB6]], ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK4-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP26]], 1
+// CHECK4-NEXT: [[TMP27:%.*]] = zext i32 [[ADD]] to i64
+// CHECK4-NEXT: [[TMP28:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP23]], 0
+// CHECK4-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK4-NEXT: store i32 2, ptr [[TMP29]], align 4
+// CHECK4-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK4-NEXT: store i32 4, ptr [[TMP30]], align 4
+// CHECK4-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK4-NEXT: store ptr [[TMP21]], ptr [[TMP31]], align 4
+// CHECK4-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK4-NEXT: store ptr [[TMP22]], ptr [[TMP32]], align 4
+// CHECK4-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK4-NEXT: store ptr @.offload_sizes, ptr [[TMP33]], align 4
+// CHECK4-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK4-NEXT: store ptr @.offload_maptypes, ptr [[TMP34]], align 4
+// CHECK4-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK4-NEXT: store ptr null, ptr [[TMP35]], align 4
+// CHECK4-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK4-NEXT: store ptr null, ptr [[TMP36]], align 4
+// CHECK4-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK4-NEXT: store i64 [[TMP27]], ptr [[TMP37]], align 8
+// CHECK4-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK4-NEXT: store i64 0, ptr [[TMP38]], align 8
+// CHECK4-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK4-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP39]], align 4
+// CHECK4-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK4-NEXT: store [3 x i32] [[TMP28]], ptr [[TMP40]], align 4
+// CHECK4-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK4-NEXT: store i32 0, ptr [[TMP41]], align 4
+// CHECK4-NEXT: [[TMP42:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3:[0-9]+]], i64 -1, i32 -1, i32 [[TMP23]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51.region_id, ptr [[KERNEL_ARGS]])
+// CHECK4-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0
+// CHECK4-NEXT: br i1 [[TMP43]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK4: omp_offload.failed:
+// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51(i32 [[TMP4]], ptr [[A]], i32 [[TMP6]], i32 [[TMP8]]) #[[ATTR2:[0-9]+]]
+// CHECK4-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK4: omp_offload.cont:
+// CHECK4-NEXT: [[TMP44:%.*]] = load i32, ptr [[N]], align 4
+// CHECK4-NEXT: store i32 [[TMP44]], ptr [[N_CASTED7]], align 4
+// CHECK4-NEXT: [[TMP45:%.*]] = load i32, ptr [[N_CASTED7]], align 4
+// CHECK4-NEXT: [[TMP46:%.*]] = load ptr, ptr [[G_ADDR]], align 4
+// CHECK4-NEXT: [[TMP47:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
+// CHECK4-NEXT: store i32 [[TMP45]], ptr [[TMP47]], align 4
+// CHECK4-NEXT: [[TMP48:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
+// CHECK4-NEXT: store i32 [[TMP45]], ptr [[TMP48]], align 4
+// CHECK4-NEXT: [[TMP49:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS10]], i32 0, i32 0
+// CHECK4-NEXT: store ptr null, ptr [[TMP49]], align 4
+// CHECK4-NEXT: [[TMP50:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 1
+// CHECK4-NEXT: store ptr [[A]], ptr [[TMP50]], align 4
+// CHECK4-NEXT: [[TMP51:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 1
+// CHECK4-NEXT: store ptr [[A]], ptr [[TMP51]], align 4
+// CHECK4-NEXT: [[TMP52:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS10]], i32 0, i32 1
+// CHECK4-NEXT: store ptr null, ptr [[TMP52]], align 4
+// CHECK4-NEXT: [[TMP53:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 2
+// CHECK4-NEXT: store ptr [[TMP46]], ptr [[TMP53]], align 4
+// CHECK4-NEXT: [[TMP54:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 2
+// CHECK4-NEXT: store ptr [[TMP46]], ptr [[TMP54]], align 4
+// CHECK4-NEXT: [[TMP55:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS10]], i32 0, i32 2
+// CHECK4-NEXT: store ptr null, ptr [[TMP55]], align 4
+// CHECK4-NEXT: [[TMP56:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS8]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP57:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS9]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP58:%.*]] = load i32, ptr [[N]], align 4
+// CHECK4-NEXT: store i32 [[TMP58]], ptr [[DOTCAPTURE_EXPR_12]], align 4
+// CHECK4-NEXT: [[TMP59:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_12]], align 4
+// CHECK4-NEXT: [[SUB14:%.*]] = sub nsw i32 [[TMP59]], 0
+// CHECK4-NEXT: [[DIV15:%.*]] = sdiv i32 [[SUB14]], 1
+// CHECK4-NEXT: [[SUB16:%.*]] = sub nsw i32 [[DIV15]], 1
+// CHECK4-NEXT: store i32 [[SUB16]], ptr [[DOTCAPTURE_EXPR_13]], align 4
+// CHECK4-NEXT: [[TMP60:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_13]], align 4
+// CHECK4-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP60]], 1
+// CHECK4-NEXT: [[TMP61:%.*]] = zext i32 [[ADD17]] to i64
+// CHECK4-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 0
+// CHECK4-NEXT: store i32 2, ptr [[TMP62]], align 4
+// CHECK4-NEXT: [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 1
+// CHECK4-NEXT: store i32 3, ptr [[TMP63]], align 4
+// CHECK4-NEXT: [[TMP64:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 2
+// CHECK4-NEXT: store ptr [[TMP56]], ptr [[TMP64]], align 4
+// CHECK4-NEXT: [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 3
+// CHECK4-NEXT: store ptr [[TMP57]], ptr [[TMP65]], align 4
+// CHECK4-NEXT: [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 4
+// CHECK4-NEXT: store ptr @.offload_sizes.1, ptr [[TMP66]], align 4
+// CHECK4-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 5
+// CHECK4-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP67]], align 4
+// CHECK4-NEXT: [[TMP68:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 6
+// CHECK4-NEXT: store ptr null, ptr [[TMP68]], align 4
+// CHECK4-NEXT: [[TMP69:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 7
+// CHECK4-NEXT: store ptr null, ptr [[TMP69]], align 4
+// CHECK4-NEXT: [[TMP70:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 8
+// CHECK4-NEXT: store i64 [[TMP61]], ptr [[TMP70]], align 8
+// CHECK4-NEXT: [[TMP71:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 9
+// CHECK4-NEXT: store i64 0, ptr [[TMP71]], align 8
+// CHECK4-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 10
+// CHECK4-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP72]], align 4
+// CHECK4-NEXT: [[TMP73:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 11
+// CHECK4-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP73]], align 4
+// CHECK4-NEXT: [[TMP74:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS18]], i32 0, i32 12
+// CHECK4-NEXT: store i32 0, ptr [[TMP74]], align 4
+// CHECK4-NEXT: [[TMP75:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57.region_id, ptr [[KERNEL_ARGS18]])
+// CHECK4-NEXT: [[TMP76:%.*]] = icmp ne i32 [[TMP75]], 0
+// CHECK4-NEXT: br i1 [[TMP76]], label [[OMP_OFFLOAD_FAILED19:%.*]], label [[OMP_OFFLOAD_CONT20:%.*]]
+// CHECK4: omp_offload.failed19:
+// CHECK4-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57(i32 [[TMP45]], ptr [[A]], ptr [[TMP46]]) #[[ATTR2]]
+// CHECK4-NEXT: br label [[OMP_OFFLOAD_CONT20]]
+// CHECK4: omp_offload.cont20:
+// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[A]], i32 0, i32 0
+// CHECK4-NEXT: [[TMP77:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK4-NEXT: ret i32 [[TMP77]]
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51
+// CHECK4-SAME: (i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]], i32 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
+// CHECK4-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK4-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR2]], align 4
+// CHECK4-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP2]], i32 [[TMP3]])
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP4]], ptr [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK4-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51.omp_outlined, i32 [[TMP5]], ptr [[TMP1]])
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51.omp_outlined
+// CHECK4-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]]) #[[ATTR1]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK4-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK4-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK4: omp.precond.then:
+// CHECK4-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK4: cond.true:
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: br label [[COND_END:%.*]]
+// CHECK4: cond.false:
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END]]
+// CHECK4: cond.end:
+// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK4-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// CHECK4-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP16:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP16]], ptr [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP17:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK4-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51.omp_outlined.omp_outlined, i32 [[TMP14]], i32 [[TMP15]], i32 [[TMP17]], ptr [[TMP0]])
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK4-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP21]])
+// CHECK4-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.end:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51.omp_outlined.omp_outlined
+// CHECK4-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]]) #[[ATTR1]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK4-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK4-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK4: omp.precond.then:
+// CHECK4-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP8]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK4: cond.true:
+// CHECK4-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: br label [[COND_END:%.*]]
+// CHECK4: cond.false:
+// CHECK4-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END]]
+// CHECK4: cond.end:
+// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK4-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK4-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK4-NEXT: store i32 [[ADD]], ptr [[I3]], align 4
+// CHECK4-NEXT: [[TMP17:%.*]] = load i32, ptr [[I3]], align 4
+// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i32 0, i32 [[TMP17]]
+// CHECK4-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK4: omp.body.continue:
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP18]], 1
+// CHECK4-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP20]])
+// CHECK4-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.end:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57
+// CHECK4-SAME: (i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], ptr noundef [[G:%.*]]) #[[ATTR1]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK4-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 4
+// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK4-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK4-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], ptr [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load ptr, ptr [[G_ADDR]], align 4
+// CHECK4-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57.omp_outlined, i32 [[TMP2]], ptr [[TMP0]], ptr [[TMP3]])
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57.omp_outlined
+// CHECK4-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], ptr noundef [[G:%.*]]) #[[ATTR1]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK4-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK4-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK4-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK4-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK4: omp.precond.then:
+// CHECK4-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP6]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK4-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK4: cond.true:
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: br label [[COND_END:%.*]]
+// CHECK4: cond.false:
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END]]
+// CHECK4: cond.end:
+// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK4-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// CHECK4-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK4-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK4-NEXT: [[TMP16:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP16]], ptr [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP17:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK4-NEXT: [[TMP18:%.*]] = load ptr, ptr [[G_ADDR]], align 4
+// CHECK4-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 5, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57.omp_outlined.omp_outlined, i32 [[TMP14]], i32 [[TMP15]], i32 [[TMP17]], ptr [[TMP0]], ptr [[TMP18]])
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK4-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP22]])
+// CHECK4-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.end:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57.omp_outlined.omp_outlined
+// CHECK4-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], ptr noundef [[G:%.*]]) #[[ATTR1]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK4-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK4-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 4
+// CHECK4-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK4-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK4-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK4-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 4
+// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK4-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK4-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK4-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK4-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK4-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK4-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK4: omp.precond.then:
+// CHECK4-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK4-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK4-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK4-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK4-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP8]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK4-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+// CHECK4-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK4: cond.true:
+// CHECK4-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK4-NEXT: br label [[COND_END:%.*]]
+// CHECK4: cond.false:
+// CHECK4-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: br label [[COND_END]]
+// CHECK4: cond.end:
+// CHECK4-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK4-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK4-NEXT: store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK4: omp.inner.for.cond:
+// CHECK4-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK4-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK4-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK4: omp.inner.for.body:
+// CHECK4-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK4-NEXT: store i32 [[ADD]], ptr [[I3]], align 4
+// CHECK4-NEXT: [[TMP17:%.*]] = load ptr, ptr [[G_ADDR]], align 4
+// CHECK4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
+// CHECK4-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK4-NEXT: [[TMP19:%.*]] = load i32, ptr [[I3]], align 4
+// CHECK4-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i32 0, i32 [[TMP19]]
+// CHECK4-NEXT: store i32 [[TMP18]], ptr [[ARRAYIDX6]], align 4
+// CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK4: omp.body.continue:
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK4: omp.inner.for.inc:
+// CHECK4-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP20]], 1
+// CHECK4-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4
+// CHECK4-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK4: omp.inner.for.end:
+// CHECK4-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK4: omp.loop.exit:
+// CHECK4-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK4-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
+// CHECK4-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP22]])
+// CHECK4-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK4: omp.precond.end:
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK4-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK4-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK4-NEXT: entry:
+// CHECK4-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK4-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51
+// CHECK10-SAME: (i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]])
+// CHECK10-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK10-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK10-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK10-NEXT: store i64 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 8
+// CHECK10-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK10-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK10-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR2]], align 4
+// CHECK10-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP2]], i32 [[TMP3]])
+// CHECK10-NEXT: [[TMP4:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP4]], ptr [[N_CASTED]], align 4
+// CHECK10-NEXT: [[TMP5:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK10-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51.omp_outlined, i64 [[TMP5]], ptr [[TMP1]])
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51.omp_outlined
+// CHECK10-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK10-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK10-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK10-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK10-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK10-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK10-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK10: omp.precond.then:
+// CHECK10-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK10-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK10-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK10: cond.true:
+// CHECK10-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: br label [[COND_END:%.*]]
+// CHECK10: cond.false:
+// CHECK10-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: br label [[COND_END]]
+// CHECK10: cond.end:
+// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK10-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK10: omp.inner.for.cond:
+// CHECK10-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// CHECK10-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK10: omp.inner.for.body:
+// CHECK10-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK10-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK10-NEXT: [[TMP18:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP18]], ptr [[N_CASTED]], align 4
+// CHECK10-NEXT: [[TMP19:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK10-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51.omp_outlined.omp_outlined, i64 [[TMP15]], i64 [[TMP17]], i64 [[TMP19]], ptr [[TMP0]])
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK10: omp.inner.for.inc:
+// CHECK10-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK10-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK10: omp.inner.for.end:
+// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK10: omp.loop.exit:
+// CHECK10-NEXT: [[TMP22:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP23]])
+// CHECK10-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK10: omp.precond.end:
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51.omp_outlined.omp_outlined
+// CHECK10-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK10-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK10-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK10-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK10-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK10-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK10-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK10-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK10-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK10-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK10: omp.precond.then:
+// CHECK10-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: [[TMP5:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK10-NEXT: [[CONV:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK10-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK10-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK10-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP8]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK10-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+// CHECK10-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK10: cond.true:
+// CHECK10-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: br label [[COND_END:%.*]]
+// CHECK10: cond.false:
+// CHECK10-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: br label [[COND_END]]
+// CHECK10: cond.end:
+// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK10-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK10: omp.inner.for.cond:
+// CHECK10-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK10-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK10: omp.inner.for.body:
+// CHECK10-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK10-NEXT: store i32 [[ADD]], ptr [[I4]], align 4
+// CHECK10-NEXT: [[TMP17:%.*]] = load i32, ptr [[I4]], align 4
+// CHECK10-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP17]] to i64
+// CHECK10-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK10-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+// CHECK10-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK10: omp.body.continue:
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK10: omp.inner.for.inc:
+// CHECK10-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP18]], 1
+// CHECK10-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK10: omp.inner.for.end:
+// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK10: omp.loop.exit:
+// CHECK10-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP20]])
+// CHECK10-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK10: omp.precond.end:
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57
+// CHECK10-SAME: (i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], ptr noundef [[G:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK10-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8
+// CHECK10-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK10-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK10-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8
+// CHECK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK10-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP1]], ptr [[N_CASTED]], align 4
+// CHECK10-NEXT: [[TMP2:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK10-NEXT: [[TMP3:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// CHECK10-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57.omp_outlined, i64 [[TMP2]], ptr [[TMP0]], ptr [[TMP3]])
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57.omp_outlined
+// CHECK10-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], ptr noundef [[G:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK10-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8
+// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK10-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK10-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK10-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8
+// CHECK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK10-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK10-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK10-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK10-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK10: omp.precond.then:
+// CHECK10-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP6]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK10-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK10-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK10: cond.true:
+// CHECK10-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: br label [[COND_END:%.*]]
+// CHECK10: cond.false:
+// CHECK10-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: br label [[COND_END]]
+// CHECK10: cond.end:
+// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK10-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK10: omp.inner.for.cond:
+// CHECK10-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// CHECK10-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK10: omp.inner.for.body:
+// CHECK10-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK10-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK10-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK10-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK10-NEXT: [[TMP18:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP18]], ptr [[N_CASTED]], align 4
+// CHECK10-NEXT: [[TMP19:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK10-NEXT: [[TMP20:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// CHECK10-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 5, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57.omp_outlined.omp_outlined, i64 [[TMP15]], i64 [[TMP17]], i64 [[TMP19]], ptr [[TMP0]], ptr [[TMP20]])
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK10: omp.inner.for.inc:
+// CHECK10-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK10-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK10: omp.inner.for.end:
+// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK10: omp.loop.exit:
+// CHECK10-NEXT: [[TMP23:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP24]])
+// CHECK10-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK10: omp.precond.end:
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK10-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57.omp_outlined.omp_outlined
+// CHECK10-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], ptr noundef [[G:%.*]]) #[[ATTR0]] {
+// CHECK10-NEXT: entry:
+// CHECK10-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK10-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK10-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK10-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK10-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8
+// CHECK10-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK10-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK10-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK10-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK10-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK10-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK10-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8
+// CHECK10-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK10-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK10-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK10-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK10-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK10-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK10-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK10-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK10-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK10: omp.precond.then:
+// CHECK10-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: [[TMP5:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK10-NEXT: [[CONV:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK10-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK10-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK10-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK10-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK10-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP8]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK10-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+// CHECK10-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK10: cond.true:
+// CHECK10-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK10-NEXT: br label [[COND_END:%.*]]
+// CHECK10: cond.false:
+// CHECK10-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: br label [[COND_END]]
+// CHECK10: cond.end:
+// CHECK10-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK10-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK10-NEXT: store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK10: omp.inner.for.cond:
+// CHECK10-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK10-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK10-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK10: omp.inner.for.body:
+// CHECK10-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK10-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK10-NEXT: store i32 [[ADD]], ptr [[I4]], align 4
+// CHECK10-NEXT: [[TMP17:%.*]] = load ptr, ptr [[G_ADDR]], align 8
+// CHECK10-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 0
+// CHECK10-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK10-NEXT: [[TMP19:%.*]] = load i32, ptr [[I4]], align 4
+// CHECK10-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64
+// CHECK10-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK10-NEXT: store i32 [[TMP18]], ptr [[ARRAYIDX7]], align 4
+// CHECK10-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK10: omp.body.continue:
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK10: omp.inner.for.inc:
+// CHECK10-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP20]], 1
+// CHECK10-NEXT: store i32 [[ADD8]], ptr [[DOTOMP_IV]], align 4
+// CHECK10-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK10: omp.inner.for.end:
+// CHECK10-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK10: omp.loop.exit:
+// CHECK10-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK10-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
+// CHECK10-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP22]])
+// CHECK10-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK10: omp.precond.end:
+// CHECK10-NEXT: ret void
+//
+//
+// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51
+// CHECK12-SAME: (i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]], i32 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK12-NEXT: entry:
+// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK12-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]])
+// CHECK12-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK12-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK12-NEXT: store i32 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 4
+// CHECK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK12-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK12-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR2]], align 4
+// CHECK12-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP2]], i32 [[TMP3]])
+// CHECK12-NEXT: [[TMP4:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP4]], ptr [[N_CASTED]], align 4
+// CHECK12-NEXT: [[TMP5:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK12-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51.omp_outlined, i32 [[TMP5]], ptr [[TMP1]])
+// CHECK12-NEXT: ret void
+//
+//
+// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51.omp_outlined
+// CHECK12-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]]) #[[ATTR0]] {
+// CHECK12-NEXT: entry:
+// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK12-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK12-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK12-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK12-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK12-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK12: omp.precond.then:
+// CHECK12-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK12-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK12-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK12-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK12-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP6]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK12-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK12-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK12: cond.true:
+// CHECK12-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: br label [[COND_END:%.*]]
+// CHECK12: cond.false:
+// CHECK12-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: br label [[COND_END]]
+// CHECK12: cond.end:
+// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK12-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK12-NEXT: store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK12: omp.inner.for.cond:
+// CHECK12-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// CHECK12-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK12: omp.inner.for.body:
+// CHECK12-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK12-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP16:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP16]], ptr [[N_CASTED]], align 4
+// CHECK12-NEXT: [[TMP17:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK12-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51.omp_outlined.omp_outlined, i32 [[TMP14]], i32 [[TMP15]], i32 [[TMP17]], ptr [[TMP0]])
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK12: omp.inner.for.inc:
+// CHECK12-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK12-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK12: omp.inner.for.end:
+// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK12: omp.loop.exit:
+// CHECK12-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
+// CHECK12-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP21]])
+// CHECK12-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK12: omp.precond.end:
+// CHECK12-NEXT: ret void
+//
+//
+// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l51.omp_outlined.omp_outlined
+// CHECK12-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]]) #[[ATTR0]] {
+// CHECK12-NEXT: entry:
+// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK12-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK12-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK12-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK12-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK12-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK12: omp.precond.then:
+// CHECK12-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK12-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK12-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK12-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_LB]], align 4
+// CHECK12-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_UB]], align 4
+// CHECK12-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK12-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK12-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK12-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP8]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK12-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK12-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+// CHECK12-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK12: cond.true:
+// CHECK12-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: br label [[COND_END:%.*]]
+// CHECK12: cond.false:
+// CHECK12-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK12-NEXT: br label [[COND_END]]
+// CHECK12: cond.end:
+// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK12-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK12-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK12-NEXT: store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK12: omp.inner.for.cond:
+// CHECK12-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK12-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK12-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK12: omp.inner.for.body:
+// CHECK12-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK12-NEXT: store i32 [[ADD]], ptr [[I3]], align 4
+// CHECK12-NEXT: [[TMP17:%.*]] = load i32, ptr [[I3]], align 4
+// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i32 0, i32 [[TMP17]]
+// CHECK12-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK12: omp.body.continue:
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK12: omp.inner.for.inc:
+// CHECK12-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP18]], 1
+// CHECK12-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK12: omp.inner.for.end:
+// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK12: omp.loop.exit:
+// CHECK12-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+// CHECK12-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP20]])
+// CHECK12-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK12: omp.precond.end:
+// CHECK12-NEXT: ret void
+//
+//
+// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57
+// CHECK12-SAME: (i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], ptr noundef [[G:%.*]]) #[[ATTR0]] {
+// CHECK12-NEXT: entry:
+// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK12-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 4
+// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK12-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK12-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 4
+// CHECK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK12-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP1]], ptr [[N_CASTED]], align 4
+// CHECK12-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK12-NEXT: [[TMP3:%.*]] = load ptr, ptr [[G_ADDR]], align 4
+// CHECK12-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57.omp_outlined, i32 [[TMP2]], ptr [[TMP0]], ptr [[TMP3]])
+// CHECK12-NEXT: ret void
+//
+//
+// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57.omp_outlined
+// CHECK12-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], ptr noundef [[G:%.*]]) #[[ATTR0]] {
+// CHECK12-NEXT: entry:
+// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK12-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 4
+// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK12-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK12-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK12-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 4
+// CHECK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK12-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK12-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK12-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK12: omp.precond.then:
+// CHECK12-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK12-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK12-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK12-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK12-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP6]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK12-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK12-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK12: cond.true:
+// CHECK12-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: br label [[COND_END:%.*]]
+// CHECK12: cond.false:
+// CHECK12-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: br label [[COND_END]]
+// CHECK12: cond.end:
+// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK12-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK12-NEXT: store i32 [[TMP11]], ptr [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK12: omp.inner.for.cond:
+// CHECK12-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// CHECK12-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK12: omp.inner.for.body:
+// CHECK12-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK12-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK12-NEXT: [[TMP16:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP16]], ptr [[N_CASTED]], align 4
+// CHECK12-NEXT: [[TMP17:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK12-NEXT: [[TMP18:%.*]] = load ptr, ptr [[G_ADDR]], align 4
+// CHECK12-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 5, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57.omp_outlined.omp_outlined, i32 [[TMP14]], i32 [[TMP15]], i32 [[TMP17]], ptr [[TMP0]], ptr [[TMP18]])
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK12: omp.inner.for.inc:
+// CHECK12-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK12-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK12: omp.inner.for.end:
+// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK12: omp.loop.exit:
+// CHECK12-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
+// CHECK12-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP22]])
+// CHECK12-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK12: omp.precond.end:
+// CHECK12-NEXT: ret void
+//
+//
+// CHECK12-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16target_teams_funPi_l57.omp_outlined.omp_outlined
+// CHECK12-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4000) [[A:%.*]], ptr noundef [[G:%.*]]) #[[ATTR0]] {
+// CHECK12-NEXT: entry:
+// CHECK12-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK12-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK12-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK12-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 4
+// CHECK12-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK12-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK12-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK12-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK12-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK12-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK12-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 4
+// CHECK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK12-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK12-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK12-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK12-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK12-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK12-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK12: omp.precond.then:
+// CHECK12-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK12-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_UB]], align 4
+// CHECK12-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK12-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK12-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_LB]], align 4
+// CHECK12-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_UB]], align 4
+// CHECK12-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK12-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK12-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK12-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP8]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK12-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK12-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+// CHECK12-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK12: cond.true:
+// CHECK12-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK12-NEXT: br label [[COND_END:%.*]]
+// CHECK12: cond.false:
+// CHECK12-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK12-NEXT: br label [[COND_END]]
+// CHECK12: cond.end:
+// CHECK12-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK12-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK12-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK12-NEXT: store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK12: omp.inner.for.cond:
+// CHECK12-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK12-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK12-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK12: omp.inner.for.body:
+// CHECK12-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK12-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK12-NEXT: store i32 [[ADD]], ptr [[I3]], align 4
+// CHECK12-NEXT: [[TMP17:%.*]] = load ptr, ptr [[G_ADDR]], align 4
+// CHECK12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
+// CHECK12-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK12-NEXT: [[TMP19:%.*]] = load i32, ptr [[I3]], align 4
+// CHECK12-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [1000 x i32], ptr [[TMP0]], i32 0, i32 [[TMP19]]
+// CHECK12-NEXT: store i32 [[TMP18]], ptr [[ARRAYIDX6]], align 4
+// CHECK12-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK12: omp.body.continue:
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK12: omp.inner.for.inc:
+// CHECK12-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP20]], 1
+// CHECK12-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4
+// CHECK12-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK12: omp.inner.for.end:
+// CHECK12-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK12: omp.loop.exit:
+// CHECK12-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK12-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
+// CHECK12-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP22]])
+// CHECK12-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK12: omp.precond.end:
+// CHECK12-NEXT: ret void
+//
diff --git a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
new file mode 100644
index 00000000000000..aa751976dfa269
--- /dev/null
+++ b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
@@ -0,0 +1,2688 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=IR-GPU
+
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -emit-llvm %s -o - | FileCheck %s --check-prefix=IR
+
+// Check same results after serialization round-trip
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -emit-pch -o %t %s
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR-PCH
+
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+int foo() {
+ int i;
+ int j;
+ int sum[10][10];
+
+ #pragma omp target teams loop reduction(+:sum) collapse(2) \
+ bind(parallel) order(concurrent) lastprivate(j) map(tofrom:sum)
+ for(i=0; i<10; i++)
+ for(j=0; j<10; j++)
+ sum[i][j] += i;
+
+ return 0;
+}
+#endif
+// IR-PCH-HOST-LABEL: define {{[^@]+}}@_Z3foov
+// IR-PCH-HOST-SAME: () #[[ATTR0:[0-9]+]] {
+// IR-PCH-HOST-NEXT: entry:
+// IR-PCH-HOST-NEXT: [[I:%.*]] = alloca i32, align 4
+// IR-PCH-HOST-NEXT: [[J:%.*]] = alloca i32, align 4
+// IR-PCH-HOST-NEXT: [[SUM:%.*]] = alloca [10 x [10 x i32]], align 16
+// IR-PCH-HOST-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8
+// IR-PCH-HOST-NEXT: [[TMP0:%.*]] = load i32, ptr [[J]], align 4
+// IR-PCH-HOST-NEXT: store i32 [[TMP0]], ptr [[J_CASTED]], align 4
+// IR-PCH-HOST-NEXT: [[TMP1:%.*]] = load i64, ptr [[J_CASTED]], align 8
+// IR-PCH-HOST-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22(i64 [[TMP1]], ptr [[SUM]]) #[[ATTR2:[0-9]+]]
+// IR-PCH-HOST-NEXT: ret i32 0
+// IR-PCH-HOST-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22
+// IR-PCH-HOST-SAME: (i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
+// IR-PCH-HOST-NEXT: entry:
+// IR-PCH-HOST-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8
+// IR-PCH-HOST-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-HOST-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8
+// IR-PCH-HOST-NEXT: store i64 [[J]], ptr [[J_ADDR]], align 8
+// IR-PCH-HOST-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
+// IR-PCH-HOST-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
+// IR-PCH-HOST-NEXT: [[TMP1:%.*]] = load i32, ptr [[J_ADDR]], align 4
+// IR-PCH-HOST-NEXT: store i32 [[TMP1]], ptr [[J_CASTED]], align 4
+// IR-PCH-HOST-NEXT: [[TMP2:%.*]] = load i64, ptr [[J_CASTED]], align 8
+// IR-PCH-HOST-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB4:[0-9]+]], i32 2, ptr @.omp_outlined., i64 [[TMP2]], ptr [[TMP0]])
+// IR-PCH-HOST-NEXT: ret void
+// IR-PCH-HOST-LABEL: define {{[^@]+}}@.omp_outlined.
+// IR-PCH-HOST-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
+// IR-PCH-HOST-NEXT: entry:
+// IR-PCH-HOST-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-HOST-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-HOST-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8
+// IR-PCH-HOST-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-HOST-NEXT: [[SUM1:%.*]] = alloca [10 x [10 x i32]], align 16
+// IR-PCH-HOST-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// IR-PCH-HOST-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// IR-PCH-HOST-NEXT: [[_TMP2:%.*]] = alloca i32, align 4
+// IR-PCH-HOST-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// IR-PCH-HOST-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// IR-PCH-HOST-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// IR-PCH-HOST-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// IR-PCH-HOST-NEXT: [[J3:%.*]] = alloca i32, align 4
+// IR-PCH-HOST-NEXT: [[I:%.*]] = alloca i32, align 4
+// IR-PCH-HOST-NEXT: [[J4:%.*]] = alloca i32, align 4
+// IR-PCH-HOST-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8
+// IR-PCH-HOST-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// IR-PCH-HOST-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-HOST-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// IR-PCH-HOST-NEXT: store i64 [[J]], ptr [[J_ADDR]], align 8
+// IR-PCH-HOST-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
+// IR-PCH-HOST-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
+// IR-PCH-HOST-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM1]], i32 0, i32 0, i32 0
+// IR-PCH-HOST-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]]
+// IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
+// IR-PCH-HOST: omp.arrayinit.body:
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
+// IR-PCH-HOST-NEXT: store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP1]]
+// IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
+// IR-PCH-HOST: omp.arrayinit.done:
+// IR-PCH-HOST-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// IR-PCH-HOST-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
+// IR-PCH-HOST-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// IR-PCH-HOST-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// IR-PCH-HOST-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-HOST-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// IR-PCH-HOST-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// IR-PCH-HOST-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// IR-PCH-HOST-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
+// IR-PCH-HOST-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// IR-PCH-HOST: cond.true:
+// IR-PCH-HOST-NEXT: br label [[COND_END:%.*]]
+// IR-PCH-HOST: cond.false:
+// IR-PCH-HOST-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// IR-PCH-HOST-NEXT: br label [[COND_END]]
+// IR-PCH-HOST: cond.end:
+// IR-PCH-HOST-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// IR-PCH-HOST-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// IR-PCH-HOST-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// IR-PCH-HOST-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// IR-PCH-HOST-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// IR-PCH-HOST: omp.inner.for.cond:
+// IR-PCH-HOST-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-PCH-HOST-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// IR-PCH-HOST-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// IR-PCH-HOST-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// IR-PCH-HOST: omp.inner.for.body:
+// IR-PCH-HOST-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// IR-PCH-HOST-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// IR-PCH-HOST-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// IR-PCH-HOST-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
+// IR-PCH-HOST-NEXT: [[TMP13:%.*]] = load i32, ptr [[J3]], align 4
+// IR-PCH-HOST-NEXT: store i32 [[TMP13]], ptr [[J_CASTED]], align 4
+// IR-PCH-HOST-NEXT: [[TMP14:%.*]] = load i64, ptr [[J_CASTED]], align 8
+// IR-PCH-HOST-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB4]], i32 4, ptr @.omp_outlined..1, i64 [[TMP10]], i64 [[TMP12]], i64 [[TMP14]], ptr [[SUM1]])
+// IR-PCH-HOST-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// IR-PCH-HOST: omp.inner.for.inc:
+// IR-PCH-HOST-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-PCH-HOST-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// IR-PCH-HOST-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// IR-PCH-HOST-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// IR-PCH-HOST-NEXT: br label [[OMP_INNER_FOR_COND]]
+// IR-PCH-HOST: omp.inner.for.end:
+// IR-PCH-HOST-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// IR-PCH-HOST: omp.loop.exit:
+// IR-PCH-HOST-NEXT: [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-HOST-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
+// IR-PCH-HOST-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP18]])
+// IR-PCH-HOST-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// IR-PCH-HOST-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
+// IR-PCH-HOST-NEXT: br i1 [[TMP20]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// IR-PCH-HOST: .omp.lastprivate.then:
+// IR-PCH-HOST-NEXT: store i32 10, ptr [[J3]], align 4
+// IR-PCH-HOST-NEXT: [[TMP21:%.*]] = load i32, ptr [[J3]], align 4
+// IR-PCH-HOST-NEXT: store i32 [[TMP21]], ptr [[J_ADDR]], align 4
+// IR-PCH-HOST-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// IR-PCH-HOST: .omp.lastprivate.done:
+// IR-PCH-HOST-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// IR-PCH-HOST-NEXT: store ptr [[SUM1]], ptr [[TMP22]], align 8
+// IR-PCH-HOST-NEXT: [[TMP23:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-HOST-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4
+// IR-PCH-HOST-NEXT: [[TMP25:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3:[0-9]+]], i32 [[TMP24]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @.omp.reduction.reduction_func.2, ptr @.gomp_critical_user_.reduction.var)
+// IR-PCH-HOST-NEXT: switch i32 [[TMP25]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// IR-PCH-HOST-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// IR-PCH-HOST-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// IR-PCH-HOST-NEXT: ]
+// IR-PCH-HOST: .omp.reduction.case1:
+// IR-PCH-HOST-NEXT: [[TMP26:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP26]]
+// IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE10:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// IR-PCH-HOST: omp.arraycpy.body:
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST6:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT8:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-PCH-HOST-NEXT: [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
+// IR-PCH-HOST-NEXT: [[TMP28:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// IR-PCH-HOST-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
+// IR-PCH-HOST-NEXT: store i32 [[ADD7]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT8]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], i32 1
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DONE9:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT8]], [[TMP26]]
+// IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_DONE9]], label [[OMP_ARRAYCPY_DONE10]], label [[OMP_ARRAYCPY_BODY]]
+// IR-PCH-HOST: omp.arraycpy.done10:
+// IR-PCH-HOST-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
+// IR-PCH-HOST-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// IR-PCH-HOST: .omp.reduction.case2:
+// IR-PCH-HOST-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_ISEMPTY11:%.*]] = icmp eq ptr [[TMP0]], [[TMP29]]
+// IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY11]], label [[OMP_ARRAYCPY_DONE18:%.*]], label [[OMP_ARRAYCPY_BODY12:%.*]]
+// IR-PCH-HOST: omp.arraycpy.body12:
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST13:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT16:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST14:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
+// IR-PCH-HOST-NEXT: [[TMP30:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], align 4
+// IR-PCH-HOST-NEXT: [[TMP31:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 [[TMP30]] monotonic, align 4
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 1
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT16]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], i32 1
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DONE17:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP29]]
+// IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_BODY12]]
+// IR-PCH-HOST: omp.arraycpy.done18:
+// IR-PCH-HOST-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
+// IR-PCH-HOST-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// IR-PCH-HOST: .omp.reduction.default:
+// IR-PCH-HOST-NEXT: ret void
+// IR-PCH-HOST-LABEL: define {{[^@]+}}@.omp_outlined..1
+// IR-PCH-HOST-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
+// IR-PCH-HOST-NEXT: entry:
+// IR-PCH-HOST-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-HOST-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-HOST-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// IR-PCH-HOST-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// IR-PCH-HOST-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8
+// IR-PCH-HOST-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-HOST-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// IR-PCH-HOST-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// IR-PCH-HOST-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// IR-PCH-HOST-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// IR-PCH-HOST-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// IR-PCH-HOST-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// IR-PCH-HOST-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// IR-PCH-HOST-NEXT: [[J3:%.*]] = alloca i32, align 4
+// IR-PCH-HOST-NEXT: [[SUM4:%.*]] = alloca [10 x [10 x i32]], align 16
+// IR-PCH-HOST-NEXT: [[I:%.*]] = alloca i32, align 4
+// IR-PCH-HOST-NEXT: [[J5:%.*]] = alloca i32, align 4
+// IR-PCH-HOST-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// IR-PCH-HOST-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-HOST-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// IR-PCH-HOST-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// IR-PCH-HOST-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// IR-PCH-HOST-NEXT: store i64 [[J]], ptr [[J_ADDR]], align 8
+// IR-PCH-HOST-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
+// IR-PCH-HOST-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
+// IR-PCH-HOST-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// IR-PCH-HOST-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4
+// IR-PCH-HOST-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// IR-PCH-HOST-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// IR-PCH-HOST-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// IR-PCH-HOST-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
+// IR-PCH-HOST-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// IR-PCH-HOST-NEXT: store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
+// IR-PCH-HOST-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// IR-PCH-HOST-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// IR-PCH-HOST-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i32 0, i32 0, i32 0
+// IR-PCH-HOST-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP3]]
+// IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
+// IR-PCH-HOST: omp.arrayinit.body:
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
+// IR-PCH-HOST-NEXT: store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]]
+// IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
+// IR-PCH-HOST: omp.arrayinit.done:
+// IR-PCH-HOST-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-HOST-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+// IR-PCH-HOST-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP5]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// IR-PCH-HOST-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// IR-PCH-HOST-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP6]], 99
+// IR-PCH-HOST-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// IR-PCH-HOST: cond.true:
+// IR-PCH-HOST-NEXT: br label [[COND_END:%.*]]
+// IR-PCH-HOST: cond.false:
+// IR-PCH-HOST-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// IR-PCH-HOST-NEXT: br label [[COND_END]]
+// IR-PCH-HOST: cond.end:
+// IR-PCH-HOST-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP7]], [[COND_FALSE]] ]
+// IR-PCH-HOST-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// IR-PCH-HOST-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// IR-PCH-HOST-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV]], align 4
+// IR-PCH-HOST-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// IR-PCH-HOST: omp.inner.for.cond:
+// IR-PCH-HOST-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3:![0-9]+]]
+// IR-PCH-HOST-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-HOST-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP9]], [[TMP10]]
+// IR-PCH-HOST-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// IR-PCH-HOST: omp.inner.for.body:
+// IR-PCH-HOST-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-HOST-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP11]], 10
+// IR-PCH-HOST-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// IR-PCH-HOST-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// IR-PCH-HOST-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-HOST-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-HOST-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-HOST-NEXT: [[DIV7:%.*]] = sdiv i32 [[TMP13]], 10
+// IR-PCH-HOST-NEXT: [[MUL8:%.*]] = mul nsw i32 [[DIV7]], 10
+// IR-PCH-HOST-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP12]], [[MUL8]]
+// IR-PCH-HOST-NEXT: [[MUL9:%.*]] = mul nsw i32 [[SUB]], 1
+// IR-PCH-HOST-NEXT: [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
+// IR-PCH-HOST-NEXT: store i32 [[ADD10]], ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-HOST-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-HOST-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-HOST-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64
+// IR-PCH-HOST-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i64 0, i64 [[IDXPROM]]
+// IR-PCH-HOST-NEXT: [[TMP16:%.*]] = load i32, ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-HOST-NEXT: [[IDXPROM11:%.*]] = sext i32 [[TMP16]] to i64
+// IR-PCH-HOST-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
+// IR-PCH-HOST-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-HOST-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP17]], [[TMP14]]
+// IR-PCH-HOST-NEXT: store i32 [[ADD13]], ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-HOST-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// IR-PCH-HOST: omp.body.continue:
+// IR-PCH-HOST-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// IR-PCH-HOST: omp.inner.for.inc:
+// IR-PCH-HOST-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-HOST-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP18]], 1
+// IR-PCH-HOST-NEXT: store i32 [[ADD14]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-HOST-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]]
+// IR-PCH-HOST: omp.inner.for.end:
+// IR-PCH-HOST-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// IR-PCH-HOST: omp.loop.exit:
+// IR-PCH-HOST-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-HOST-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+// IR-PCH-HOST-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP20]])
+// IR-PCH-HOST-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// IR-PCH-HOST-NEXT: store ptr [[SUM4]], ptr [[TMP21]], align 8
+// IR-PCH-HOST-NEXT: [[TMP22:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-HOST-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
+// IR-PCH-HOST-NEXT: [[TMP24:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3]], i32 [[TMP23]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// IR-PCH-HOST-NEXT: switch i32 [[TMP24]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// IR-PCH-HOST-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// IR-PCH-HOST-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// IR-PCH-HOST-NEXT: ]
+// IR-PCH-HOST: .omp.reduction.case1:
+// IR-PCH-HOST-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP25]]
+// IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE19:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// IR-PCH-HOST: omp.arraycpy.body:
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST15:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT17:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-PCH-HOST-NEXT: [[TMP26:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
+// IR-PCH-HOST-NEXT: [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// IR-PCH-HOST-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
+// IR-PCH-HOST-NEXT: store i32 [[ADD16]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT17]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], i32 1
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DONE18:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT17]], [[TMP25]]
+// IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_DONE19]], label [[OMP_ARRAYCPY_BODY]]
+// IR-PCH-HOST: omp.arraycpy.done19:
+// IR-PCH-HOST-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
+// IR-PCH-HOST-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// IR-PCH-HOST: .omp.reduction.case2:
+// IR-PCH-HOST-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_ISEMPTY20:%.*]] = icmp eq ptr [[TMP0]], [[TMP28]]
+// IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY20]], label [[OMP_ARRAYCPY_DONE27:%.*]], label [[OMP_ARRAYCPY_BODY21:%.*]]
+// IR-PCH-HOST: omp.arraycpy.body21:
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST22:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT25:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST23:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT24:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
+// IR-PCH-HOST-NEXT: [[TMP29:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], align 4
+// IR-PCH-HOST-NEXT: [[TMP30:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 [[TMP29]] monotonic, align 4
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT24]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 1
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT25]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], i32 1
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DONE26:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT24]], [[TMP28]]
+// IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_DONE26]], label [[OMP_ARRAYCPY_DONE27]], label [[OMP_ARRAYCPY_BODY21]]
+// IR-PCH-HOST: omp.arraycpy.done27:
+// IR-PCH-HOST-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
+// IR-PCH-HOST-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// IR-PCH-HOST: .omp.reduction.default:
+// IR-PCH-HOST-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// IR-PCH-HOST-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
+// IR-PCH-HOST-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// IR-PCH-HOST: .omp.lastprivate.then:
+// IR-PCH-HOST-NEXT: store i32 10, ptr [[J3]], align 4
+// IR-PCH-HOST-NEXT: [[TMP33:%.*]] = load i32, ptr [[J3]], align 4
+// IR-PCH-HOST-NEXT: store i32 [[TMP33]], ptr [[J_ADDR]], align 4
+// IR-PCH-HOST-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// IR-PCH-HOST: .omp.lastprivate.done:
+// IR-PCH-HOST-NEXT: ret void
+// IR-PCH-HOST-LABEL: define {{[^@]+}}@.omp.reduction.reduction_func
+// IR-PCH-HOST-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+// IR-PCH-HOST-NEXT: entry:
+// IR-PCH-HOST-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-HOST-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// IR-PCH-HOST-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// IR-PCH-HOST-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// IR-PCH-HOST-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// IR-PCH-HOST-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// IR-PCH-HOST-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// IR-PCH-HOST-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// IR-PCH-HOST-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// IR-PCH-HOST-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// IR-PCH-HOST-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
+// IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// IR-PCH-HOST: omp.arraycpy.body:
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-PCH-HOST-NEXT: [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-PCH-HOST-NEXT: [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// IR-PCH-HOST-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// IR-PCH-HOST-NEXT: store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
+// IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
+// IR-PCH-HOST: omp.arraycpy.done2:
+// IR-PCH-HOST-NEXT: ret void
+// IR-PCH-HOST-LABEL: define {{[^@]+}}@.omp.reduction.reduction_func.2
+// IR-PCH-HOST-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
+// IR-PCH-HOST-NEXT: entry:
+// IR-PCH-HOST-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-HOST-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// IR-PCH-HOST-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// IR-PCH-HOST-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// IR-PCH-HOST-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// IR-PCH-HOST-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// IR-PCH-HOST-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// IR-PCH-HOST-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// IR-PCH-HOST-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// IR-PCH-HOST-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// IR-PCH-HOST-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
+// IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// IR-PCH-HOST: omp.arraycpy.body:
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-PCH-HOST-NEXT: [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-PCH-HOST-NEXT: [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// IR-PCH-HOST-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// IR-PCH-HOST-NEXT: store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
+// IR-PCH-HOST-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
+// IR-PCH-HOST-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
+// IR-PCH-HOST: omp.arraycpy.done2:
+// IR-PCH-HOST-NEXT: ret void
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l23
+// CHECK-SAME: (i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
+// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8, addrspace(5)
+// CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
+// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
+// CHECK-NEXT: [[J_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_CASTED]] to ptr
+// CHECK-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr
+// CHECK-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
+// CHECK-NEXT: store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), i8 2, i1 false)
+// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// CHECK: user_code.entry:
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[J_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[TMP3]], ptr [[J_CASTED_ASCAST]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[J_CASTED_ASCAST]], align 8
+// CHECK-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4
+// CHECK-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP4]], ptr [[TMP0]]) #[[ATTR2:[0-9]+]]
+// CHECK-NEXT: call void @__kmpc_target_deinit(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 2)
+// CHECK-NEXT: ret void
+// CHECK: worker.exit:
+// CHECK-NEXT: ret void
+// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
+// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[SUM1:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
+// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[_TMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[J3:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[J4:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8, addrspace(5)
+// CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8, addrspace(5)
+// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
+// CHECK-NEXT: [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
+// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
+// CHECK-NEXT: [[SUM1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM1]] to ptr
+// CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr
+// CHECK-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
+// CHECK-NEXT: [[TMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[_TMP2]] to ptr
+// CHECK-NEXT: [[DOTOMP_COMB_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_COMB_LB]] to ptr
+// CHECK-NEXT: [[DOTOMP_COMB_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_COMB_UB]] to ptr
+// CHECK-NEXT: [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr
+// CHECK-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr
+// CHECK-NEXT: [[J3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J3]] to ptr
+// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
+// CHECK-NEXT: [[J4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J4]] to ptr
+// CHECK-NEXT: [[J_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_CASTED]] to ptr
+// CHECK-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
+// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
+// CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM1_ASCAST]], i32 0, i32 0, i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
+// CHECK-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]]
+// CHECK-NEXT: br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
+// CHECK: omp.arrayinit.body:
+// CHECK-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
+// CHECK-NEXT: store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// CHECK-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// CHECK-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP1]]
+// CHECK-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
+// CHECK: omp.arrayinit.done:
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
+// CHECK-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
+// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
+// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK-NEXT: call void @__kmpc_distribute_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), i32 [[TMP3]], i32 91, ptr [[DOTOMP_IS_LAST_ASCAST]], ptr [[DOTOMP_COMB_LB_ASCAST]], ptr [[DOTOMP_COMB_UB_ASCAST]], ptr [[DOTOMP_STRIDE_ASCAST]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
+// CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK: cond.true:
+// CHECK-NEXT: br label [[COND_END:%.*]]
+// CHECK: cond.false:
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
+// CHECK-NEXT: br label [[COND_END]]
+// CHECK: cond.end:
+// CHECK-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV_ASCAST]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK: omp.inner.for.cond:
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4
+// CHECK-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP7]], 100
+// CHECK-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK: omp.inner.for.body:
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
+// CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[J3_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[TMP12]], ptr [[J_CASTED_ASCAST]], align 4
+// CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[J_CASTED_ASCAST]], align 8
+// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0
+// CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP9]] to ptr
+// CHECK-NEXT: store ptr [[TMP15]], ptr [[TMP14]], align 8
+// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 1
+// CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// CHECK-NEXT: store ptr [[TMP17]], ptr [[TMP16]], align 8
+// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 2
+// CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// CHECK-NEXT: store ptr [[TMP19]], ptr [[TMP18]], align 8
+// CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 3
+// CHECK-NEXT: store ptr [[SUM1_ASCAST]], ptr [[TMP20]], align 8
+// CHECK-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
+// CHECK-NEXT: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP22]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__.1, ptr null, ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 4)
+// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK: omp.inner.for.inc:
+// CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4
+// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], [[TMP24]]
+// CHECK-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV_ASCAST]], align 4
+// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
+// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
+// CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP25]], [[TMP26]]
+// CHECK-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
+// CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
+// CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
+// CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
+// CHECK-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
+// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
+// CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[TMP29]], 99
+// CHECK-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
+// CHECK: cond.true9:
+// CHECK-NEXT: br label [[COND_END11:%.*]]
+// CHECK: cond.false10:
+// CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
+// CHECK-NEXT: br label [[COND_END11]]
+// CHECK: cond.end11:
+// CHECK-NEXT: [[COND12:%.*]] = phi i32 [ 99, [[COND_TRUE9]] ], [ [[TMP30]], [[COND_FALSE10]] ]
+// CHECK-NEXT: store i32 [[COND12]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
+// CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[TMP31]], ptr [[DOTOMP_IV_ASCAST]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK: omp.inner.for.end:
+// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK: omp.loop.exit:
+// CHECK-NEXT: [[TMP32:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4
+// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB3:[0-9]+]] to ptr), i32 [[TMP33]])
+// CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
+// CHECK-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0
+// CHECK-NEXT: br i1 [[TMP35]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK: .omp.lastprivate.then:
+// CHECK-NEXT: store i32 10, ptr [[J3_ASCAST]], align 4
+// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[J3_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[TMP36]], ptr [[J_ADDR_ASCAST]], align 4
+// CHECK-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK: .omp.lastprivate.done:
+// CHECK-NEXT: [[TMP37:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP37]], align 4
+// CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
+// CHECK-NEXT: store ptr [[SUM1_ASCAST]], ptr [[TMP39]], align 8
+// CHECK-NEXT: [[TMP40:%.*]] = load ptr, ptr addrspace(1) @"_openmp_teams_reductions_buffer_$_$ptr", align 8
+// CHECK-NEXT: [[TMP41:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP38]], ptr [[TMP40]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func.3, ptr @_omp_reduction_inter_warp_copy_func.4, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
+// CHECK-NEXT: [[TMP42:%.*]] = icmp eq i32 [[TMP41]], 1
+// CHECK-NEXT: br i1 [[TMP42]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK: .omp.reduction.then:
+// CHECK-NEXT: [[TMP43:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
+// CHECK-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP43]]
+// CHECK-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE17:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// CHECK: omp.arraycpy.body:
+// CHECK-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1_ASCAST]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST13:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK-NEXT: [[TMP44:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4
+// CHECK-NEXT: [[TMP45:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// CHECK-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP44]], [[TMP45]]
+// CHECK-NEXT: store i32 [[ADD14]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4
+// CHECK-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], i32 1
+// CHECK-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
+// CHECK-NEXT: [[OMP_ARRAYCPY_DONE16:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP43]]
+// CHECK-NEXT: br i1 [[OMP_ARRAYCPY_DONE16]], label [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_BODY]]
+// CHECK: omp.arraycpy.done17:
+// CHECK-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP38]])
+// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK: .omp.reduction.done:
+// CHECK-NEXT: ret void
+// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__.1
+// CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8, addrspace(5)
+// CHECK-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8, addrspace(5)
+// CHECK-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
+// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[_TMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[J3:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SUM4:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[J5:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
+// CHECK-NEXT: [[DOTPREVIOUS_LB__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTPREVIOUS_LB__ADDR]] to ptr
+// CHECK-NEXT: [[DOTPREVIOUS_UB__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTPREVIOUS_UB__ADDR]] to ptr
+// CHECK-NEXT: [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
+// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
+// CHECK-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr
+// CHECK-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
+// CHECK-NEXT: [[TMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[_TMP1]] to ptr
+// CHECK-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr
+// CHECK-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr
+// CHECK-NEXT: [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr
+// CHECK-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr
+// CHECK-NEXT: [[J3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J3]] to ptr
+// CHECK-NEXT: [[SUM4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM4]] to ptr
+// CHECK-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
+// CHECK-NEXT: [[J5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J5]] to ptr
+// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
+// CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4
+// CHECK-NEXT: store i32 99, ptr [[DOTOMP_UB_ASCAST]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[CONV2]], ptr [[DOTOMP_UB_ASCAST]], align 4
+// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
+// CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4_ASCAST]], i32 0, i32 0, i32 0
+// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
+// CHECK-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP3]]
+// CHECK-NEXT: br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
+// CHECK: omp.arrayinit.body:
+// CHECK-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
+// CHECK-NEXT: store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// CHECK-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// CHECK-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]]
+// CHECK-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
+// CHECK: omp.arrayinit.done:
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB3]] to ptr), i32 [[TMP5]], i32 33, ptr [[DOTOMP_IS_LAST_ASCAST]], ptr [[DOTOMP_LB_ASCAST]], ptr [[DOTOMP_UB_ASCAST]], ptr [[DOTOMP_STRIDE_ASCAST]], i32 1, i32 1)
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV_ASCAST]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK: omp.inner.for.cond:
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7:![0-9]+]]
+// CHECK-NEXT: [[CONV6:%.*]] = sext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8, !llvm.access.group [[ACC_GRP7]]
+// CHECK-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV6]], [[TMP8]]
+// CHECK-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK: omp.inner.for.body:
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
+// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP9]], 10
+// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
+// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
+// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
+// CHECK-NEXT: [[DIV7:%.*]] = sdiv i32 [[TMP11]], 10
+// CHECK-NEXT: [[MUL8:%.*]] = mul nsw i32 [[DIV7]], 10
+// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP10]], [[MUL8]]
+// CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
+// CHECK-NEXT: store i32 [[ADD10]], ptr [[J3_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
+// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
+// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
+// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4_ASCAST]], i64 0, i64 [[IDXPROM]]
+// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[J3_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
+// CHECK-NEXT: [[IDXPROM11:%.*]] = sext i32 [[TMP14]] to i64
+// CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
+// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP7]]
+// CHECK-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP15]], [[TMP12]]
+// CHECK-NEXT: store i32 [[ADD13]], ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP7]]
+// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK: omp.body.continue:
+// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK: omp.inner.for.inc:
+// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
+// CHECK-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK-NEXT: store i32 [[ADD14]], ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]]
+// CHECK: omp.inner.for.end:
+// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK: omp.loop.exit:
+// CHECK-NEXT: [[TMP18:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4
+// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB3]] to ptr), i32 [[TMP19]])
+// CHECK-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
+// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
+// CHECK-NEXT: store ptr [[SUM4_ASCAST]], ptr [[TMP22]], align 8
+// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP21]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func)
+// CHECK-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP23]], 1
+// CHECK-NEXT: br i1 [[TMP24]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// CHECK: .omp.reduction.then:
+// CHECK-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
+// CHECK-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP25]]
+// CHECK-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE19:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// CHECK: omp.arraycpy.body:
+// CHECK-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM4_ASCAST]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST15:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT17:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
+// CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// CHECK-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
+// CHECK-NEXT: store i32 [[ADD16]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
+// CHECK-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT17]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], i32 1
+// CHECK-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
+// CHECK-NEXT: [[OMP_ARRAYCPY_DONE18:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT17]], [[TMP25]]
+// CHECK-NEXT: br i1 [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_DONE19]], label [[OMP_ARRAYCPY_BODY]]
+// CHECK: omp.arraycpy.done19:
+// CHECK-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP21]])
+// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
+// CHECK: .omp.reduction.done:
+// CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
+// CHECK-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
+// CHECK-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// CHECK: .omp.lastprivate.then:
+// CHECK-NEXT: store i32 10, ptr [[J3_ASCAST]], align 4
+// CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[J3_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[TMP30]], ptr [[J_ADDR_ASCAST]], align 4
+// CHECK-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// CHECK: .omp.lastprivate.done:
+// CHECK-NEXT: ret void
+// CHECK-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
+// CHECK-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2, addrspace(5)
+// CHECK-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2, addrspace(5)
+// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5)
+// CHECK-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
+// CHECK-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
+// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
+// CHECK-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
+// CHECK-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
+// CHECK-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr
+// CHECK-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr
+// CHECK-NEXT: [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
+// CHECK-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2
+// CHECK-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2
+// CHECK-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3_ASCAST]], align 2
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1_ASCAST]], align 2
+// CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2_ASCAST]], align 2
+// CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3_ASCAST]], align 2
+// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
+// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
+// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
+// CHECK-NEXT: [[TMP11:%.*]] = getelementptr [10 x [10 x i32]], ptr [[TMP9]], i64 1
+// CHECK-NEXT: br label [[DOTSHUFFLE_PRE_COND:%.*]]
+// CHECK: .shuffle.pre_cond:
+// CHECK-NEXT: [[TMP12:%.*]] = phi ptr [ [[TMP9]], [[ENTRY:%.*]] ], [ [[TMP23:%.*]], [[DOTSHUFFLE_THEN:%.*]] ]
+// CHECK-NEXT: [[TMP13:%.*]] = phi ptr [ [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], [[ENTRY]] ], [ [[TMP24:%.*]], [[DOTSHUFFLE_THEN]] ]
+// CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP11]] to i64
+// CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT: [[TMP16:%.*]] = sub i64 [[TMP14]], [[TMP15]]
+// CHECK-NEXT: [[TMP17:%.*]] = sdiv exact i64 [[TMP16]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
+// CHECK-NEXT: [[TMP18:%.*]] = icmp sgt i64 [[TMP17]], 7
+// CHECK-NEXT: br i1 [[TMP18]], label [[DOTSHUFFLE_THEN]], label [[DOTSHUFFLE_EXIT:%.*]]
+// CHECK: .shuffle.then:
+// CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP12]], align 4
+// CHECK-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
+// CHECK-NEXT: [[TMP22:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP19]], i16 [[TMP6]], i16 [[TMP21]])
+// CHECK-NEXT: store i64 [[TMP22]], ptr [[TMP13]], align 4
+// CHECK-NEXT: [[TMP23]] = getelementptr i64, ptr [[TMP12]], i64 1
+// CHECK-NEXT: [[TMP24]] = getelementptr i64, ptr [[TMP13]], i64 1
+// CHECK-NEXT: br label [[DOTSHUFFLE_PRE_COND]]
+// CHECK: .shuffle.exit:
+// CHECK-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], ptr [[TMP10]], align 8
+// CHECK-NEXT: [[TMP25:%.*]] = icmp eq i16 [[TMP7]], 0
+// CHECK-NEXT: [[TMP26:%.*]] = icmp eq i16 [[TMP7]], 1
+// CHECK-NEXT: [[TMP27:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
+// CHECK-NEXT: [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
+// CHECK-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP7]], 2
+// CHECK-NEXT: [[TMP30:%.*]] = and i16 [[TMP5]], 1
+// CHECK-NEXT: [[TMP31:%.*]] = icmp eq i16 [[TMP30]], 0
+// CHECK-NEXT: [[TMP32:%.*]] = and i1 [[TMP29]], [[TMP31]]
+// CHECK-NEXT: [[TMP33:%.*]] = icmp sgt i16 [[TMP6]], 0
+// CHECK-NEXT: [[TMP34:%.*]] = and i1 [[TMP32]], [[TMP33]]
+// CHECK-NEXT: [[TMP35:%.*]] = or i1 [[TMP25]], [[TMP28]]
+// CHECK-NEXT: [[TMP36:%.*]] = or i1 [[TMP35]], [[TMP34]]
+// CHECK-NEXT: br i1 [[TMP36]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK: then:
+// CHECK-NEXT: call void @"_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]]) #[[ATTR2]]
+// CHECK-NEXT: br label [[IFCONT:%.*]]
+// CHECK: else:
+// CHECK-NEXT: br label [[IFCONT]]
+// CHECK: ifcont:
+// CHECK-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP7]], 1
+// CHECK-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
+// CHECK-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
+// CHECK-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK: then4:
+// CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
+// CHECK-NEXT: [[TMP41:%.*]] = load ptr, ptr [[TMP40]], align 8
+// CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
+// CHECK-NEXT: [[TMP43:%.*]] = load ptr, ptr [[TMP42]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP43]], ptr align 4 [[TMP41]], i64 400, i1 false)
+// CHECK-NEXT: br label [[IFCONT6:%.*]]
+// CHECK: else5:
+// CHECK-NEXT: br label [[IFCONT6]]
+// CHECK: ifcont6:
+// CHECK-NEXT: ret void
+// CHECK-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
+// CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
+// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
+// CHECK-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
+// CHECK-NEXT: [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 63
+// CHECK-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 6
+// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 0, ptr [[DOTCNT_ADDR_ASCAST]], align 4
+// CHECK-NEXT: br label [[PRECOND:%.*]]
+// CHECK: precond:
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 100
+// CHECK-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
+// CHECK: body:
+// CHECK-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4:[0-9]+]] to ptr), i32 [[TMP2]])
+// CHECK-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK: then:
+// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
+// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
+// CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
+// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
+// CHECK-NEXT: store volatile i32 [[TMP13]], ptr addrspace(3) [[TMP12]], align 4
+// CHECK-NEXT: br label [[IFCONT:%.*]]
+// CHECK: else:
+// CHECK-NEXT: br label [[IFCONT]]
+// CHECK: ifcont:
+// CHECK-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
+// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
+// CHECK-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
+// CHECK-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
+// CHECK: then2:
+// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
+// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
+// CHECK-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
+// CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
+// CHECK-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
+// CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4
+// CHECK-NEXT: br label [[IFCONT4:%.*]]
+// CHECK: else3:
+// CHECK-NEXT: br label [[IFCONT4]]
+// CHECK: ifcont4:
+// CHECK-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR_ASCAST]], align 4
+// CHECK-NEXT: br label [[PRECOND]]
+// CHECK: exit:
+// CHECK-NEXT: ret void
+// CHECK-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func.3
+// CHECK-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2, addrspace(5)
+// CHECK-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2, addrspace(5)
+// CHECK-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5)
+// CHECK-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
+// CHECK-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
+// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
+// CHECK-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
+// CHECK-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
+// CHECK-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr
+// CHECK-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr
+// CHECK-NEXT: [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
+// CHECK-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2
+// CHECK-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2
+// CHECK-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3_ASCAST]], align 2
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1_ASCAST]], align 2
+// CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2_ASCAST]], align 2
+// CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3_ASCAST]], align 2
+// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
+// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
+// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
+// CHECK-NEXT: [[TMP11:%.*]] = getelementptr [10 x [10 x i32]], ptr [[TMP9]], i64 1
+// CHECK-NEXT: br label [[DOTSHUFFLE_PRE_COND:%.*]]
+// CHECK: .shuffle.pre_cond:
+// CHECK-NEXT: [[TMP12:%.*]] = phi ptr [ [[TMP9]], [[ENTRY:%.*]] ], [ [[TMP23:%.*]], [[DOTSHUFFLE_THEN:%.*]] ]
+// CHECK-NEXT: [[TMP13:%.*]] = phi ptr [ [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], [[ENTRY]] ], [ [[TMP24:%.*]], [[DOTSHUFFLE_THEN]] ]
+// CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP11]] to i64
+// CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK-NEXT: [[TMP16:%.*]] = sub i64 [[TMP14]], [[TMP15]]
+// CHECK-NEXT: [[TMP17:%.*]] = sdiv exact i64 [[TMP16]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
+// CHECK-NEXT: [[TMP18:%.*]] = icmp sgt i64 [[TMP17]], 7
+// CHECK-NEXT: br i1 [[TMP18]], label [[DOTSHUFFLE_THEN]], label [[DOTSHUFFLE_EXIT:%.*]]
+// CHECK: .shuffle.then:
+// CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP12]], align 4
+// CHECK-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
+// CHECK-NEXT: [[TMP22:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP19]], i16 [[TMP6]], i16 [[TMP21]])
+// CHECK-NEXT: store i64 [[TMP22]], ptr [[TMP13]], align 4
+// CHECK-NEXT: [[TMP23]] = getelementptr i64, ptr [[TMP12]], i64 1
+// CHECK-NEXT: [[TMP24]] = getelementptr i64, ptr [[TMP13]], i64 1
+// CHECK-NEXT: br label [[DOTSHUFFLE_PRE_COND]]
+// CHECK: .shuffle.exit:
+// CHECK-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], ptr [[TMP10]], align 8
+// CHECK-NEXT: [[TMP25:%.*]] = icmp eq i16 [[TMP7]], 0
+// CHECK-NEXT: [[TMP26:%.*]] = icmp eq i16 [[TMP7]], 1
+// CHECK-NEXT: [[TMP27:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
+// CHECK-NEXT: [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
+// CHECK-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP7]], 2
+// CHECK-NEXT: [[TMP30:%.*]] = and i16 [[TMP5]], 1
+// CHECK-NEXT: [[TMP31:%.*]] = icmp eq i16 [[TMP30]], 0
+// CHECK-NEXT: [[TMP32:%.*]] = and i1 [[TMP29]], [[TMP31]]
+// CHECK-NEXT: [[TMP33:%.*]] = icmp sgt i16 [[TMP6]], 0
+// CHECK-NEXT: [[TMP34:%.*]] = and i1 [[TMP32]], [[TMP33]]
+// CHECK-NEXT: [[TMP35:%.*]] = or i1 [[TMP25]], [[TMP28]]
+// CHECK-NEXT: [[TMP36:%.*]] = or i1 [[TMP35]], [[TMP34]]
+// CHECK-NEXT: br i1 [[TMP36]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK: then:
+// CHECK-NEXT: call void @"_omp$reduction$reduction_func.2"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]]) #[[ATTR2]]
+// CHECK-NEXT: br label [[IFCONT:%.*]]
+// CHECK: else:
+// CHECK-NEXT: br label [[IFCONT]]
+// CHECK: ifcont:
+// CHECK-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP7]], 1
+// CHECK-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
+// CHECK-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
+// CHECK-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// CHECK: then4:
+// CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
+// CHECK-NEXT: [[TMP41:%.*]] = load ptr, ptr [[TMP40]], align 8
+// CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
+// CHECK-NEXT: [[TMP43:%.*]] = load ptr, ptr [[TMP42]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP43]], ptr align 4 [[TMP41]], i64 400, i1 false)
+// CHECK-NEXT: br label [[IFCONT6:%.*]]
+// CHECK: else5:
+// CHECK-NEXT: br label [[IFCONT6]]
+// CHECK: ifcont6:
+// CHECK-NEXT: ret void
+// CHECK-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func.4
+// CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
+// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
+// CHECK-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
+// CHECK-NEXT: [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 63
+// CHECK-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 6
+// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 0, ptr [[DOTCNT_ADDR_ASCAST]], align 4
+// CHECK-NEXT: br label [[PRECOND:%.*]]
+// CHECK: precond:
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 100
+// CHECK-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
+// CHECK: body:
+// CHECK-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
+// CHECK-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// CHECK-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// CHECK: then:
+// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
+// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
+// CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
+// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
+// CHECK-NEXT: store volatile i32 [[TMP13]], ptr addrspace(3) [[TMP12]], align 4
+// CHECK-NEXT: br label [[IFCONT:%.*]]
+// CHECK: else:
+// CHECK-NEXT: br label [[IFCONT]]
+// CHECK: ifcont:
+// CHECK-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
+// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
+// CHECK-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
+// CHECK-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
+// CHECK: then2:
+// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
+// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
+// CHECK-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
+// CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
+// CHECK-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
+// CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4
+// CHECK-NEXT: br label [[IFCONT4:%.*]]
+// CHECK: else3:
+// CHECK-NEXT: br label [[IFCONT4]]
+// CHECK: ifcont4:
+// CHECK-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR_ASCAST]], align 4
+// CHECK-NEXT: br label [[PRECOND]]
+// CHECK: exit:
+// CHECK-NEXT: ret void
+// CHECK-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
+// CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
+// CHECK-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
+// CHECK-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
+// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK-NEXT: [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
+// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP5]]
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[TMP8]], ptr align 4 [[TMP7]], i64 400, i1 false)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
+// CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
+// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
+// CHECK-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
+// CHECK-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
+// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
+// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
+// CHECK-NEXT: [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP3]], i32 0, i32 0
+// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP4]]
+// CHECK-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
+// CHECK-NEXT: call void @"_omp$reduction$reduction_func.2"(ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr [[TMP7]]) #[[ATTR2]]
+// CHECK-NEXT: ret void
+// CHECK-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
+// CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
+// CHECK-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
+// CHECK-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
+// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK-NEXT: [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
+// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP5]]
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP7]], ptr align 128 [[TMP8]], i64 400, i1 false)
+// CHECK-NEXT: ret void
+// CHECK-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
+// CHECK-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
+// CHECK-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
+// CHECK-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
+// CHECK-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
+// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
+// CHECK-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
+// CHECK-NEXT: [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP3]], i32 0, i32 0
+// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP4]]
+// CHECK-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
+// CHECK-NEXT: call void @"_omp$reduction$reduction_func.2"(ptr [[TMP7]], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]]) #[[ATTR2]]
+// CHECK-NEXT: ret void
+// IR-GPU-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22
+// IR-GPU-SAME: (i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
+// IR-GPU-NEXT: entry:
+// IR-GPU-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
+// IR-GPU-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// IR-GPU-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
+// IR-GPU-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
+// IR-GPU-NEXT: [[J_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_CASTED]] to ptr
+// IR-GPU-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr
+// IR-GPU-NEXT: [[DOTTHREADID_TEMP__ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTTHREADID_TEMP_]] to ptr
+// IR-GPU-NEXT: store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), i8 2, i1 false)
+// IR-GPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
+// IR-GPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+// IR-GPU: user_code.entry:
+// IR-GPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
+// IR-GPU-NEXT: [[TMP3:%.*]] = load i32, ptr [[J_ADDR_ASCAST]], align 4
+// IR-GPU-NEXT: store i32 [[TMP3]], ptr [[J_CASTED_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP4:%.*]] = load i64, ptr [[J_CASTED_ASCAST]], align 8
+// IR-GPU-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
+// IR-GPU-NEXT: store i32 [[TMP2]], ptr [[DOTTHREADID_TEMP__ASCAST]], align 4
+// IR-GPU-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined(ptr [[DOTTHREADID_TEMP__ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]], i64 [[TMP4]], ptr [[TMP0]]) #[[ATTR2:[0-9]+]]
+// IR-GPU-NEXT: call void @__kmpc_target_deinit(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i8 2)
+// IR-GPU-NEXT: ret void
+// IR-GPU: worker.exit:
+// IR-GPU-NEXT: ret void
+//
+//
+// IR-GPU-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined
+// IR-GPU-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
+// IR-GPU-NEXT: entry:
+// IR-GPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// IR-GPU-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
+// IR-GPU-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// IR-GPU-NEXT: [[SUM1:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[TMP:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[_TMP2:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[J3:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[J4:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8, addrspace(5)
+// IR-GPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
+// IR-GPU-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
+// IR-GPU-NEXT: [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
+// IR-GPU-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
+// IR-GPU-NEXT: [[SUM1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM1]] to ptr
+// IR-GPU-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr
+// IR-GPU-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
+// IR-GPU-NEXT: [[TMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[_TMP2]] to ptr
+// IR-GPU-NEXT: [[DOTOMP_COMB_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_COMB_LB]] to ptr
+// IR-GPU-NEXT: [[DOTOMP_COMB_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_COMB_UB]] to ptr
+// IR-GPU-NEXT: [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr
+// IR-GPU-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr
+// IR-GPU-NEXT: [[J3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J3]] to ptr
+// IR-GPU-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
+// IR-GPU-NEXT: [[J4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J4]] to ptr
+// IR-GPU-NEXT: [[J_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_CASTED]] to ptr
+// IR-GPU-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
+// IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
+// IR-GPU-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM1_ASCAST]], i32 0, i32 0, i32 0
+// IR-GPU-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
+// IR-GPU-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]]
+// IR-GPU-NEXT: br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
+// IR-GPU: omp.arrayinit.body:
+// IR-GPU-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
+// IR-GPU-NEXT: store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-GPU-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// IR-GPU-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP1]]
+// IR-GPU-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
+// IR-GPU: omp.arrayinit.done:
+// IR-GPU-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
+// IR-GPU-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
+// IR-GPU-NEXT: store i32 1, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
+// IR-GPU-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
+// IR-GPU-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
+// IR-GPU-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// IR-GPU-NEXT: call void @__kmpc_distribute_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), i32 [[TMP3]], i32 91, ptr [[DOTOMP_IS_LAST_ASCAST]], ptr [[DOTOMP_COMB_LB_ASCAST]], ptr [[DOTOMP_COMB_UB_ASCAST]], ptr [[DOTOMP_STRIDE_ASCAST]], i32 1, i32 [[NVPTX_NUM_THREADS]])
+// IR-GPU-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
+// IR-GPU-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
+// IR-GPU-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// IR-GPU: cond.true:
+// IR-GPU-NEXT: br label [[COND_END:%.*]]
+// IR-GPU: cond.false:
+// IR-GPU-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
+// IR-GPU-NEXT: br label [[COND_END]]
+// IR-GPU: cond.end:
+// IR-GPU-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// IR-GPU-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
+// IR-GPU-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV_ASCAST]], align 4
+// IR-GPU-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// IR-GPU: omp.inner.for.cond:
+// IR-GPU-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4
+// IR-GPU-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP7]], 100
+// IR-GPU-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// IR-GPU: omp.inner.for.body:
+// IR-GPU-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// IR-GPU-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+// IR-GPU-NEXT: [[TMP12:%.*]] = load i32, ptr [[J3_ASCAST]], align 4
+// IR-GPU-NEXT: store i32 [[TMP12]], ptr [[J_CASTED_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP13:%.*]] = load i64, ptr [[J_CASTED_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0
+// IR-GPU-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP9]] to ptr
+// IR-GPU-NEXT: store ptr [[TMP15]], ptr [[TMP14]], align 8
+// IR-GPU-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 1
+// IR-GPU-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP11]] to ptr
+// IR-GPU-NEXT: store ptr [[TMP17]], ptr [[TMP16]], align 8
+// IR-GPU-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 2
+// IR-GPU-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP13]] to ptr
+// IR-GPU-NEXT: store ptr [[TMP19]], ptr [[TMP18]], align 8
+// IR-GPU-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 3
+// IR-GPU-NEXT: store ptr [[SUM1_ASCAST]], ptr [[TMP20]], align 8
+// IR-GPU-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
+// IR-GPU-NEXT: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP22]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 4)
+// IR-GPU-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// IR-GPU: omp.inner.for.inc:
+// IR-GPU-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
+// IR-GPU-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], [[TMP24]]
+// IR-GPU-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
+// IR-GPU-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP25]], [[TMP26]]
+// IR-GPU-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
+// IR-GPU-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
+// IR-GPU-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
+// IR-GPU-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[TMP29]], 99
+// IR-GPU-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
+// IR-GPU: cond.true9:
+// IR-GPU-NEXT: br label [[COND_END11:%.*]]
+// IR-GPU: cond.false10:
+// IR-GPU-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
+// IR-GPU-NEXT: br label [[COND_END11]]
+// IR-GPU: cond.end11:
+// IR-GPU-NEXT: [[COND12:%.*]] = phi i32 [ 99, [[COND_TRUE9]] ], [ [[TMP30]], [[COND_FALSE10]] ]
+// IR-GPU-NEXT: store i32 [[COND12]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4
+// IR-GPU-NEXT: store i32 [[TMP31]], ptr [[DOTOMP_IV_ASCAST]], align 4
+// IR-GPU-NEXT: br label [[OMP_INNER_FOR_COND]]
+// IR-GPU: omp.inner.for.end:
+// IR-GPU-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// IR-GPU: omp.loop.exit:
+// IR-GPU-NEXT: [[TMP32:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4
+// IR-GPU-NEXT: call void @__kmpc_for_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB3:[0-9]+]] to ptr), i32 [[TMP33]])
+// IR-GPU-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0
+// IR-GPU-NEXT: br i1 [[TMP35]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// IR-GPU: .omp.lastprivate.then:
+// IR-GPU-NEXT: store i32 10, ptr [[J3_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP36:%.*]] = load i32, ptr [[J3_ASCAST]], align 4
+// IR-GPU-NEXT: store i32 [[TMP36]], ptr [[J_ADDR_ASCAST]], align 4
+// IR-GPU-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// IR-GPU: .omp.lastprivate.done:
+// IR-GPU-NEXT: [[TMP37:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP37]], align 4
+// IR-GPU-NEXT: [[TMP39:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
+// IR-GPU-NEXT: store ptr [[SUM1_ASCAST]], ptr [[TMP39]], align 8
+// IR-GPU-NEXT: [[TMP40:%.*]] = load ptr, ptr addrspace(1) @"_openmp_teams_reductions_buffer_$_$ptr", align 8
+// IR-GPU-NEXT: [[TMP41:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP38]], ptr [[TMP40]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func.1, ptr @_omp_reduction_inter_warp_copy_func.2, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
+// IR-GPU-NEXT: [[TMP42:%.*]] = icmp eq i32 [[TMP41]], 1
+// IR-GPU-NEXT: br i1 [[TMP42]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// IR-GPU: .omp.reduction.then:
+// IR-GPU-NEXT: [[TMP43:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
+// IR-GPU-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP43]]
+// IR-GPU-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE17:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// IR-GPU: omp.arraycpy.body:
+// IR-GPU-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1_ASCAST]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-GPU-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST13:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-GPU-NEXT: [[TMP44:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4
+// IR-GPU-NEXT: [[TMP45:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// IR-GPU-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP44]], [[TMP45]]
+// IR-GPU-NEXT: store i32 [[ADD14]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4
+// IR-GPU-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], i32 1
+// IR-GPU-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
+// IR-GPU-NEXT: [[OMP_ARRAYCPY_DONE16:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP43]]
+// IR-GPU-NEXT: br i1 [[OMP_ARRAYCPY_DONE16]], label [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_BODY]]
+// IR-GPU: omp.arraycpy.done17:
+// IR-GPU-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP38]])
+// IR-GPU-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
+// IR-GPU: .omp.reduction.done:
+// IR-GPU-NEXT: ret void
+//
+//
+// IR-GPU-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp_outlined
+// IR-GPU-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
+// IR-GPU-NEXT: entry:
+// IR-GPU-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8, addrspace(5)
+// IR-GPU-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
+// IR-GPU-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[TMP:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[_TMP1:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[J3:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[SUM4:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
+// IR-GPU-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[J5:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
+// IR-GPU-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
+// IR-GPU-NEXT: [[DOTPREVIOUS_LB__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTPREVIOUS_LB__ADDR]] to ptr
+// IR-GPU-NEXT: [[DOTPREVIOUS_UB__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTPREVIOUS_UB__ADDR]] to ptr
+// IR-GPU-NEXT: [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
+// IR-GPU-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
+// IR-GPU-NEXT: [[DOTOMP_IV_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IV]] to ptr
+// IR-GPU-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr
+// IR-GPU-NEXT: [[TMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[_TMP1]] to ptr
+// IR-GPU-NEXT: [[DOTOMP_LB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_LB]] to ptr
+// IR-GPU-NEXT: [[DOTOMP_UB_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_UB]] to ptr
+// IR-GPU-NEXT: [[DOTOMP_STRIDE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_STRIDE]] to ptr
+// IR-GPU-NEXT: [[DOTOMP_IS_LAST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_IS_LAST]] to ptr
+// IR-GPU-NEXT: [[J3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J3]] to ptr
+// IR-GPU-NEXT: [[SUM4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM4]] to ptr
+// IR-GPU-NEXT: [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
+// IR-GPU-NEXT: [[J5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J5]] to ptr
+// IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
+// IR-GPU-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: store i32 0, ptr [[DOTOMP_LB_ASCAST]], align 4
+// IR-GPU-NEXT: store i32 99, ptr [[DOTOMP_UB_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// IR-GPU-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
+// IR-GPU-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB_ASCAST]], align 4
+// IR-GPU-NEXT: store i32 [[CONV2]], ptr [[DOTOMP_UB_ASCAST]], align 4
+// IR-GPU-NEXT: store i32 1, ptr [[DOTOMP_STRIDE_ASCAST]], align 4
+// IR-GPU-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
+// IR-GPU-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4_ASCAST]], i32 0, i32 0, i32 0
+// IR-GPU-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
+// IR-GPU-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP3]]
+// IR-GPU-NEXT: br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
+// IR-GPU: omp.arrayinit.body:
+// IR-GPU-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
+// IR-GPU-NEXT: store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-GPU-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// IR-GPU-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]]
+// IR-GPU-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
+// IR-GPU: omp.arrayinit.done:
+// IR-GPU-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+// IR-GPU-NEXT: call void @__kmpc_for_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB3]] to ptr), i32 [[TMP5]], i32 33, ptr [[DOTOMP_IS_LAST_ASCAST]], ptr [[DOTOMP_LB_ASCAST]], ptr [[DOTOMP_UB_ASCAST]], ptr [[DOTOMP_STRIDE_ASCAST]], i32 1, i32 1)
+// IR-GPU-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4
+// IR-GPU-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV_ASCAST]], align 4
+// IR-GPU-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// IR-GPU: omp.inner.for.cond:
+// IR-GPU-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7:![0-9]+]]
+// IR-GPU-NEXT: [[CONV6:%.*]] = sext i32 [[TMP7]] to i64
+// IR-GPU-NEXT: [[TMP8:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR_ASCAST]], align 8, !llvm.access.group [[ACC_GRP7]]
+// IR-GPU-NEXT: [[CMP:%.*]] = icmp ule i64 [[CONV6]], [[TMP8]]
+// IR-GPU-NEXT: br i1 [[CMP]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// IR-GPU: omp.inner.for.body:
+// IR-GPU-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
+// IR-GPU-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP9]], 10
+// IR-GPU-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// IR-GPU-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// IR-GPU-NEXT: store i32 [[ADD]], ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
+// IR-GPU-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
+// IR-GPU-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
+// IR-GPU-NEXT: [[DIV7:%.*]] = sdiv i32 [[TMP11]], 10
+// IR-GPU-NEXT: [[MUL8:%.*]] = mul nsw i32 [[DIV7]], 10
+// IR-GPU-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP10]], [[MUL8]]
+// IR-GPU-NEXT: [[MUL9:%.*]] = mul nsw i32 [[SUB]], 1
+// IR-GPU-NEXT: [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
+// IR-GPU-NEXT: store i32 [[ADD10]], ptr [[J3_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
+// IR-GPU-NEXT: [[TMP12:%.*]] = load i32, ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
+// IR-GPU-NEXT: [[TMP13:%.*]] = load i32, ptr [[I_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
+// IR-GPU-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
+// IR-GPU-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4_ASCAST]], i64 0, i64 [[IDXPROM]]
+// IR-GPU-NEXT: [[TMP14:%.*]] = load i32, ptr [[J3_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
+// IR-GPU-NEXT: [[IDXPROM11:%.*]] = sext i32 [[TMP14]] to i64
+// IR-GPU-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
+// IR-GPU-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP7]]
+// IR-GPU-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP15]], [[TMP12]]
+// IR-GPU-NEXT: store i32 [[ADD13]], ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP7]]
+// IR-GPU-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// IR-GPU: omp.body.continue:
+// IR-GPU-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// IR-GPU: omp.inner.for.inc:
+// IR-GPU-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
+// IR-GPU-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
+// IR-GPU-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// IR-GPU-NEXT: store i32 [[ADD14]], ptr [[DOTOMP_IV_ASCAST]], align 4, !llvm.access.group [[ACC_GRP7]]
+// IR-GPU-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]]
+// IR-GPU: omp.inner.for.end:
+// IR-GPU-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// IR-GPU: omp.loop.exit:
+// IR-GPU-NEXT: [[TMP18:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4
+// IR-GPU-NEXT: call void @__kmpc_for_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB3]] to ptr), i32 [[TMP19]])
+// IR-GPU-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
+// IR-GPU-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
+// IR-GPU-NEXT: store ptr [[SUM4_ASCAST]], ptr [[TMP22]], align 8
+// IR-GPU-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP21]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func)
+// IR-GPU-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP23]], 1
+// IR-GPU-NEXT: br i1 [[TMP24]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
+// IR-GPU: .omp.reduction.then:
+// IR-GPU-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
+// IR-GPU-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP25]]
+// IR-GPU-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE19:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// IR-GPU: omp.arraycpy.body:
+// IR-GPU-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM4_ASCAST]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-GPU-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST15:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT17:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-GPU-NEXT: [[TMP26:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
+// IR-GPU-NEXT: [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// IR-GPU-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
+// IR-GPU-NEXT: store i32 [[ADD16]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
+// IR-GPU-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT17]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], i32 1
+// IR-GPU-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
+// IR-GPU-NEXT: [[OMP_ARRAYCPY_DONE18:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT17]], [[TMP25]]
+// IR-GPU-NEXT: br i1 [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_DONE19]], label [[OMP_ARRAYCPY_BODY]]
+// IR-GPU: omp.arraycpy.done19:
+// IR-GPU-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP21]])
+// IR-GPU-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
+// IR-GPU: .omp.reduction.done:
+// IR-GPU-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
+// IR-GPU-NEXT: br i1 [[TMP29]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// IR-GPU: .omp.lastprivate.then:
+// IR-GPU-NEXT: store i32 10, ptr [[J3_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP30:%.*]] = load i32, ptr [[J3_ASCAST]], align 4
+// IR-GPU-NEXT: store i32 [[TMP30]], ptr [[J_ADDR_ASCAST]], align 4
+// IR-GPU-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// IR-GPU: .omp.lastprivate.done:
+// IR-GPU-NEXT: ret void
+//
+//
+// IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
+// IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3:[0-9]+]] {
+// IR-GPU-NEXT: entry:
+// IR-GPU-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2, addrspace(5)
+// IR-GPU-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2, addrspace(5)
+// IR-GPU-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
+// IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
+// IR-GPU-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
+// IR-GPU-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr
+// IR-GPU-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr
+// IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
+// IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
+// IR-GPU-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2
+// IR-GPU-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2
+// IR-GPU-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3_ASCAST]], align 2
+// IR-GPU-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1_ASCAST]], align 2
+// IR-GPU-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2_ASCAST]], align 2
+// IR-GPU-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3_ASCAST]], align 2
+// IR-GPU-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
+// IR-GPU-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
+// IR-GPU-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
+// IR-GPU-NEXT: [[TMP11:%.*]] = getelementptr [10 x [10 x i32]], ptr [[TMP9]], i64 1
+// IR-GPU-NEXT: br label [[DOTSHUFFLE_PRE_COND:%.*]]
+// IR-GPU: .shuffle.pre_cond:
+// IR-GPU-NEXT: [[TMP12:%.*]] = phi ptr [ [[TMP9]], [[ENTRY:%.*]] ], [ [[TMP23:%.*]], [[DOTSHUFFLE_THEN:%.*]] ]
+// IR-GPU-NEXT: [[TMP13:%.*]] = phi ptr [ [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], [[ENTRY]] ], [ [[TMP24:%.*]], [[DOTSHUFFLE_THEN]] ]
+// IR-GPU-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP11]] to i64
+// IR-GPU-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// IR-GPU-NEXT: [[TMP16:%.*]] = sub i64 [[TMP14]], [[TMP15]]
+// IR-GPU-NEXT: [[TMP17:%.*]] = sdiv exact i64 [[TMP16]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
+// IR-GPU-NEXT: [[TMP18:%.*]] = icmp sgt i64 [[TMP17]], 7
+// IR-GPU-NEXT: br i1 [[TMP18]], label [[DOTSHUFFLE_THEN]], label [[DOTSHUFFLE_EXIT:%.*]]
+// IR-GPU: .shuffle.then:
+// IR-GPU-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP12]], align 4
+// IR-GPU-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_warp_size()
+// IR-GPU-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
+// IR-GPU-NEXT: [[TMP22:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP19]], i16 [[TMP6]], i16 [[TMP21]])
+// IR-GPU-NEXT: store i64 [[TMP22]], ptr [[TMP13]], align 4
+// IR-GPU-NEXT: [[TMP23]] = getelementptr i64, ptr [[TMP12]], i64 1
+// IR-GPU-NEXT: [[TMP24]] = getelementptr i64, ptr [[TMP13]], i64 1
+// IR-GPU-NEXT: br label [[DOTSHUFFLE_PRE_COND]]
+// IR-GPU: .shuffle.exit:
+// IR-GPU-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], ptr [[TMP10]], align 8
+// IR-GPU-NEXT: [[TMP25:%.*]] = icmp eq i16 [[TMP7]], 0
+// IR-GPU-NEXT: [[TMP26:%.*]] = icmp eq i16 [[TMP7]], 1
+// IR-GPU-NEXT: [[TMP27:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
+// IR-GPU-NEXT: [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
+// IR-GPU-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP7]], 2
+// IR-GPU-NEXT: [[TMP30:%.*]] = and i16 [[TMP5]], 1
+// IR-GPU-NEXT: [[TMP31:%.*]] = icmp eq i16 [[TMP30]], 0
+// IR-GPU-NEXT: [[TMP32:%.*]] = and i1 [[TMP29]], [[TMP31]]
+// IR-GPU-NEXT: [[TMP33:%.*]] = icmp sgt i16 [[TMP6]], 0
+// IR-GPU-NEXT: [[TMP34:%.*]] = and i1 [[TMP32]], [[TMP33]]
+// IR-GPU-NEXT: [[TMP35:%.*]] = or i1 [[TMP25]], [[TMP28]]
+// IR-GPU-NEXT: [[TMP36:%.*]] = or i1 [[TMP35]], [[TMP34]]
+// IR-GPU-NEXT: br i1 [[TMP36]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// IR-GPU: then:
+// IR-GPU-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]]) #[[ATTR2]]
+// IR-GPU-NEXT: br label [[IFCONT:%.*]]
+// IR-GPU: else:
+// IR-GPU-NEXT: br label [[IFCONT]]
+// IR-GPU: ifcont:
+// IR-GPU-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP7]], 1
+// IR-GPU-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
+// IR-GPU-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
+// IR-GPU-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// IR-GPU: then4:
+// IR-GPU-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
+// IR-GPU-NEXT: [[TMP41:%.*]] = load ptr, ptr [[TMP40]], align 8
+// IR-GPU-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
+// IR-GPU-NEXT: [[TMP43:%.*]] = load ptr, ptr [[TMP42]], align 8
+// IR-GPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP43]], ptr align 4 [[TMP41]], i64 400, i1 false)
+// IR-GPU-NEXT: br label [[IFCONT6:%.*]]
+// IR-GPU: else5:
+// IR-GPU-NEXT: br label [[IFCONT6]]
+// IR-GPU: ifcont6:
+// IR-GPU-NEXT: ret void
+//
+//
+// IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
+// IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
+// IR-GPU-NEXT: entry:
+// IR-GPU-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
+// IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
+// IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
+// IR-GPU-NEXT: [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
+// IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
+// IR-GPU-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// IR-GPU-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// IR-GPU-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 63
+// IR-GPU-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// IR-GPU-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 6
+// IR-GPU-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
+// IR-GPU-NEXT: store i32 0, ptr [[DOTCNT_ADDR_ASCAST]], align 4
+// IR-GPU-NEXT: br label [[PRECOND:%.*]]
+// IR-GPU: precond:
+// IR-GPU-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 100
+// IR-GPU-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
+// IR-GPU: body:
+// IR-GPU-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4:[0-9]+]] to ptr), i32 [[TMP2]])
+// IR-GPU-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// IR-GPU-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// IR-GPU: then:
+// IR-GPU-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
+// IR-GPU-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
+// IR-GPU-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
+// IR-GPU-NEXT: [[TMP12:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// IR-GPU-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
+// IR-GPU-NEXT: store volatile i32 [[TMP13]], ptr addrspace(3) [[TMP12]], align 4
+// IR-GPU-NEXT: br label [[IFCONT:%.*]]
+// IR-GPU: else:
+// IR-GPU-NEXT: br label [[IFCONT]]
+// IR-GPU: ifcont:
+// IR-GPU-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
+// IR-GPU-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
+// IR-GPU-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
+// IR-GPU-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
+// IR-GPU: then2:
+// IR-GPU-NEXT: [[TMP15:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
+// IR-GPU-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
+// IR-GPU-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
+// IR-GPU-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
+// IR-GPU-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
+// IR-GPU-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4
+// IR-GPU-NEXT: br label [[IFCONT4:%.*]]
+// IR-GPU: else3:
+// IR-GPU-NEXT: br label [[IFCONT4]]
+// IR-GPU: ifcont4:
+// IR-GPU-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
+// IR-GPU-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR_ASCAST]], align 4
+// IR-GPU-NEXT: br label [[PRECOND]]
+// IR-GPU: exit:
+// IR-GPU-NEXT: ret void
+//
+//
+// IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func.1
+// IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR3]] {
+// IR-GPU-NEXT: entry:
+// IR-GPU-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2, addrspace(5)
+// IR-GPU-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2, addrspace(5)
+// IR-GPU-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
+// IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
+// IR-GPU-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
+// IR-GPU-NEXT: [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr
+// IR-GPU-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr
+// IR-GPU-NEXT: [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
+// IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
+// IR-GPU-NEXT: store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2
+// IR-GPU-NEXT: store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2
+// IR-GPU-NEXT: store i16 [[TMP3]], ptr [[DOTADDR3_ASCAST]], align 2
+// IR-GPU-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP5:%.*]] = load i16, ptr [[DOTADDR1_ASCAST]], align 2
+// IR-GPU-NEXT: [[TMP6:%.*]] = load i16, ptr [[DOTADDR2_ASCAST]], align 2
+// IR-GPU-NEXT: [[TMP7:%.*]] = load i16, ptr [[DOTADDR3_ASCAST]], align 2
+// IR-GPU-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
+// IR-GPU-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
+// IR-GPU-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
+// IR-GPU-NEXT: [[TMP11:%.*]] = getelementptr [10 x [10 x i32]], ptr [[TMP9]], i64 1
+// IR-GPU-NEXT: br label [[DOTSHUFFLE_PRE_COND:%.*]]
+// IR-GPU: .shuffle.pre_cond:
+// IR-GPU-NEXT: [[TMP12:%.*]] = phi ptr [ [[TMP9]], [[ENTRY:%.*]] ], [ [[TMP23:%.*]], [[DOTSHUFFLE_THEN:%.*]] ]
+// IR-GPU-NEXT: [[TMP13:%.*]] = phi ptr [ [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], [[ENTRY]] ], [ [[TMP24:%.*]], [[DOTSHUFFLE_THEN]] ]
+// IR-GPU-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP11]] to i64
+// IR-GPU-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// IR-GPU-NEXT: [[TMP16:%.*]] = sub i64 [[TMP14]], [[TMP15]]
+// IR-GPU-NEXT: [[TMP17:%.*]] = sdiv exact i64 [[TMP16]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
+// IR-GPU-NEXT: [[TMP18:%.*]] = icmp sgt i64 [[TMP17]], 7
+// IR-GPU-NEXT: br i1 [[TMP18]], label [[DOTSHUFFLE_THEN]], label [[DOTSHUFFLE_EXIT:%.*]]
+// IR-GPU: .shuffle.then:
+// IR-GPU-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP12]], align 4
+// IR-GPU-NEXT: [[TMP20:%.*]] = call i32 @__kmpc_get_warp_size()
+// IR-GPU-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
+// IR-GPU-NEXT: [[TMP22:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP19]], i16 [[TMP6]], i16 [[TMP21]])
+// IR-GPU-NEXT: store i64 [[TMP22]], ptr [[TMP13]], align 4
+// IR-GPU-NEXT: [[TMP23]] = getelementptr i64, ptr [[TMP12]], i64 1
+// IR-GPU-NEXT: [[TMP24]] = getelementptr i64, ptr [[TMP13]], i64 1
+// IR-GPU-NEXT: br label [[DOTSHUFFLE_PRE_COND]]
+// IR-GPU: .shuffle.exit:
+// IR-GPU-NEXT: store ptr [[DOTOMP_REDUCTION_ELEMENT_ASCAST]], ptr [[TMP10]], align 8
+// IR-GPU-NEXT: [[TMP25:%.*]] = icmp eq i16 [[TMP7]], 0
+// IR-GPU-NEXT: [[TMP26:%.*]] = icmp eq i16 [[TMP7]], 1
+// IR-GPU-NEXT: [[TMP27:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
+// IR-GPU-NEXT: [[TMP28:%.*]] = and i1 [[TMP26]], [[TMP27]]
+// IR-GPU-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP7]], 2
+// IR-GPU-NEXT: [[TMP30:%.*]] = and i16 [[TMP5]], 1
+// IR-GPU-NEXT: [[TMP31:%.*]] = icmp eq i16 [[TMP30]], 0
+// IR-GPU-NEXT: [[TMP32:%.*]] = and i1 [[TMP29]], [[TMP31]]
+// IR-GPU-NEXT: [[TMP33:%.*]] = icmp sgt i16 [[TMP6]], 0
+// IR-GPU-NEXT: [[TMP34:%.*]] = and i1 [[TMP32]], [[TMP33]]
+// IR-GPU-NEXT: [[TMP35:%.*]] = or i1 [[TMP25]], [[TMP28]]
+// IR-GPU-NEXT: [[TMP36:%.*]] = or i1 [[TMP35]], [[TMP34]]
+// IR-GPU-NEXT: br i1 [[TMP36]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// IR-GPU: then:
+// IR-GPU-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP4]], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]]) #[[ATTR2]]
+// IR-GPU-NEXT: br label [[IFCONT:%.*]]
+// IR-GPU: else:
+// IR-GPU-NEXT: br label [[IFCONT]]
+// IR-GPU: ifcont:
+// IR-GPU-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP7]], 1
+// IR-GPU-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP5]], [[TMP6]]
+// IR-GPU-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
+// IR-GPU-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
+// IR-GPU: then4:
+// IR-GPU-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST]], i64 0, i64 0
+// IR-GPU-NEXT: [[TMP41:%.*]] = load ptr, ptr [[TMP40]], align 8
+// IR-GPU-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
+// IR-GPU-NEXT: [[TMP43:%.*]] = load ptr, ptr [[TMP42]], align 8
+// IR-GPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP43]], ptr align 4 [[TMP41]], i64 400, i1 false)
+// IR-GPU-NEXT: br label [[IFCONT6:%.*]]
+// IR-GPU: else5:
+// IR-GPU-NEXT: br label [[IFCONT6]]
+// IR-GPU: ifcont6:
+// IR-GPU-NEXT: ret void
+//
+//
+// IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func.2
+// IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR3]] {
+// IR-GPU-NEXT: entry:
+// IR-GPU-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
+// IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
+// IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
+// IR-GPU-NEXT: [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
+// IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
+// IR-GPU-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// IR-GPU-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// IR-GPU-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 63
+// IR-GPU-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// IR-GPU-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 6
+// IR-GPU-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
+// IR-GPU-NEXT: store i32 0, ptr [[DOTCNT_ADDR_ASCAST]], align 4
+// IR-GPU-NEXT: br label [[PRECOND:%.*]]
+// IR-GPU: precond:
+// IR-GPU-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 100
+// IR-GPU-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
+// IR-GPU: body:
+// IR-GPU-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
+// IR-GPU-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
+// IR-GPU-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
+// IR-GPU: then:
+// IR-GPU-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
+// IR-GPU-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
+// IR-GPU-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
+// IR-GPU-NEXT: [[TMP12:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
+// IR-GPU-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
+// IR-GPU-NEXT: store volatile i32 [[TMP13]], ptr addrspace(3) [[TMP12]], align 4
+// IR-GPU-NEXT: br label [[IFCONT:%.*]]
+// IR-GPU: else:
+// IR-GPU-NEXT: br label [[IFCONT]]
+// IR-GPU: ifcont:
+// IR-GPU-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]])
+// IR-GPU-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
+// IR-GPU-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
+// IR-GPU-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
+// IR-GPU: then2:
+// IR-GPU-NEXT: [[TMP15:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
+// IR-GPU-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
+// IR-GPU-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
+// IR-GPU-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
+// IR-GPU-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
+// IR-GPU-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4
+// IR-GPU-NEXT: br label [[IFCONT4:%.*]]
+// IR-GPU: else3:
+// IR-GPU-NEXT: br label [[IFCONT4]]
+// IR-GPU: ifcont4:
+// IR-GPU-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
+// IR-GPU-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR_ASCAST]], align 4
+// IR-GPU-NEXT: br label [[PRECOND]]
+// IR-GPU: exit:
+// IR-GPU-NEXT: ret void
+//
+//
+// IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
+// IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
+// IR-GPU-NEXT: entry:
+// IR-GPU-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
+// IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
+// IR-GPU-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
+// IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
+// IR-GPU-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
+// IR-GPU-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// IR-GPU-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// IR-GPU-NEXT: [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
+// IR-GPU-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP5]]
+// IR-GPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[TMP8]], ptr align 4 [[TMP7]], i64 400, i1 false)
+// IR-GPU-NEXT: ret void
+//
+//
+// IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
+// IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
+// IR-GPU-NEXT: entry:
+// IR-GPU-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
+// IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
+// IR-GPU-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
+// IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
+// IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
+// IR-GPU-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
+// IR-GPU-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
+// IR-GPU-NEXT: [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP3]], i32 0, i32 0
+// IR-GPU-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP4]]
+// IR-GPU-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 8
+// IR-GPU-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
+// IR-GPU-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp$reduction$reduction_func"(ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr [[TMP7]]) #[[ATTR2]]
+// IR-GPU-NEXT: ret void
+//
+//
+// IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
+// IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
+// IR-GPU-NEXT: entry:
+// IR-GPU-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
+// IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
+// IR-GPU-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
+// IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
+// IR-GPU-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
+// IR-GPU-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// IR-GPU-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// IR-GPU-NEXT: [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP4]], i32 0, i32 0
+// IR-GPU-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP5]]
+// IR-GPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP7]], ptr align 128 [[TMP8]], i64 400, i1 false)
+// IR-GPU-NEXT: ret void
+//
+//
+// IR-GPU-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
+// IR-GPU-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]]) #[[ATTR3]] {
+// IR-GPU-NEXT: entry:
+// IR-GPU-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
+// IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
+// IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
+// IR-GPU-NEXT: [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
+// IR-GPU-NEXT: [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
+// IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
+// IR-GPU-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
+// IR-GPU-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR_ASCAST]], align 8
+// IR-GPU-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4
+// IR-GPU-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
+// IR-GPU-NEXT: [[SUM:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], ptr [[TMP3]], i32 0, i32 0
+// IR-GPU-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x [10 x [10 x i32]]], ptr [[SUM]], i32 0, i32 [[TMP4]]
+// IR-GPU-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 8
+// IR-GPU-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTADDR2_ASCAST]], align 8
+// IR-GPU-NEXT: call void @"{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp$reduction$reduction_func"(ptr [[TMP7]], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]]) #[[ATTR2]]
+// IR-GPU-NEXT: ret void
+//
+//
+// IR-LABEL: define {{[^@]+}}@_Z3foov
+// IR-SAME: () #[[ATTR0:[0-9]+]] {
+// IR-NEXT: entry:
+// IR-NEXT: [[I:%.*]] = alloca i32, align 4
+// IR-NEXT: [[J:%.*]] = alloca i32, align 4
+// IR-NEXT: [[SUM:%.*]] = alloca [10 x [10 x i32]], align 16
+// IR-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8
+// IR-NEXT: [[TMP0:%.*]] = load i32, ptr [[J]], align 4
+// IR-NEXT: store i32 [[TMP0]], ptr [[J_CASTED]], align 4
+// IR-NEXT: [[TMP1:%.*]] = load i64, ptr [[J_CASTED]], align 8
+// IR-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22(i64 [[TMP1]], ptr [[SUM]]) #[[ATTR2:[0-9]+]]
+// IR-NEXT: ret i32 0
+//
+//
+// IR-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22
+// IR-SAME: (i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
+// IR-NEXT: entry:
+// IR-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8
+// IR-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8
+// IR-NEXT: store i64 [[J]], ptr [[J_ADDR]], align 8
+// IR-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
+// IR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
+// IR-NEXT: [[TMP1:%.*]] = load i32, ptr [[J_ADDR]], align 4
+// IR-NEXT: store i32 [[TMP1]], ptr [[J_CASTED]], align 4
+// IR-NEXT: [[TMP2:%.*]] = load i64, ptr [[J_CASTED]], align 8
+// IR-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB4:[0-9]+]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined, i64 [[TMP2]], ptr [[TMP0]])
+// IR-NEXT: ret void
+//
+//
+// IR-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined
+// IR-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
+// IR-NEXT: entry:
+// IR-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8
+// IR-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[SUM1:%.*]] = alloca [10 x [10 x i32]], align 16
+// IR-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// IR-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// IR-NEXT: [[_TMP2:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// IR-NEXT: [[J3:%.*]] = alloca i32, align 4
+// IR-NEXT: [[I:%.*]] = alloca i32, align 4
+// IR-NEXT: [[J4:%.*]] = alloca i32, align 4
+// IR-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8
+// IR-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// IR-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// IR-NEXT: store i64 [[J]], ptr [[J_ADDR]], align 8
+// IR-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
+// IR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
+// IR-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM1]], i32 0, i32 0, i32 0
+// IR-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
+// IR-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]]
+// IR-NEXT: br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
+// IR: omp.arrayinit.body:
+// IR-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
+// IR-NEXT: store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP1]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
+// IR: omp.arrayinit.done:
+// IR-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// IR-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
+// IR-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// IR-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// IR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// IR-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// IR-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// IR-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
+// IR-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// IR: cond.true:
+// IR-NEXT: br label [[COND_END:%.*]]
+// IR: cond.false:
+// IR-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// IR-NEXT: br label [[COND_END]]
+// IR: cond.end:
+// IR-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// IR-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// IR-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// IR-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// IR-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// IR: omp.inner.for.cond:
+// IR-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// IR-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// IR-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// IR: omp.inner.for.body:
+// IR-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// IR-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// IR-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// IR-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
+// IR-NEXT: [[TMP13:%.*]] = load i32, ptr [[J3]], align 4
+// IR-NEXT: store i32 [[TMP13]], ptr [[J_CASTED]], align 4
+// IR-NEXT: [[TMP14:%.*]] = load i64, ptr [[J_CASTED]], align 8
+// IR-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB4]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined, i64 [[TMP10]], i64 [[TMP12]], i64 [[TMP14]], ptr [[SUM1]])
+// IR-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// IR: omp.inner.for.inc:
+// IR-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// IR-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// IR-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// IR-NEXT: br label [[OMP_INNER_FOR_COND]]
+// IR: omp.inner.for.end:
+// IR-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// IR: omp.loop.exit:
+// IR-NEXT: [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
+// IR-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP18]])
+// IR-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// IR-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
+// IR-NEXT: br i1 [[TMP20]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// IR: .omp.lastprivate.then:
+// IR-NEXT: store i32 10, ptr [[J3]], align 4
+// IR-NEXT: [[TMP21:%.*]] = load i32, ptr [[J3]], align 4
+// IR-NEXT: store i32 [[TMP21]], ptr [[J_ADDR]], align 4
+// IR-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// IR: .omp.lastprivate.done:
+// IR-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// IR-NEXT: store ptr [[SUM1]], ptr [[TMP22]], align 8
+// IR-NEXT: [[TMP23:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4
+// IR-NEXT: [[TMP25:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3:[0-9]+]], i32 [[TMP24]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// IR-NEXT: switch i32 [[TMP25]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// IR-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// IR-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// IR-NEXT: ]
+// IR: .omp.reduction.case1:
+// IR-NEXT: [[TMP26:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
+// IR-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP26]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE10:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// IR: omp.arraycpy.body:
+// IR-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST6:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT8:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-NEXT: [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
+// IR-NEXT: [[TMP28:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// IR-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
+// IR-NEXT: store i32 [[ADD7]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
+// IR-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT8]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_DONE9:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT8]], [[TMP26]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_DONE9]], label [[OMP_ARRAYCPY_DONE10]], label [[OMP_ARRAYCPY_BODY]]
+// IR: omp.arraycpy.done10:
+// IR-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
+// IR-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// IR: .omp.reduction.case2:
+// IR-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
+// IR-NEXT: [[OMP_ARRAYCPY_ISEMPTY11:%.*]] = icmp eq ptr [[TMP0]], [[TMP29]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY11]], label [[OMP_ARRAYCPY_DONE18:%.*]], label [[OMP_ARRAYCPY_BODY12:%.*]]
+// IR: omp.arraycpy.body12:
+// IR-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST13:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT16:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
+// IR-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST14:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
+// IR-NEXT: [[TMP30:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], align 4
+// IR-NEXT: [[TMP31:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 [[TMP30]] monotonic, align 4
+// IR-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT16]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_DONE17:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP29]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_BODY12]]
+// IR: omp.arraycpy.done18:
+// IR-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
+// IR-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// IR: .omp.reduction.default:
+// IR-NEXT: ret void
+//
+//
+// IR-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined
+// IR-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
+// IR-NEXT: entry:
+// IR-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// IR-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// IR-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8
+// IR-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// IR-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// IR-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// IR-NEXT: [[J3:%.*]] = alloca i32, align 4
+// IR-NEXT: [[SUM4:%.*]] = alloca [10 x [10 x i32]], align 16
+// IR-NEXT: [[I:%.*]] = alloca i32, align 4
+// IR-NEXT: [[J5:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// IR-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// IR-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// IR-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// IR-NEXT: store i64 [[J]], ptr [[J_ADDR]], align 8
+// IR-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
+// IR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
+// IR-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// IR-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4
+// IR-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// IR-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// IR-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// IR-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
+// IR-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// IR-NEXT: store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
+// IR-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// IR-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// IR-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i32 0, i32 0, i32 0
+// IR-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
+// IR-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP3]]
+// IR-NEXT: br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
+// IR: omp.arrayinit.body:
+// IR-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
+// IR-NEXT: store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
+// IR: omp.arrayinit.done:
+// IR-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+// IR-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP5]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// IR-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// IR-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP6]], 99
+// IR-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// IR: cond.true:
+// IR-NEXT: br label [[COND_END:%.*]]
+// IR: cond.false:
+// IR-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// IR-NEXT: br label [[COND_END]]
+// IR: cond.end:
+// IR-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP7]], [[COND_FALSE]] ]
+// IR-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// IR-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// IR-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV]], align 4
+// IR-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// IR: omp.inner.for.cond:
+// IR-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3:![0-9]+]]
+// IR-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP9]], [[TMP10]]
+// IR-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// IR: omp.inner.for.body:
+// IR-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP11]], 10
+// IR-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// IR-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// IR-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: [[DIV7:%.*]] = sdiv i32 [[TMP13]], 10
+// IR-NEXT: [[MUL8:%.*]] = mul nsw i32 [[DIV7]], 10
+// IR-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP12]], [[MUL8]]
+// IR-NEXT: [[MUL9:%.*]] = mul nsw i32 [[SUB]], 1
+// IR-NEXT: [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
+// IR-NEXT: store i32 [[ADD10]], ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64
+// IR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i64 0, i64 [[IDXPROM]]
+// IR-NEXT: [[TMP16:%.*]] = load i32, ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: [[IDXPROM11:%.*]] = sext i32 [[TMP16]] to i64
+// IR-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
+// IR-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP17]], [[TMP14]]
+// IR-NEXT: store i32 [[ADD13]], ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// IR: omp.body.continue:
+// IR-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// IR: omp.inner.for.inc:
+// IR-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP18]], 1
+// IR-NEXT: store i32 [[ADD14]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]]
+// IR: omp.inner.for.end:
+// IR-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// IR: omp.loop.exit:
+// IR-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+// IR-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP20]])
+// IR-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// IR-NEXT: store ptr [[SUM4]], ptr [[TMP21]], align 8
+// IR-NEXT: [[TMP22:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
+// IR-NEXT: [[TMP24:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3]], i32 [[TMP23]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// IR-NEXT: switch i32 [[TMP24]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// IR-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// IR-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// IR-NEXT: ]
+// IR: .omp.reduction.case1:
+// IR-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
+// IR-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP25]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE19:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// IR: omp.arraycpy.body:
+// IR-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST15:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT17:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-NEXT: [[TMP26:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
+// IR-NEXT: [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// IR-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
+// IR-NEXT: store i32 [[ADD16]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
+// IR-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT17]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_DONE18:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT17]], [[TMP25]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_DONE19]], label [[OMP_ARRAYCPY_BODY]]
+// IR: omp.arraycpy.done19:
+// IR-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
+// IR-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// IR: .omp.reduction.case2:
+// IR-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
+// IR-NEXT: [[OMP_ARRAYCPY_ISEMPTY20:%.*]] = icmp eq ptr [[TMP0]], [[TMP28]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY20]], label [[OMP_ARRAYCPY_DONE27:%.*]], label [[OMP_ARRAYCPY_BODY21:%.*]]
+// IR: omp.arraycpy.body21:
+// IR-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST22:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT25:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
+// IR-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST23:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT24:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
+// IR-NEXT: [[TMP29:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], align 4
+// IR-NEXT: [[TMP30:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 [[TMP29]] monotonic, align 4
+// IR-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT24]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT25]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_DONE26:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT24]], [[TMP28]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_DONE26]], label [[OMP_ARRAYCPY_DONE27]], label [[OMP_ARRAYCPY_BODY21]]
+// IR: omp.arraycpy.done27:
+// IR-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
+// IR-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// IR: .omp.reduction.default:
+// IR-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// IR-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
+// IR-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// IR: .omp.lastprivate.then:
+// IR-NEXT: store i32 10, ptr [[J3]], align 4
+// IR-NEXT: [[TMP33:%.*]] = load i32, ptr [[J3]], align 4
+// IR-NEXT: store i32 [[TMP33]], ptr [[J_ADDR]], align 4
+// IR-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// IR: .omp.lastprivate.done:
+// IR-NEXT: ret void
+//
+//
+// IR-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined.omp.reduction.reduction_func
+// IR-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+// IR-NEXT: entry:
+// IR-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// IR-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// IR-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// IR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// IR-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// IR-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// IR-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// IR-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// IR-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// IR-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
+// IR-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// IR: omp.arraycpy.body:
+// IR-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-NEXT: [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-NEXT: [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// IR-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// IR-NEXT: store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
+// IR: omp.arraycpy.done2:
+// IR-NEXT: ret void
+//
+//
+// IR-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp.reduction.reduction_func
+// IR-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
+// IR-NEXT: entry:
+// IR-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// IR-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// IR-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// IR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// IR-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// IR-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// IR-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// IR-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// IR-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// IR-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
+// IR-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// IR: omp.arraycpy.body:
+// IR-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-NEXT: [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-NEXT: [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// IR-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// IR-NEXT: store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
+// IR: omp.arraycpy.done2:
+// IR-NEXT: ret void
+//
+//
+// IR-PCH-LABEL: define {{[^@]+}}@_Z3foov
+// IR-PCH-SAME: () #[[ATTR0:[0-9]+]] {
+// IR-PCH-NEXT: entry:
+// IR-PCH-NEXT: [[I:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[J:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[SUM:%.*]] = alloca [10 x [10 x i32]], align 16
+// IR-PCH-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8
+// IR-PCH-NEXT: [[TMP0:%.*]] = load i32, ptr [[J]], align 4
+// IR-PCH-NEXT: store i32 [[TMP0]], ptr [[J_CASTED]], align 4
+// IR-PCH-NEXT: [[TMP1:%.*]] = load i64, ptr [[J_CASTED]], align 8
+// IR-PCH-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22(i64 [[TMP1]], ptr [[SUM]]) #[[ATTR2:[0-9]+]]
+// IR-PCH-NEXT: ret i32 0
+//
+//
+// IR-PCH-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22
+// IR-PCH-SAME: (i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
+// IR-PCH-NEXT: entry:
+// IR-PCH-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8
+// IR-PCH-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8
+// IR-PCH-NEXT: store i64 [[J]], ptr [[J_ADDR]], align 8
+// IR-PCH-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
+// IR-PCH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
+// IR-PCH-NEXT: [[TMP1:%.*]] = load i32, ptr [[J_ADDR]], align 4
+// IR-PCH-NEXT: store i32 [[TMP1]], ptr [[J_CASTED]], align 4
+// IR-PCH-NEXT: [[TMP2:%.*]] = load i64, ptr [[J_CASTED]], align 8
+// IR-PCH-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB4:[0-9]+]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined, i64 [[TMP2]], ptr [[TMP0]])
+// IR-PCH-NEXT: ret void
+//
+//
+// IR-PCH-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined
+// IR-PCH-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
+// IR-PCH-NEXT: entry:
+// IR-PCH-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8
+// IR-PCH-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[SUM1:%.*]] = alloca [10 x [10 x i32]], align 16
+// IR-PCH-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[_TMP2:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[J3:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[I:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[J4:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[J_CASTED:%.*]] = alloca i64, align 8
+// IR-PCH-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// IR-PCH-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// IR-PCH-NEXT: store i64 [[J]], ptr [[J_ADDR]], align 8
+// IR-PCH-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
+// IR-PCH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
+// IR-PCH-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM1]], i32 0, i32 0, i32 0
+// IR-PCH-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
+// IR-PCH-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP1]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
+// IR-PCH: omp.arrayinit.body:
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
+// IR-PCH-NEXT: store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP1]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
+// IR-PCH: omp.arrayinit.done:
+// IR-PCH-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// IR-PCH-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
+// IR-PCH-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// IR-PCH-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// IR-PCH-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// IR-PCH-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// IR-PCH-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// IR-PCH-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
+// IR-PCH-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// IR-PCH: cond.true:
+// IR-PCH-NEXT: br label [[COND_END:%.*]]
+// IR-PCH: cond.false:
+// IR-PCH-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// IR-PCH-NEXT: br label [[COND_END]]
+// IR-PCH: cond.end:
+// IR-PCH-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// IR-PCH-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// IR-PCH-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// IR-PCH-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// IR-PCH-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// IR-PCH: omp.inner.for.cond:
+// IR-PCH-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-PCH-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// IR-PCH-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// IR-PCH-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// IR-PCH: omp.inner.for.body:
+// IR-PCH-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// IR-PCH-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// IR-PCH-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// IR-PCH-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
+// IR-PCH-NEXT: [[TMP13:%.*]] = load i32, ptr [[J3]], align 4
+// IR-PCH-NEXT: store i32 [[TMP13]], ptr [[J_CASTED]], align 4
+// IR-PCH-NEXT: [[TMP14:%.*]] = load i64, ptr [[J_CASTED]], align 8
+// IR-PCH-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB4]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined, i64 [[TMP10]], i64 [[TMP12]], i64 [[TMP14]], ptr [[SUM1]])
+// IR-PCH-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// IR-PCH: omp.inner.for.inc:
+// IR-PCH-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-PCH-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// IR-PCH-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP16]]
+// IR-PCH-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// IR-PCH-NEXT: br label [[OMP_INNER_FOR_COND]]
+// IR-PCH: omp.inner.for.end:
+// IR-PCH-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// IR-PCH: omp.loop.exit:
+// IR-PCH-NEXT: [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
+// IR-PCH-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP18]])
+// IR-PCH-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// IR-PCH-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
+// IR-PCH-NEXT: br i1 [[TMP20]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// IR-PCH: .omp.lastprivate.then:
+// IR-PCH-NEXT: store i32 10, ptr [[J3]], align 4
+// IR-PCH-NEXT: [[TMP21:%.*]] = load i32, ptr [[J3]], align 4
+// IR-PCH-NEXT: store i32 [[TMP21]], ptr [[J_ADDR]], align 4
+// IR-PCH-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// IR-PCH: .omp.lastprivate.done:
+// IR-PCH-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// IR-PCH-NEXT: store ptr [[SUM1]], ptr [[TMP22]], align 8
+// IR-PCH-NEXT: [[TMP23:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4
+// IR-PCH-NEXT: [[TMP25:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3:[0-9]+]], i32 [[TMP24]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// IR-PCH-NEXT: switch i32 [[TMP25]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// IR-PCH-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// IR-PCH-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// IR-PCH-NEXT: ]
+// IR-PCH: .omp.reduction.case1:
+// IR-PCH-NEXT: [[TMP26:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP26]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE10:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// IR-PCH: omp.arraycpy.body:
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST6:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT8:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-PCH-NEXT: [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
+// IR-PCH-NEXT: [[TMP28:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// IR-PCH-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
+// IR-PCH-NEXT: store i32 [[ADD7]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT8]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DONE9:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT8]], [[TMP26]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_DONE9]], label [[OMP_ARRAYCPY_DONE10]], label [[OMP_ARRAYCPY_BODY]]
+// IR-PCH: omp.arraycpy.done10:
+// IR-PCH-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
+// IR-PCH-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// IR-PCH: .omp.reduction.case2:
+// IR-PCH-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_ISEMPTY11:%.*]] = icmp eq ptr [[TMP0]], [[TMP29]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY11]], label [[OMP_ARRAYCPY_DONE18:%.*]], label [[OMP_ARRAYCPY_BODY12:%.*]]
+// IR-PCH: omp.arraycpy.body12:
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST13:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT16:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST14:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
+// IR-PCH-NEXT: [[TMP30:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], align 4
+// IR-PCH-NEXT: [[TMP31:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 [[TMP30]] monotonic, align 4
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT16]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DONE17:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP29]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_BODY12]]
+// IR-PCH: omp.arraycpy.done18:
+// IR-PCH-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
+// IR-PCH-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// IR-PCH: .omp.reduction.default:
+// IR-PCH-NEXT: ret void
+//
+//
+// IR-PCH-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined
+// IR-PCH-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
+// IR-PCH-NEXT: entry:
+// IR-PCH-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// IR-PCH-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// IR-PCH-NEXT: [[J_ADDR:%.*]] = alloca i64, align 8
+// IR-PCH-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[J3:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[SUM4:%.*]] = alloca [10 x [10 x i32]], align 16
+// IR-PCH-NEXT: [[I:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[J5:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// IR-PCH-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// IR-PCH-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// IR-PCH-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// IR-PCH-NEXT: store i64 [[J]], ptr [[J_ADDR]], align 8
+// IR-PCH-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
+// IR-PCH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
+// IR-PCH-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// IR-PCH-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4
+// IR-PCH-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// IR-PCH-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// IR-PCH-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// IR-PCH-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
+// IR-PCH-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// IR-PCH-NEXT: store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
+// IR-PCH-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// IR-PCH-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// IR-PCH-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i32 0, i32 0, i32 0
+// IR-PCH-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
+// IR-PCH-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP3]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
+// IR-PCH: omp.arrayinit.body:
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
+// IR-PCH-NEXT: store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
+// IR-PCH: omp.arrayinit.done:
+// IR-PCH-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+// IR-PCH-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP5]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// IR-PCH-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// IR-PCH-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP6]], 99
+// IR-PCH-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// IR-PCH: cond.true:
+// IR-PCH-NEXT: br label [[COND_END:%.*]]
+// IR-PCH: cond.false:
+// IR-PCH-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// IR-PCH-NEXT: br label [[COND_END]]
+// IR-PCH: cond.end:
+// IR-PCH-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP7]], [[COND_FALSE]] ]
+// IR-PCH-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// IR-PCH-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// IR-PCH-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_IV]], align 4
+// IR-PCH-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// IR-PCH: omp.inner.for.cond:
+// IR-PCH-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3:![0-9]+]]
+// IR-PCH-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP9]], [[TMP10]]
+// IR-PCH-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// IR-PCH: omp.inner.for.body:
+// IR-PCH-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP11]], 10
+// IR-PCH-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// IR-PCH-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// IR-PCH-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: [[DIV7:%.*]] = sdiv i32 [[TMP13]], 10
+// IR-PCH-NEXT: [[MUL8:%.*]] = mul nsw i32 [[DIV7]], 10
+// IR-PCH-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP12]], [[MUL8]]
+// IR-PCH-NEXT: [[MUL9:%.*]] = mul nsw i32 [[SUB]], 1
+// IR-PCH-NEXT: [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
+// IR-PCH-NEXT: store i32 [[ADD10]], ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP15]] to i64
+// IR-PCH-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i64 0, i64 [[IDXPROM]]
+// IR-PCH-NEXT: [[TMP16:%.*]] = load i32, ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: [[IDXPROM11:%.*]] = sext i32 [[TMP16]] to i64
+// IR-PCH-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
+// IR-PCH-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP17]], [[TMP14]]
+// IR-PCH-NEXT: store i32 [[ADD13]], ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// IR-PCH: omp.body.continue:
+// IR-PCH-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// IR-PCH: omp.inner.for.inc:
+// IR-PCH-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP18]], 1
+// IR-PCH-NEXT: store i32 [[ADD14]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]]
+// IR-PCH: omp.inner.for.end:
+// IR-PCH-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// IR-PCH: omp.loop.exit:
+// IR-PCH-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+// IR-PCH-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP20]])
+// IR-PCH-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// IR-PCH-NEXT: store ptr [[SUM4]], ptr [[TMP21]], align 8
+// IR-PCH-NEXT: [[TMP22:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
+// IR-PCH-NEXT: [[TMP24:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3]], i32 [[TMP23]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// IR-PCH-NEXT: switch i32 [[TMP24]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// IR-PCH-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// IR-PCH-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// IR-PCH-NEXT: ]
+// IR-PCH: .omp.reduction.case1:
+// IR-PCH-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP25]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE19:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// IR-PCH: omp.arraycpy.body:
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST15:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT17:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-PCH-NEXT: [[TMP26:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
+// IR-PCH-NEXT: [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// IR-PCH-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
+// IR-PCH-NEXT: store i32 [[ADD16]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT17]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DONE18:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT17]], [[TMP25]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_DONE19]], label [[OMP_ARRAYCPY_BODY]]
+// IR-PCH: omp.arraycpy.done19:
+// IR-PCH-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
+// IR-PCH-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// IR-PCH: .omp.reduction.case2:
+// IR-PCH-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_ISEMPTY20:%.*]] = icmp eq ptr [[TMP0]], [[TMP28]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY20]], label [[OMP_ARRAYCPY_DONE27:%.*]], label [[OMP_ARRAYCPY_BODY21:%.*]]
+// IR-PCH: omp.arraycpy.body21:
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST22:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT25:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST23:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT24:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
+// IR-PCH-NEXT: [[TMP29:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], align 4
+// IR-PCH-NEXT: [[TMP30:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 [[TMP29]] monotonic, align 4
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT24]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT25]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DONE26:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT24]], [[TMP28]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_DONE26]], label [[OMP_ARRAYCPY_DONE27]], label [[OMP_ARRAYCPY_BODY21]]
+// IR-PCH: omp.arraycpy.done27:
+// IR-PCH-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
+// IR-PCH-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// IR-PCH: .omp.reduction.default:
+// IR-PCH-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// IR-PCH-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
+// IR-PCH-NEXT: br i1 [[TMP32]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// IR-PCH: .omp.lastprivate.then:
+// IR-PCH-NEXT: store i32 10, ptr [[J3]], align 4
+// IR-PCH-NEXT: [[TMP33:%.*]] = load i32, ptr [[J3]], align 4
+// IR-PCH-NEXT: store i32 [[TMP33]], ptr [[J_ADDR]], align 4
+// IR-PCH-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// IR-PCH: .omp.lastprivate.done:
+// IR-PCH-NEXT: ret void
+//
+//
+// IR-PCH-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp_outlined.omp.reduction.reduction_func
+// IR-PCH-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+// IR-PCH-NEXT: entry:
+// IR-PCH-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// IR-PCH-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// IR-PCH-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// IR-PCH-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// IR-PCH-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// IR-PCH-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// IR-PCH-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// IR-PCH-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// IR-PCH-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// IR-PCH: omp.arraycpy.body:
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-PCH-NEXT: [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-PCH-NEXT: [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// IR-PCH-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// IR-PCH-NEXT: store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
+// IR-PCH: omp.arraycpy.done2:
+// IR-PCH-NEXT: ret void
+//
+//
+// IR-PCH-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22.omp_outlined.omp.reduction.reduction_func
+// IR-PCH-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
+// IR-PCH-NEXT: entry:
+// IR-PCH-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// IR-PCH-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// IR-PCH-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// IR-PCH-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// IR-PCH-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// IR-PCH-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// IR-PCH-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// IR-PCH-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// IR-PCH-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// IR-PCH: omp.arraycpy.body:
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-PCH-NEXT: [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-PCH-NEXT: [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// IR-PCH-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// IR-PCH-NEXT: store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
+// IR-PCH: omp.arraycpy.done2:
+// IR-PCH-NEXT: ret void
+//
diff --git a/clang/test/OpenMP/target_teams_generic_loop_collapse_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_collapse_codegen.cpp
new file mode 100644
index 00000000000000..a2a34ce9bd83b3
--- /dev/null
+++ b/clang/test/OpenMP/target_teams_generic_loop_collapse_codegen.cpp
@@ -0,0 +1,1915 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// Test host codegen.
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK3
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK3
+
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+#ifdef CK1
+
+template <typename T, int X, long long Y>
+struct SS{
+ T a[X][Y]; // X*Y-element 2-D member array zeroed by the offloaded loop nest below
+
+ int foo(void) {
+
+ #pragma omp target teams loop collapse(2)
+ for(int i = 0; i < X; i++) {
+ for(int j = 0; j < Y; j++) {
+ a[i][j] = (T)0; // collapse(2): single X*Y iteration space distributed across teams/threads
+ }
+ }
+
+ // discard loop variables not needed here
+
+
+ return a[0][0]; // host-side read of the array after the target region completes
+ }
+};
+
+int teams_template_struct(void) {
+ SS<int, 123, 456> V; // instantiates SS with a 123x456 int array (56088 collapsed iterations)
+ return V.foo(); // triggers codegen for the 'target teams loop collapse(2)' region checked below
+
+}
+#endif // CK1
+
+// Test host codegen.
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK9
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK9
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK11
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK11
+
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+#ifdef CK2
+
+template <typename T, int n, int m>
+int tmain(T argc) {
+ T a[n][m]; // compile-time-sized array; instantiated as tmain<int, 10, 2> from main
+ #pragma omp target teams loop collapse(2)
+ for(int i = 0; i < n; i++) {
+ for(int j = 0; j < m; j++) {
+ a[i][j] = (T)0; // constant n*m trip count known at compile time in this instantiation
+ }
+ }
+ return 0;
+}
+
+int main (int argc, char **argv) {
+ int n = 100;
+ int m = 2;
+ int a[n][m]; // VLA: collapsed trip count (n*m) must be computed at run time
+ #pragma omp target teams loop collapse(2)
+ for(int i = 0; i < n; i++) {
+ for(int j = 0; j < m; j++) {
+ a[i][j] = 0;
+ }
+ }
+ return tmain<int, 10, 2>(argc); // also exercises the fixed-extent template path
+}
+
+
+
+
+
+
+
+
+// discard loop variables not needed here
+
+
+#endif // CK2
+#endif // #ifndef HEADER
+// CHECK1-LABEL: define {{[^@]+}}@_Z21teams_template_structv
+// CHECK1-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[V:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
+// CHECK1-NEXT: [[CALL:%.*]] = call noundef signext i32 @_ZN2SSIiLi123ELx456EE3fooEv(ptr noundef nonnull align 4 dereferenceable(224352) [[V]])
+// CHECK1-NEXT: ret i32 [[CALL]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN2SSIiLi123ELx456EE3fooEv
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(224352) [[THIS:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_SS:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT: store ptr [[THIS1]], ptr [[TMP0]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT: store ptr [[A]], ptr [[TMP1]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK1-NEXT: store ptr null, ptr [[TMP2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK1-NEXT: store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK1-NEXT: store i32 1, ptr [[TMP6]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK1-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK1-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK1-NEXT: store ptr @.offload_sizes, ptr [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK1-NEXT: store ptr @.offload_maptypes, ptr [[TMP10]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP11]], align 8
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP12]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK1-NEXT: store i64 56088, ptr [[TMP13]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK1-NEXT: store i64 0, ptr [[TMP14]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP15]], align 4
+// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK1-NEXT: store i32 0, ptr [[TMP17]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.region_id, ptr [[KERNEL_ARGS]])
+// CHECK1-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK1-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK1: omp_offload.failed:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR2:[0-9]+]]
+// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK1: omp_offload.cont:
+// CHECK1-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [123 x [456 x i32]], ptr [[A3]], i64 0, i64 0
+// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [456 x i32], ptr [[ARRAYIDX]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
+// CHECK1-NEXT: ret i32 [[TMP20]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28
+// CHECK1-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined, ptr [[TMP0]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 56087, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP2]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 56087
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 56087, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined.omp_outlined, i64 [[TMP9]], i64 [[TMP11]], ptr [[TMP0]])
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP13]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 56087, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP4]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP5]], 56087
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 56087, [[COND_TRUE]] ], [ [[TMP6]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK1-NEXT: br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP10]], 456
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[DIV4:%.*]] = sdiv i32 [[TMP12]], 456
+// CHECK1-NEXT: [[MUL5:%.*]] = mul nsw i32 [[DIV4]], 456
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP11]], [[MUL5]]
+// CHECK1-NEXT: [[MUL6:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 0, [[MUL6]]
+// CHECK1-NEXT: store i32 [[ADD7]], ptr [[J]], align 4
+// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_SS:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [123 x [456 x i32]], ptr [[A]], i64 0, i64 [[IDXPROM]]
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP14]] to i64
+// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [456 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM8]]
+// CHECK1-NEXT: store i32 0, ptr [[ARRAYIDX9]], align 4
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK1-NEXT: store i32 [[ADD10]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP4]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK1-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_Z21teams_template_structv
+// CHECK3-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[V:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
+// CHECK3-NEXT: [[CALL:%.*]] = call noundef i32 @_ZN2SSIiLi123ELx456EE3fooEv(ptr noundef nonnull align 4 dereferenceable(224352) [[V]])
+// CHECK3-NEXT: ret i32 [[CALL]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN2SSIiLi123ELx456EE3fooEv
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(224352) [[THIS:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP2:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_SS:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: store ptr [[THIS1]], ptr [[TMP0]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: store ptr [[A]], ptr [[TMP1]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK3-NEXT: store ptr null, ptr [[TMP2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 1, ptr [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK3-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK3-NEXT: store ptr @.offload_sizes, ptr [[TMP9]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK3-NEXT: store ptr @.offload_maptypes, ptr [[TMP10]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK3-NEXT: store ptr null, ptr [[TMP11]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK3-NEXT: store ptr null, ptr [[TMP12]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK3-NEXT: store i64 56088, ptr [[TMP13]], align 8
+// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK3-NEXT: store i64 0, ptr [[TMP14]], align 8
+// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP15]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK3-NEXT: store i32 0, ptr [[TMP17]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.region_id, ptr [[KERNEL_ARGS]])
+// CHECK3-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK3-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK3: omp_offload.failed:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR2:[0-9]+]]
+// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK3: omp_offload.cont:
+// CHECK3-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [123 x [456 x i32]], ptr [[A3]], i32 0, i32 0
+// CHECK3-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [456 x i32], ptr [[ARRAYIDX]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
+// CHECK3-NEXT: ret i32 [[TMP20]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28
+// CHECK3-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined, ptr [[TMP0]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 56087, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP2]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 56087
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 56087, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK3-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined.omp_outlined, i32 [[TMP8]], i32 [[TMP9]], ptr [[TMP0]])
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], [[TMP11]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 56087, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP4]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP5]], 56087
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 56087, [[COND_TRUE]] ], [ [[TMP6]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK3-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP10]], 456
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[DIV3:%.*]] = sdiv i32 [[TMP12]], 456
+// CHECK3-NEXT: [[MUL4:%.*]] = mul nsw i32 [[DIV3]], 456
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP11]], [[MUL4]]
+// CHECK3-NEXT: [[MUL5:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 0, [[MUL5]]
+// CHECK3-NEXT: store i32 [[ADD6]], ptr [[J]], align 4
+// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_SS:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4
+// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [123 x [456 x i32]], ptr [[A]], i32 0, i32 [[TMP13]]
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[J]], align 4
+// CHECK3-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [456 x i32], ptr [[ARRAYIDX]], i32 0, i32 [[TMP14]]
+// CHECK3-NEXT: store i32 0, ptr [[ARRAYIDX7]], align 4
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK3-NEXT: store i32 [[ADD8]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP4]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK3-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@main
+// CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[ARGV_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[N:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[M:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[SAVED_STACK:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[M_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [5 x ptr], align 8
+// CHECK9-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [5 x ptr], align 8
+// CHECK9-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [5 x ptr], align 8
+// CHECK9-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [5 x i64], align 8
+// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK9-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK9-NEXT: store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4
+// CHECK9-NEXT: store ptr [[ARGV]], ptr [[ARGV_ADDR]], align 8
+// CHECK9-NEXT: store i32 100, ptr [[N]], align 4
+// CHECK9-NEXT: store i32 2, ptr [[M]], align 4
+// CHECK9-NEXT: [[TMP0:%.*]] = load i32, ptr [[N]], align 4
+// CHECK9-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+// CHECK9-NEXT: [[TMP2:%.*]] = load i32, ptr [[M]], align 4
+// CHECK9-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
+// CHECK9-NEXT: [[TMP4:%.*]] = call ptr @llvm.stacksave()
+// CHECK9-NEXT: store ptr [[TMP4]], ptr [[SAVED_STACK]], align 8
+// CHECK9-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP1]], [[TMP3]]
+// CHECK9-NEXT: [[VLA:%.*]] = alloca i32, i64 [[TMP5]], align 4
+// CHECK9-NEXT: store i64 [[TMP1]], ptr [[__VLA_EXPR0]], align 8
+// CHECK9-NEXT: store i64 [[TMP3]], ptr [[__VLA_EXPR1]], align 8
+// CHECK9-NEXT: [[TMP6:%.*]] = load i32, ptr [[N]], align 4
+// CHECK9-NEXT: store i32 [[TMP6]], ptr [[N_CASTED]], align 4
+// CHECK9-NEXT: [[TMP7:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[M]], align 4
+// CHECK9-NEXT: store i32 [[TMP8]], ptr [[M_CASTED]], align 4
+// CHECK9-NEXT: [[TMP9:%.*]] = load i64, ptr [[M_CASTED]], align 8
+// CHECK9-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP1]], [[TMP3]]
+// CHECK9-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4
+// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DOTOFFLOAD_SIZES]], ptr align 8 @.offload_sizes, i64 40, i1 false)
+// CHECK9-NEXT: [[TMP12:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK9-NEXT: store i64 [[TMP7]], ptr [[TMP12]], align 8
+// CHECK9-NEXT: [[TMP13:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK9-NEXT: store i64 [[TMP7]], ptr [[TMP13]], align 8
+// CHECK9-NEXT: [[TMP14:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK9-NEXT: store ptr null, ptr [[TMP14]], align 8
+// CHECK9-NEXT: [[TMP15:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK9-NEXT: store i64 [[TMP9]], ptr [[TMP15]], align 8
+// CHECK9-NEXT: [[TMP16:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK9-NEXT: store i64 [[TMP9]], ptr [[TMP16]], align 8
+// CHECK9-NEXT: [[TMP17:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
+// CHECK9-NEXT: store ptr null, ptr [[TMP17]], align 8
+// CHECK9-NEXT: [[TMP18:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK9-NEXT: store i64 [[TMP1]], ptr [[TMP18]], align 8
+// CHECK9-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK9-NEXT: store i64 [[TMP1]], ptr [[TMP19]], align 8
+// CHECK9-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
+// CHECK9-NEXT: store ptr null, ptr [[TMP20]], align 8
+// CHECK9-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 3
+// CHECK9-NEXT: store i64 [[TMP3]], ptr [[TMP21]], align 8
+// CHECK9-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 3
+// CHECK9-NEXT: store i64 [[TMP3]], ptr [[TMP22]], align 8
+// CHECK9-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 3
+// CHECK9-NEXT: store ptr null, ptr [[TMP23]], align 8
+// CHECK9-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 4
+// CHECK9-NEXT: store ptr [[VLA]], ptr [[TMP24]], align 8
+// CHECK9-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 4
+// CHECK9-NEXT: store ptr [[VLA]], ptr [[TMP25]], align 8
+// CHECK9-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 4
+// CHECK9-NEXT: store i64 [[TMP11]], ptr [[TMP26]], align 8
+// CHECK9-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 4
+// CHECK9-NEXT: store ptr null, ptr [[TMP27]], align 8
+// CHECK9-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK9-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK9-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
+// CHECK9-NEXT: [[TMP31:%.*]] = load i32, ptr [[N]], align 4
+// CHECK9-NEXT: store i32 [[TMP31]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[TMP32:%.*]] = load i32, ptr [[M]], align 4
+// CHECK9-NEXT: store i32 [[TMP32]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK9-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP33]], 0
+// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK9-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK9-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK9-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP34]], 0
+// CHECK9-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK9-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK9-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK9-NEXT: store i64 [[SUB7]], ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK9-NEXT: [[TMP35:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP35]], 1
+// CHECK9-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK9-NEXT: store i32 2, ptr [[TMP36]], align 4
+// CHECK9-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK9-NEXT: store i32 5, ptr [[TMP37]], align 4
+// CHECK9-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK9-NEXT: store ptr [[TMP28]], ptr [[TMP38]], align 8
+// CHECK9-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK9-NEXT: store ptr [[TMP29]], ptr [[TMP39]], align 8
+// CHECK9-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK9-NEXT: store ptr [[TMP30]], ptr [[TMP40]], align 8
+// CHECK9-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK9-NEXT: store ptr @.offload_maptypes, ptr [[TMP41]], align 8
+// CHECK9-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK9-NEXT: store ptr null, ptr [[TMP42]], align 8
+// CHECK9-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK9-NEXT: store ptr null, ptr [[TMP43]], align 8
+// CHECK9-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK9-NEXT: store i64 [[ADD]], ptr [[TMP44]], align 8
+// CHECK9-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK9-NEXT: store i64 0, ptr [[TMP45]], align 8
+// CHECK9-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK9-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP46]], align 4
+// CHECK9-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK9-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP47]], align 4
+// CHECK9-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK9-NEXT: store i32 0, ptr [[TMP48]], align 4
+// CHECK9-NEXT: [[TMP49:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l81.region_id, ptr [[KERNEL_ARGS]])
+// CHECK9-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0
+// CHECK9-NEXT: br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK9: omp_offload.failed:
+// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l81(i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP1]], i64 [[TMP3]], ptr [[VLA]]) #[[ATTR3:[0-9]+]]
+// CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK9: omp_offload.cont:
+// CHECK9-NEXT: [[TMP51:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4
+// CHECK9-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z5tmainIiLi10ELi2EEiT_(i32 noundef signext [[TMP51]])
+// CHECK9-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4
+// CHECK9-NEXT: [[TMP52:%.*]] = load ptr, ptr [[SAVED_STACK]], align 8
+// CHECK9-NEXT: call void @llvm.stackrestore(ptr [[TMP52]])
+// CHECK9-NEXT: [[TMP53:%.*]] = load i32, ptr [[RETVAL]], align 4
+// CHECK9-NEXT: ret i32 [[TMP53]]
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l81
+// CHECK9-SAME: (i64 noundef [[N:%.*]], i64 noundef [[M:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[M_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[M_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[M]], ptr [[M_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[VLA1]], ptr [[VLA_ADDR2]], align 8
+// CHECK9-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8
+// CHECK9-NEXT: [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR2]], align 8
+// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK9-NEXT: store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK9-NEXT: [[TMP4:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK9-NEXT: store i32 [[TMP5]], ptr [[M_CASTED]], align 4
+// CHECK9-NEXT: [[TMP6:%.*]] = load i64, ptr [[M_CASTED]], align 8
+// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 5, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l81.omp_outlined, i64 [[TMP4]], i64 [[TMP6]], i64 [[TMP0]], i64 [[TMP1]], ptr [[TMP2]])
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l81.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]], i64 noundef [[M:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[M_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[_TMP3:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_5:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[I11:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[J12:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[M_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[M]], ptr [[M_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[VLA1]], ptr [[VLA_ADDR2]], align 8
+// CHECK9-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8
+// CHECK9-NEXT: [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR2]], align 8
+// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK9-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[TMP4:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK9-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP5]], 0
+// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK9-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK9-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK9-NEXT: [[SUB6:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK9-NEXT: [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
+// CHECK9-NEXT: [[CONV8:%.*]] = sext i32 [[DIV7]] to i64
+// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV8]]
+// CHECK9-NEXT: [[SUB9:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK9-NEXT: store i64 [[SUB9]], ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK9-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK9-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK9-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]]
+// CHECK9-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK9: land.lhs.true:
+// CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK9-NEXT: [[CMP10:%.*]] = icmp slt i32 0, [[TMP8]]
+// CHECK9-NEXT: br i1 [[CMP10]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK9: omp.precond.then:
+// CHECK9-NEXT: store i64 0, ptr [[DOTOMP_COMB_LB]], align 8
+// CHECK9-NEXT: [[TMP9:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK9-NEXT: store i64 [[TMP9]], ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK9-NEXT: store i64 1, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_init_8(ptr @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i64 1, i64 1)
+// CHECK9-NEXT: [[TMP12:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK9-NEXT: [[TMP13:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK9-NEXT: [[CMP13:%.*]] = icmp sgt i64 [[TMP12]], [[TMP13]]
+// CHECK9-NEXT: br i1 [[CMP13]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK9: cond.true:
+// CHECK9-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK9-NEXT: br label [[COND_END:%.*]]
+// CHECK9: cond.false:
+// CHECK9-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK9-NEXT: br label [[COND_END]]
+// CHECK9: cond.end:
+// CHECK9-NEXT: [[COND:%.*]] = phi i64 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ]
+// CHECK9-NEXT: store i64 [[COND]], ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK9-NEXT: [[TMP16:%.*]] = load i64, ptr [[DOTOMP_COMB_LB]], align 8
+// CHECK9-NEXT: store i64 [[TMP16]], ptr [[DOTOMP_IV]], align 8
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9: omp.inner.for.cond:
+// CHECK9-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK9-NEXT: [[TMP18:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK9-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP17]], [[TMP18]]
+// CHECK9-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9: omp.inner.for.body:
+// CHECK9-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTOMP_COMB_LB]], align 8
+// CHECK9-NEXT: [[TMP20:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK9-NEXT: [[TMP21:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK9-NEXT: store i32 [[TMP21]], ptr [[N_CASTED]], align 4
+// CHECK9-NEXT: [[TMP22:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK9-NEXT: [[TMP23:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK9-NEXT: store i32 [[TMP23]], ptr [[M_CASTED]], align 4
+// CHECK9-NEXT: [[TMP24:%.*]] = load i64, ptr [[M_CASTED]], align 8
+// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 7, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l81.omp_outlined.omp_outlined, i64 [[TMP19]], i64 [[TMP20]], i64 [[TMP22]], i64 [[TMP24]], i64 [[TMP0]], i64 [[TMP1]], ptr [[TMP2]])
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9: omp.inner.for.inc:
+// CHECK9-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK9-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP25]], [[TMP26]]
+// CHECK9-NEXT: store i64 [[ADD]], ptr [[DOTOMP_IV]], align 8
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK9: omp.inner.for.end:
+// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9: omp.loop.exit:
+// CHECK9-NEXT: [[TMP27:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP28]])
+// CHECK9-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK9: omp.precond.end:
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l81.omp_outlined.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]], i64 noundef [[M:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[M_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[_TMP3:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_5:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[I11:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[J12:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[M]], ptr [[M_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[VLA1]], ptr [[VLA_ADDR2]], align 8
+// CHECK9-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8
+// CHECK9-NEXT: [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR2]], align 8
+// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK9-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[TMP4:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK9-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP5]], 0
+// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK9-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK9-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK9-NEXT: [[SUB6:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK9-NEXT: [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
+// CHECK9-NEXT: [[CONV8:%.*]] = sext i32 [[DIV7]] to i64
+// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV8]]
+// CHECK9-NEXT: [[SUB9:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK9-NEXT: store i64 [[SUB9]], ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK9-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK9-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK9-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]]
+// CHECK9-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK9: land.lhs.true:
+// CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK9-NEXT: [[CMP10:%.*]] = icmp slt i32 0, [[TMP8]]
+// CHECK9-NEXT: br i1 [[CMP10]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK9: omp.precond.then:
+// CHECK9-NEXT: store i64 0, ptr [[DOTOMP_LB]], align 8
+// CHECK9-NEXT: [[TMP9:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK9-NEXT: store i64 [[TMP9]], ptr [[DOTOMP_UB]], align 8
+// CHECK9-NEXT: [[TMP10:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT: [[TMP11:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT: store i64 [[TMP10]], ptr [[DOTOMP_LB]], align 8
+// CHECK9-NEXT: store i64 [[TMP11]], ptr [[DOTOMP_UB]], align 8
+// CHECK9-NEXT: store i64 1, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_init_8(ptr @[[GLOB2]], i32 [[TMP13]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i64 1, i64 1)
+// CHECK9-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8
+// CHECK9-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK9-NEXT: [[CMP13:%.*]] = icmp sgt i64 [[TMP14]], [[TMP15]]
+// CHECK9-NEXT: br i1 [[CMP13]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK9: cond.true:
+// CHECK9-NEXT: [[TMP16:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK9-NEXT: br label [[COND_END:%.*]]
+// CHECK9: cond.false:
+// CHECK9-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8
+// CHECK9-NEXT: br label [[COND_END]]
+// CHECK9: cond.end:
+// CHECK9-NEXT: [[COND:%.*]] = phi i64 [ [[TMP16]], [[COND_TRUE]] ], [ [[TMP17]], [[COND_FALSE]] ]
+// CHECK9-NEXT: store i64 [[COND]], ptr [[DOTOMP_UB]], align 8
+// CHECK9-NEXT: [[TMP18:%.*]] = load i64, ptr [[DOTOMP_LB]], align 8
+// CHECK9-NEXT: store i64 [[TMP18]], ptr [[DOTOMP_IV]], align 8
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9: omp.inner.for.cond:
+// CHECK9-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK9-NEXT: [[TMP20:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8
+// CHECK9-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP19]], [[TMP20]]
+// CHECK9-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9: omp.inner.for.body:
+// CHECK9-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK9-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK9-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP22]], 0
+// CHECK9-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1
+// CHECK9-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]]
+// CHECK9-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64
+// CHECK9-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP21]], [[CONV18]]
+// CHECK9-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]]
+// CHECK9-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32
+// CHECK9-NEXT: store i32 [[CONV21]], ptr [[I11]], align 4
+// CHECK9-NEXT: [[TMP23:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK9-NEXT: [[TMP24:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK9-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK9-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP25]], 0
+// CHECK9-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1
+// CHECK9-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]]
+// CHECK9-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64
+// CHECK9-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP24]], [[CONV25]]
+// CHECK9-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK9-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP26]], 0
+// CHECK9-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1
+// CHECK9-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]]
+// CHECK9-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64
+// CHECK9-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]]
+// CHECK9-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP23]], [[MUL31]]
+// CHECK9-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1
+// CHECK9-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]]
+// CHECK9-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32
+// CHECK9-NEXT: store i32 [[CONV35]], ptr [[J12]], align 4
+// CHECK9-NEXT: [[TMP27:%.*]] = load i32, ptr [[I11]], align 4
+// CHECK9-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP27]] to i64
+// CHECK9-NEXT: [[TMP28:%.*]] = mul nsw i64 [[IDXPROM]], [[TMP1]]
+// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP28]]
+// CHECK9-NEXT: [[TMP29:%.*]] = load i32, ptr [[J12]], align 4
+// CHECK9-NEXT: [[IDXPROM36:%.*]] = sext i32 [[TMP29]] to i64
+// CHECK9-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i64 [[IDXPROM36]]
+// CHECK9-NEXT: store i32 0, ptr [[ARRAYIDX37]], align 4
+// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK9: omp.body.continue:
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9: omp.inner.for.inc:
+// CHECK9-NEXT: [[TMP30:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK9-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP30]], 1
+// CHECK9-NEXT: store i64 [[ADD38]], ptr [[DOTOMP_IV]], align 8
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK9: omp.inner.for.end:
+// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9: omp.loop.exit:
+// CHECK9-NEXT: [[TMP31:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP32]])
+// CHECK9-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK9: omp.precond.end:
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiLi10ELi2EEiT_
+// CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[A:%.*]] = alloca [10 x [2 x i32]], align 4
+// CHECK9-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK9-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK9-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
+// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK9-NEXT: store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4
+// CHECK9-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK9-NEXT: store ptr [[A]], ptr [[TMP0]], align 8
+// CHECK9-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK9-NEXT: store ptr [[A]], ptr [[TMP1]], align 8
+// CHECK9-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK9-NEXT: store ptr null, ptr [[TMP2]], align 8
+// CHECK9-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK9-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK9-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK9-NEXT: store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK9-NEXT: store i32 1, ptr [[TMP6]], align 4
+// CHECK9-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK9-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 8
+// CHECK9-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK9-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8
+// CHECK9-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK9-NEXT: store ptr @.offload_sizes.1, ptr [[TMP9]], align 8
+// CHECK9-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK9-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP10]], align 8
+// CHECK9-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK9-NEXT: store ptr null, ptr [[TMP11]], align 8
+// CHECK9-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK9-NEXT: store ptr null, ptr [[TMP12]], align 8
+// CHECK9-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK9-NEXT: store i64 20, ptr [[TMP13]], align 8
+// CHECK9-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK9-NEXT: store i64 0, ptr [[TMP14]], align 8
+// CHECK9-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK9-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP15]], align 4
+// CHECK9-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK9-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4
+// CHECK9-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK9-NEXT: store i32 0, ptr [[TMP17]], align 4
+// CHECK9-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l68.region_id, ptr [[KERNEL_ARGS]])
+// CHECK9-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK9-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK9: omp_offload.failed:
+// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l68(ptr [[A]]) #[[ATTR3]]
+// CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK9: omp_offload.cont:
+// CHECK9-NEXT: ret i32 0
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l68
+// CHECK9-SAME: (ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l68.omp_outlined, ptr [[TMP0]])
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l68.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: store i32 19, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP2]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 19
+// CHECK9-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK9: cond.true:
+// CHECK9-NEXT: br label [[COND_END:%.*]]
+// CHECK9: cond.false:
+// CHECK9-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: br label [[COND_END]]
+// CHECK9: cond.end:
+// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ 19, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK9-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9: omp.inner.for.cond:
+// CHECK9-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK9-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9: omp.inner.for.body:
+// CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK9-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l68.omp_outlined.omp_outlined, i64 [[TMP9]], i64 [[TMP11]], ptr [[TMP0]])
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9: omp.inner.for.inc:
+// CHECK9-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP13]]
+// CHECK9-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK9: omp.inner.for.end:
+// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9: omp.loop.exit:
+// CHECK9-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l68.omp_outlined.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK9-NEXT: store i32 19, ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK9-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK9-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK9-NEXT: store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP4]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP5]], 19
+// CHECK9-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK9: cond.true:
+// CHECK9-NEXT: br label [[COND_END:%.*]]
+// CHECK9: cond.false:
+// CHECK9-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: br label [[COND_END]]
+// CHECK9: cond.end:
+// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ 19, [[COND_TRUE]] ], [ [[TMP6]], [[COND_FALSE]] ]
+// CHECK9-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK9-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9: omp.inner.for.cond:
+// CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK9-NEXT: br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9: omp.inner.for.body:
+// CHECK9-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP10]], 2
+// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK9-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK9-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[DIV4:%.*]] = sdiv i32 [[TMP12]], 2
+// CHECK9-NEXT: [[MUL5:%.*]] = mul nsw i32 [[DIV4]], 2
+// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP11]], [[MUL5]]
+// CHECK9-NEXT: [[MUL6:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK9-NEXT: [[ADD7:%.*]] = add nsw i32 0, [[MUL6]]
+// CHECK9-NEXT: store i32 [[ADD7]], ptr [[J]], align 4
+// CHECK9-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4
+// CHECK9-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
+// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [2 x i32]], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK9-NEXT: [[TMP14:%.*]] = load i32, ptr [[J]], align 4
+// CHECK9-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP14]] to i64
+// CHECK9-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [2 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM8]]
+// CHECK9-NEXT: store i32 0, ptr [[ARRAYIDX9]], align 4
+// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK9: omp.body.continue:
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9: omp.inner.for.inc:
+// CHECK9-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK9-NEXT: store i32 [[ADD10]], ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK9: omp.inner.for.end:
+// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9: omp.loop.exit:
+// CHECK9-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP4]])
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK9-SAME: () #[[ATTR6:[0-9]+]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@main
+// CHECK11-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[ARGV_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[N:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[M:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[SAVED_STACK:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[M_CASTED:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [5 x ptr], align 4
+// CHECK11-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [5 x ptr], align 4
+// CHECK11-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [5 x ptr], align 4
+// CHECK11-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [5 x i64], align 4
+// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK11-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK11-NEXT: store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4
+// CHECK11-NEXT: store ptr [[ARGV]], ptr [[ARGV_ADDR]], align 4
+// CHECK11-NEXT: store i32 100, ptr [[N]], align 4
+// CHECK11-NEXT: store i32 2, ptr [[M]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load i32, ptr [[N]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[M]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = call ptr @llvm.stacksave()
+// CHECK11-NEXT: store ptr [[TMP2]], ptr [[SAVED_STACK]], align 4
+// CHECK11-NEXT: [[TMP3:%.*]] = mul nuw i32 [[TMP0]], [[TMP1]]
+// CHECK11-NEXT: [[VLA:%.*]] = alloca i32, i32 [[TMP3]], align 4
+// CHECK11-NEXT: store i32 [[TMP0]], ptr [[__VLA_EXPR0]], align 4
+// CHECK11-NEXT: store i32 [[TMP1]], ptr [[__VLA_EXPR1]], align 4
+// CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[N]], align 4
+// CHECK11-NEXT: store i32 [[TMP4]], ptr [[N_CASTED]], align 4
+// CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK11-NEXT: [[TMP6:%.*]] = load i32, ptr [[M]], align 4
+// CHECK11-NEXT: store i32 [[TMP6]], ptr [[M_CASTED]], align 4
+// CHECK11-NEXT: [[TMP7:%.*]] = load i32, ptr [[M_CASTED]], align 4
+// CHECK11-NEXT: [[TMP8:%.*]] = mul nuw i32 [[TMP0]], [[TMP1]]
+// CHECK11-NEXT: [[TMP9:%.*]] = mul nuw i32 [[TMP8]], 4
+// CHECK11-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
+// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[DOTOFFLOAD_SIZES]], ptr align 4 @.offload_sizes, i32 40, i1 false)
+// CHECK11-NEXT: [[TMP11:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK11-NEXT: store i32 [[TMP5]], ptr [[TMP11]], align 4
+// CHECK11-NEXT: [[TMP12:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK11-NEXT: store i32 [[TMP5]], ptr [[TMP12]], align 4
+// CHECK11-NEXT: [[TMP13:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK11-NEXT: store ptr null, ptr [[TMP13]], align 4
+// CHECK11-NEXT: [[TMP14:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK11-NEXT: store i32 [[TMP7]], ptr [[TMP14]], align 4
+// CHECK11-NEXT: [[TMP15:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK11-NEXT: store i32 [[TMP7]], ptr [[TMP15]], align 4
+// CHECK11-NEXT: [[TMP16:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
+// CHECK11-NEXT: store ptr null, ptr [[TMP16]], align 4
+// CHECK11-NEXT: [[TMP17:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK11-NEXT: store i32 [[TMP0]], ptr [[TMP17]], align 4
+// CHECK11-NEXT: [[TMP18:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK11-NEXT: store i32 [[TMP0]], ptr [[TMP18]], align 4
+// CHECK11-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
+// CHECK11-NEXT: store ptr null, ptr [[TMP19]], align 4
+// CHECK11-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 3
+// CHECK11-NEXT: store i32 [[TMP1]], ptr [[TMP20]], align 4
+// CHECK11-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 3
+// CHECK11-NEXT: store i32 [[TMP1]], ptr [[TMP21]], align 4
+// CHECK11-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 3
+// CHECK11-NEXT: store ptr null, ptr [[TMP22]], align 4
+// CHECK11-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 4
+// CHECK11-NEXT: store ptr [[VLA]], ptr [[TMP23]], align 4
+// CHECK11-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 4
+// CHECK11-NEXT: store ptr [[VLA]], ptr [[TMP24]], align 4
+// CHECK11-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 4
+// CHECK11-NEXT: store i64 [[TMP10]], ptr [[TMP25]], align 4
+// CHECK11-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 4
+// CHECK11-NEXT: store ptr null, ptr [[TMP26]], align 4
+// CHECK11-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK11-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK11-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
+// CHECK11-NEXT: [[TMP30:%.*]] = load i32, ptr [[N]], align 4
+// CHECK11-NEXT: store i32 [[TMP30]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[TMP31:%.*]] = load i32, ptr [[M]], align 4
+// CHECK11-NEXT: store i32 [[TMP31]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK11-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP32]], 0
+// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK11-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK11-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK11-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP33]], 0
+// CHECK11-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK11-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK11-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK11-NEXT: store i64 [[SUB7]], ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK11-NEXT: [[TMP34:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK11-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP34]], 1
+// CHECK11-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK11-NEXT: store i32 2, ptr [[TMP35]], align 4
+// CHECK11-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK11-NEXT: store i32 5, ptr [[TMP36]], align 4
+// CHECK11-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK11-NEXT: store ptr [[TMP27]], ptr [[TMP37]], align 4
+// CHECK11-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK11-NEXT: store ptr [[TMP28]], ptr [[TMP38]], align 4
+// CHECK11-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK11-NEXT: store ptr [[TMP29]], ptr [[TMP39]], align 4
+// CHECK11-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK11-NEXT: store ptr @.offload_maptypes, ptr [[TMP40]], align 4
+// CHECK11-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK11-NEXT: store ptr null, ptr [[TMP41]], align 4
+// CHECK11-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK11-NEXT: store ptr null, ptr [[TMP42]], align 4
+// CHECK11-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK11-NEXT: store i64 [[ADD]], ptr [[TMP43]], align 8
+// CHECK11-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK11-NEXT: store i64 0, ptr [[TMP44]], align 8
+// CHECK11-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP45]], align 4
+// CHECK11-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP46]], align 4
+// CHECK11-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK11-NEXT: store i32 0, ptr [[TMP47]], align 4
+// CHECK11-NEXT: [[TMP48:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l81.region_id, ptr [[KERNEL_ARGS]])
+// CHECK11-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0
+// CHECK11-NEXT: br i1 [[TMP49]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK11: omp_offload.failed:
+// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l81(i32 [[TMP5]], i32 [[TMP7]], i32 [[TMP0]], i32 [[TMP1]], ptr [[VLA]]) #[[ATTR3:[0-9]+]]
+// CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK11: omp_offload.cont:
+// CHECK11-NEXT: [[TMP50:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4
+// CHECK11-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiLi10ELi2EEiT_(i32 noundef [[TMP50]])
+// CHECK11-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4
+// CHECK11-NEXT: [[TMP51:%.*]] = load ptr, ptr [[SAVED_STACK]], align 4
+// CHECK11-NEXT: call void @llvm.stackrestore(ptr [[TMP51]])
+// CHECK11-NEXT: [[TMP52:%.*]] = load i32, ptr [[RETVAL]], align 4
+// CHECK11-NEXT: ret i32 [[TMP52]]
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l81
+// CHECK11-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[M_CASTED:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[VLA]], ptr [[VLA_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[VLA1]], ptr [[VLA_ADDR2]], align 4
+// CHECK11-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load i32, ptr [[VLA_ADDR]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[VLA_ADDR2]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[TMP5]], ptr [[M_CASTED]], align 4
+// CHECK11-NEXT: [[TMP6:%.*]] = load i32, ptr [[M_CASTED]], align 4
+// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 5, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l81.omp_outlined, i32 [[TMP4]], i32 [[TMP6]], i32 [[TMP0]], i32 [[TMP1]], ptr [[TMP2]])
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l81.omp_outlined
+// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[N:%.*]], i32 noundef [[M:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[_TMP3:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_5:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[I11:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[J12:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[M_CASTED:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[VLA]], ptr [[VLA_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[VLA1]], ptr [[VLA_ADDR2]], align 4
+// CHECK11-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load i32, ptr [[VLA_ADDR]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[VLA_ADDR2]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP5]], 0
+// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK11-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK11-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK11-NEXT: [[SUB6:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK11-NEXT: [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
+// CHECK11-NEXT: [[CONV8:%.*]] = sext i32 [[DIV7]] to i64
+// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV8]]
+// CHECK11-NEXT: [[SUB9:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK11-NEXT: store i64 [[SUB9]], ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK11-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK11-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK11-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]]
+// CHECK11-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK11: land.lhs.true:
+// CHECK11-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK11-NEXT: [[CMP10:%.*]] = icmp slt i32 0, [[TMP8]]
+// CHECK11-NEXT: br i1 [[CMP10]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK11: omp.precond.then:
+// CHECK11-NEXT: store i64 0, ptr [[DOTOMP_COMB_LB]], align 8
+// CHECK11-NEXT: [[TMP9:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK11-NEXT: store i64 [[TMP9]], ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK11-NEXT: store i64 1, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_init_8(ptr @[[GLOB1:[0-9]+]], i32 [[TMP11]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i64 1, i64 1)
+// CHECK11-NEXT: [[TMP12:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK11-NEXT: [[TMP13:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK11-NEXT: [[CMP13:%.*]] = icmp sgt i64 [[TMP12]], [[TMP13]]
+// CHECK11-NEXT: br i1 [[CMP13]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK11: cond.true:
+// CHECK11-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK11-NEXT: br label [[COND_END:%.*]]
+// CHECK11: cond.false:
+// CHECK11-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK11-NEXT: br label [[COND_END]]
+// CHECK11: cond.end:
+// CHECK11-NEXT: [[COND:%.*]] = phi i64 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ]
+// CHECK11-NEXT: store i64 [[COND]], ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK11-NEXT: [[TMP16:%.*]] = load i64, ptr [[DOTOMP_COMB_LB]], align 8
+// CHECK11-NEXT: store i64 [[TMP16]], ptr [[DOTOMP_IV]], align 8
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK11: omp.inner.for.cond:
+// CHECK11-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK11-NEXT: [[TMP18:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK11-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP17]], [[TMP18]]
+// CHECK11-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK11: omp.inner.for.body:
+// CHECK11-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTOMP_COMB_LB]], align 8
+// CHECK11-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
+// CHECK11-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK11-NEXT: [[TMP22:%.*]] = trunc i64 [[TMP21]] to i32
+// CHECK11-NEXT: [[TMP23:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[TMP23]], ptr [[N_CASTED]], align 4
+// CHECK11-NEXT: [[TMP24:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK11-NEXT: [[TMP25:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[TMP25]], ptr [[M_CASTED]], align 4
+// CHECK11-NEXT: [[TMP26:%.*]] = load i32, ptr [[M_CASTED]], align 4
+// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 7, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l81.omp_outlined.omp_outlined, i32 [[TMP20]], i32 [[TMP22]], i32 [[TMP24]], i32 [[TMP26]], i32 [[TMP0]], i32 [[TMP1]], ptr [[TMP2]])
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK11: omp.inner.for.inc:
+// CHECK11-NEXT: [[TMP27:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK11-NEXT: [[TMP28:%.*]] = load i64, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK11-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP27]], [[TMP28]]
+// CHECK11-NEXT: store i64 [[ADD]], ptr [[DOTOMP_IV]], align 8
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK11: omp.inner.for.end:
+// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK11: omp.loop.exit:
+// CHECK11-NEXT: [[TMP29:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP29]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP30]])
+// CHECK11-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK11: omp.precond.end:
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l81.omp_outlined.omp_outlined
+// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[N:%.*]], i32 noundef [[M:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[_TMP3:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_5:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[I13:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[J14:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[VLA]], ptr [[VLA_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[VLA1]], ptr [[VLA_ADDR2]], align 4
+// CHECK11-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load i32, ptr [[VLA_ADDR]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[VLA_ADDR2]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP5]], 0
+// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK11-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK11-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK11-NEXT: [[SUB6:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK11-NEXT: [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
+// CHECK11-NEXT: [[CONV8:%.*]] = sext i32 [[DIV7]] to i64
+// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV8]]
+// CHECK11-NEXT: [[SUB9:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK11-NEXT: store i64 [[SUB9]], ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK11-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK11-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK11-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]]
+// CHECK11-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK11: land.lhs.true:
+// CHECK11-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK11-NEXT: [[CMP10:%.*]] = icmp slt i32 0, [[TMP8]]
+// CHECK11-NEXT: br i1 [[CMP10]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK11: omp.precond.then:
+// CHECK11-NEXT: store i64 0, ptr [[DOTOMP_LB]], align 8
+// CHECK11-NEXT: [[TMP9:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK11-NEXT: store i64 [[TMP9]], ptr [[DOTOMP_UB]], align 8
+// CHECK11-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK11-NEXT: [[CONV11:%.*]] = zext i32 [[TMP10]] to i64
+// CHECK11-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK11-NEXT: [[CONV12:%.*]] = zext i32 [[TMP11]] to i64
+// CHECK11-NEXT: store i64 [[CONV11]], ptr [[DOTOMP_LB]], align 8
+// CHECK11-NEXT: store i64 [[CONV12]], ptr [[DOTOMP_UB]], align 8
+// CHECK11-NEXT: store i64 1, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_init_8(ptr @[[GLOB2]], i32 [[TMP13]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i64 1, i64 1)
+// CHECK11-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8
+// CHECK11-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK11-NEXT: [[CMP15:%.*]] = icmp sgt i64 [[TMP14]], [[TMP15]]
+// CHECK11-NEXT: br i1 [[CMP15]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK11: cond.true:
+// CHECK11-NEXT: [[TMP16:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK11-NEXT: br label [[COND_END:%.*]]
+// CHECK11: cond.false:
+// CHECK11-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8
+// CHECK11-NEXT: br label [[COND_END]]
+// CHECK11: cond.end:
+// CHECK11-NEXT: [[COND:%.*]] = phi i64 [ [[TMP16]], [[COND_TRUE]] ], [ [[TMP17]], [[COND_FALSE]] ]
+// CHECK11-NEXT: store i64 [[COND]], ptr [[DOTOMP_UB]], align 8
+// CHECK11-NEXT: [[TMP18:%.*]] = load i64, ptr [[DOTOMP_LB]], align 8
+// CHECK11-NEXT: store i64 [[TMP18]], ptr [[DOTOMP_IV]], align 8
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK11: omp.inner.for.cond:
+// CHECK11-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK11-NEXT: [[TMP20:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8
+// CHECK11-NEXT: [[CMP16:%.*]] = icmp sle i64 [[TMP19]], [[TMP20]]
+// CHECK11-NEXT: br i1 [[CMP16]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK11: omp.inner.for.body:
+// CHECK11-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK11-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK11-NEXT: [[SUB17:%.*]] = sub nsw i32 [[TMP22]], 0
+// CHECK11-NEXT: [[DIV18:%.*]] = sdiv i32 [[SUB17]], 1
+// CHECK11-NEXT: [[MUL19:%.*]] = mul nsw i32 1, [[DIV18]]
+// CHECK11-NEXT: [[CONV20:%.*]] = sext i32 [[MUL19]] to i64
+// CHECK11-NEXT: [[DIV21:%.*]] = sdiv i64 [[TMP21]], [[CONV20]]
+// CHECK11-NEXT: [[MUL22:%.*]] = mul nsw i64 [[DIV21]], 1
+// CHECK11-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL22]]
+// CHECK11-NEXT: [[CONV23:%.*]] = trunc i64 [[ADD]] to i32
+// CHECK11-NEXT: store i32 [[CONV23]], ptr [[I13]], align 4
+// CHECK11-NEXT: [[TMP23:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK11-NEXT: [[TMP24:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK11-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK11-NEXT: [[SUB24:%.*]] = sub nsw i32 [[TMP25]], 0
+// CHECK11-NEXT: [[DIV25:%.*]] = sdiv i32 [[SUB24]], 1
+// CHECK11-NEXT: [[MUL26:%.*]] = mul nsw i32 1, [[DIV25]]
+// CHECK11-NEXT: [[CONV27:%.*]] = sext i32 [[MUL26]] to i64
+// CHECK11-NEXT: [[DIV28:%.*]] = sdiv i64 [[TMP24]], [[CONV27]]
+// CHECK11-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK11-NEXT: [[SUB29:%.*]] = sub nsw i32 [[TMP26]], 0
+// CHECK11-NEXT: [[DIV30:%.*]] = sdiv i32 [[SUB29]], 1
+// CHECK11-NEXT: [[MUL31:%.*]] = mul nsw i32 1, [[DIV30]]
+// CHECK11-NEXT: [[CONV32:%.*]] = sext i32 [[MUL31]] to i64
+// CHECK11-NEXT: [[MUL33:%.*]] = mul nsw i64 [[DIV28]], [[CONV32]]
+// CHECK11-NEXT: [[SUB34:%.*]] = sub nsw i64 [[TMP23]], [[MUL33]]
+// CHECK11-NEXT: [[MUL35:%.*]] = mul nsw i64 [[SUB34]], 1
+// CHECK11-NEXT: [[ADD36:%.*]] = add nsw i64 0, [[MUL35]]
+// CHECK11-NEXT: [[CONV37:%.*]] = trunc i64 [[ADD36]] to i32
+// CHECK11-NEXT: store i32 [[CONV37]], ptr [[J14]], align 4
+// CHECK11-NEXT: [[TMP27:%.*]] = load i32, ptr [[I13]], align 4
+// CHECK11-NEXT: [[TMP28:%.*]] = mul nsw i32 [[TMP27]], [[TMP1]]
+// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 [[TMP28]]
+// CHECK11-NEXT: [[TMP29:%.*]] = load i32, ptr [[J14]], align 4
+// CHECK11-NEXT: [[ARRAYIDX38:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i32 [[TMP29]]
+// CHECK11-NEXT: store i32 0, ptr [[ARRAYIDX38]], align 4
+// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK11: omp.body.continue:
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK11: omp.inner.for.inc:
+// CHECK11-NEXT: [[TMP30:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK11-NEXT: [[ADD39:%.*]] = add nsw i64 [[TMP30]], 1
+// CHECK11-NEXT: store i64 [[ADD39]], ptr [[DOTOMP_IV]], align 8
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK11: omp.inner.for.end:
+// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK11: omp.loop.exit:
+// CHECK11-NEXT: [[TMP31:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP32]])
+// CHECK11-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK11: omp.precond.end:
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiLi10ELi2EEiT_
+// CHECK11-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[A:%.*]] = alloca [10 x [2 x i32]], align 4
+// CHECK11-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
+// CHECK11-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
+// CHECK11-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
+// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK11-NEXT: store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK11-NEXT: store ptr [[A]], ptr [[TMP0]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK11-NEXT: store ptr [[A]], ptr [[TMP1]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK11-NEXT: store ptr null, ptr [[TMP2]], align 4
+// CHECK11-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK11-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK11-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK11-NEXT: store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK11-NEXT: store i32 1, ptr [[TMP6]], align 4
+// CHECK11-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK11-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 4
+// CHECK11-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK11-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 4
+// CHECK11-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK11-NEXT: store ptr @.offload_sizes.1, ptr [[TMP9]], align 4
+// CHECK11-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK11-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP10]], align 4
+// CHECK11-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK11-NEXT: store ptr null, ptr [[TMP11]], align 4
+// CHECK11-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK11-NEXT: store ptr null, ptr [[TMP12]], align 4
+// CHECK11-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK11-NEXT: store i64 20, ptr [[TMP13]], align 8
+// CHECK11-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK11-NEXT: store i64 0, ptr [[TMP14]], align 8
+// CHECK11-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP15]], align 4
+// CHECK11-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4
+// CHECK11-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK11-NEXT: store i32 0, ptr [[TMP17]], align 4
+// CHECK11-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l68.region_id, ptr [[KERNEL_ARGS]])
+// CHECK11-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK11-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK11: omp_offload.failed:
+// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l68(ptr [[A]]) #[[ATTR3]]
+// CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK11: omp_offload.cont:
+// CHECK11-NEXT: ret i32 0
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l68
+// CHECK11-SAME: (ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l68.omp_outlined, ptr [[TMP0]])
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l68.omp_outlined
+// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: store i32 19, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP2]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 19
+// CHECK11-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK11: cond.true:
+// CHECK11-NEXT: br label [[COND_END:%.*]]
+// CHECK11: cond.false:
+// CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: br label [[COND_END]]
+// CHECK11: cond.end:
+// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ 19, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK11-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK11: omp.inner.for.cond:
+// CHECK11-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK11-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK11: omp.inner.for.body:
+// CHECK11-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l68.omp_outlined.omp_outlined, i32 [[TMP8]], i32 [[TMP9]], ptr [[TMP0]])
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK11: omp.inner.for.inc:
+// CHECK11-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], [[TMP11]]
+// CHECK11-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK11: omp.inner.for.end:
+// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK11: omp.loop.exit:
+// CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l68.omp_outlined.omp_outlined
+// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK11-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK11-NEXT: store i32 19, ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[TMP1]], ptr [[DOTOMP_LB]], align 4
+// CHECK11-NEXT: store i32 [[TMP2]], ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP4]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP5]], 19
+// CHECK11-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK11: cond.true:
+// CHECK11-NEXT: br label [[COND_END:%.*]]
+// CHECK11: cond.false:
+// CHECK11-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: br label [[COND_END]]
+// CHECK11: cond.end:
+// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ 19, [[COND_TRUE]] ], [ [[TMP6]], [[COND_FALSE]] ]
+// CHECK11-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK11-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK11: omp.inner.for.cond:
+// CHECK11-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK11-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK11: omp.inner.for.body:
+// CHECK11-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP10]], 2
+// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK11-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK11-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[DIV3:%.*]] = sdiv i32 [[TMP12]], 2
+// CHECK11-NEXT: [[MUL4:%.*]] = mul nsw i32 [[DIV3]], 2
+// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP11]], [[MUL4]]
+// CHECK11-NEXT: [[MUL5:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK11-NEXT: [[ADD6:%.*]] = add nsw i32 0, [[MUL5]]
+// CHECK11-NEXT: store i32 [[ADD6]], ptr [[J]], align 4
+// CHECK11-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4
+// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [2 x i32]], ptr [[TMP0]], i32 0, i32 [[TMP13]]
+// CHECK11-NEXT: [[TMP14:%.*]] = load i32, ptr [[J]], align 4
+// CHECK11-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [2 x i32], ptr [[ARRAYIDX]], i32 0, i32 [[TMP14]]
+// CHECK11-NEXT: store i32 0, ptr [[ARRAYIDX7]], align 4
+// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK11: omp.body.continue:
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK11: omp.inner.for.inc:
+// CHECK11-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK11-NEXT: store i32 [[ADD8]], ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK11: omp.inner.for.end:
+// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK11: omp.loop.exit:
+// CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP4]])
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK11-SAME: () #[[ATTR6:[0-9]+]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK11-NEXT: ret void
+//
diff --git a/clang/test/OpenMP/target_teams_generic_loop_depend_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_depend_codegen.cpp
new file mode 100644
index 00000000000000..e1e93d98480835
--- /dev/null
+++ b/clang/test/OpenMP/target_teams_generic_loop_depend_codegen.cpp
@@ -0,0 +1,3219 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// Test host codegen.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
+
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY1 %s
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+
+
+
+
+// Check target registration is registered as a Ctor.
+
+
+template<typename tx, typename ty>
+struct TT{
+ tx X;
+ ty Y;
+};
+
+int global;
+extern int global;
+
+int foo(int n) {
+ int a = 0;
+ short aa = 0;
+ float b[10];
+ float bn[n];
+ double c[5][10];
+ double cn[5][n];
+ TT<long long, char> d;
+ static long *plocal;
+
+ #pragma omp target teams loop device(global + a) depend(in: global) depend(out: a, b, cn[4])
+ for (int i = 0; i < 10; ++i) {
+ }
+
+
+
+
+
+
+
+ #pragma omp target teams loop device(global + a) depend(inout: global, a, bn) if(target:a)
+ for (int i = 0; i < *plocal; ++i) {
+ static int local1;
+ *plocal = global;
+ local1 = global;
+ }
+
+ #pragma omp target teams loop if(0) firstprivate(global) depend(out:global)
+ for (int i = 0; i < global; ++i) {
+ global += 1;
+ }
+
+ return a;
+}
+
+// Check that the offloading functions are emitted and that the arguments are
+// correct and loaded correctly for the target regions in foo().
+
+
+
+
+
+
+
+// Create stack storage and store argument in there.
+
+
+
+#endif
+// CHECK-64-LABEL: define {{[^@]+}}@_Z3fooi
+// CHECK-64-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-64-NEXT: entry:
+// CHECK-64-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[AA:%.*]] = alloca i16, align 2
+// CHECK-64-NEXT: [[B:%.*]] = alloca [10 x float], align 4
+// CHECK-64-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 8
+// CHECK-64-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[C:%.*]] = alloca [5 x [10 x double]], align 8
+// CHECK-64-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[D:%.*]] = alloca [[STRUCT_TT:%.*]], align 8
+// CHECK-64-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 4
+// CHECK-64-NEXT: [[DOTDEP_ARR_ADDR:%.*]] = alloca [4 x %struct.kmp_depend_info], align 8
+// CHECK-64-NEXT: [[DEP_COUNTER_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[GLOBAL_CASTED:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [2 x i8*], align 8
+// CHECK-64-NEXT: [[AGG_CAPTURED4:%.*]] = alloca [[STRUCT_ANON_0:%.*]], align 8
+// CHECK-64-NEXT: [[DOTDEP_ARR_ADDR5:%.*]] = alloca [3 x %struct.kmp_depend_info], align 8
+// CHECK-64-NEXT: [[DEP_COUNTER_ADDR6:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[AGG_CAPTURED7:%.*]] = alloca [[STRUCT_ANON_0]], align 8
+// CHECK-64-NEXT: [[DOTDEP_ARR_ADDR8:%.*]] = alloca [3 x %struct.kmp_depend_info], align 8
+// CHECK-64-NEXT: [[DEP_COUNTER_ADDR9:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[GLOBAL_CASTED10:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[AGG_CAPTURED12:%.*]] = alloca [[STRUCT_ANON_4:%.*]], align 4
+// CHECK-64-NEXT: [[DOTDEP_ARR_ADDR13:%.*]] = alloca [1 x %struct.kmp_depend_info], align 8
+// CHECK-64-NEXT: [[DEP_COUNTER_ADDR14:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK-64-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK-64-NEXT: store i32 0, i32* [[A]], align 4
+// CHECK-64-NEXT: store i16 0, i16* [[AA]], align 2
+// CHECK-64-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK-64-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+// CHECK-64-NEXT: [[TMP3:%.*]] = call i8* @llvm.stacksave()
+// CHECK-64-NEXT: store i8* [[TMP3]], i8** [[SAVED_STACK]], align 8
+// CHECK-64-NEXT: [[VLA:%.*]] = alloca float, i64 [[TMP2]], align 4
+// CHECK-64-NEXT: store i64 [[TMP2]], i64* [[__VLA_EXPR0]], align 8
+// CHECK-64-NEXT: [[TMP4:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK-64-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-64-NEXT: [[TMP6:%.*]] = mul nuw i64 5, [[TMP5]]
+// CHECK-64-NEXT: [[VLA1:%.*]] = alloca double, i64 [[TMP6]], align 8
+// CHECK-64-NEXT: store i64 [[TMP5]], i64* [[__VLA_EXPR1]], align 8
+// CHECK-64-NEXT: [[TMP7:%.*]] = load i32, i32* @global, align 4
+// CHECK-64-NEXT: [[TMP8:%.*]] = load i32, i32* [[A]], align 4
+// CHECK-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], [[TMP8]]
+// CHECK-64-NEXT: store i32 [[ADD]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[AGG_CAPTURED]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-64-NEXT: store i32 [[TMP10]], i32* [[TMP9]], align 4
+// CHECK-64-NEXT: [[TMP11:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i64 40, i64 4, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @.omp_task_entry. to i32 (i32, i8*)*))
+// CHECK-64-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to %struct.kmp_task_t_with_privates*
+// CHECK-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP12]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP13]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP15:%.*]] = load i8*, i8** [[TMP14]], align 8
+// CHECK-64-NEXT: [[TMP16:%.*]] = bitcast %struct.anon* [[AGG_CAPTURED]] to i8*
+// CHECK-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP15]], i8* align 4 [[TMP16]], i64 4, i1 false)
+// CHECK-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* [[DOTDEP_ARR_ADDR]], i64 0, i64 0
+// CHECK-64-NEXT: [[TMP18:%.*]] = getelementptr [[STRUCT_KMP_DEPEND_INFO:%.*]], %struct.kmp_depend_info* [[TMP17]], i64 0
+// CHECK-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP18]], i32 0, i32 0
+// CHECK-64-NEXT: store i64 ptrtoint (i32* @global to i64), i64* [[TMP19]], align 8
+// CHECK-64-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP18]], i32 0, i32 1
+// CHECK-64-NEXT: store i64 4, i64* [[TMP20]], align 8
+// CHECK-64-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP18]], i32 0, i32 2
+// CHECK-64-NEXT: store i8 1, i8* [[TMP21]], align 8
+// CHECK-64-NEXT: [[TMP22:%.*]] = ptrtoint i32* [[A]] to i64
+// CHECK-64-NEXT: [[TMP23:%.*]] = getelementptr [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP17]], i64 1
+// CHECK-64-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP23]], i32 0, i32 0
+// CHECK-64-NEXT: store i64 [[TMP22]], i64* [[TMP24]], align 8
+// CHECK-64-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP23]], i32 0, i32 1
+// CHECK-64-NEXT: store i64 4, i64* [[TMP25]], align 8
+// CHECK-64-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP23]], i32 0, i32 2
+// CHECK-64-NEXT: store i8 3, i8* [[TMP26]], align 8
+// CHECK-64-NEXT: [[TMP27:%.*]] = ptrtoint [10 x float]* [[B]] to i64
+// CHECK-64-NEXT: [[TMP28:%.*]] = getelementptr [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP17]], i64 2
+// CHECK-64-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP28]], i32 0, i32 0
+// CHECK-64-NEXT: store i64 [[TMP27]], i64* [[TMP29]], align 8
+// CHECK-64-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP28]], i32 0, i32 1
+// CHECK-64-NEXT: store i64 40, i64* [[TMP30]], align 8
+// CHECK-64-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP28]], i32 0, i32 2
+// CHECK-64-NEXT: store i8 3, i8* [[TMP31]], align 8
+// CHECK-64-NEXT: [[TMP32:%.*]] = mul nsw i64 4, [[TMP5]]
+// CHECK-64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VLA1]], i64 [[TMP32]]
+// CHECK-64-NEXT: [[TMP33:%.*]] = mul nuw i64 [[TMP5]], 8
+// CHECK-64-NEXT: [[TMP34:%.*]] = ptrtoint double* [[ARRAYIDX]] to i64
+// CHECK-64-NEXT: [[TMP35:%.*]] = getelementptr [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP17]], i64 3
+// CHECK-64-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP35]], i32 0, i32 0
+// CHECK-64-NEXT: store i64 [[TMP34]], i64* [[TMP36]], align 8
+// CHECK-64-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP35]], i32 0, i32 1
+// CHECK-64-NEXT: store i64 [[TMP33]], i64* [[TMP37]], align 8
+// CHECK-64-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP35]], i32 0, i32 2
+// CHECK-64-NEXT: store i8 3, i8* [[TMP38]], align 8
+// CHECK-64-NEXT: store i64 4, i64* [[DEP_COUNTER_ADDR]], align 8
+// CHECK-64-NEXT: [[TMP39:%.*]] = bitcast %struct.kmp_depend_info* [[TMP17]] to i8*
+// CHECK-64-NEXT: call void @__kmpc_omp_taskwait_deps_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 4, i8* [[TMP39]], i32 0, i8* null, i32 0)
+// CHECK-64-NEXT: call void @__kmpc_omp_task_begin_if0(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i8* [[TMP11]])
+// CHECK-64-NEXT: [[TMP40:%.*]] = call i32 @.omp_task_entry.(i32 [[TMP0]], %struct.kmp_task_t_with_privates* [[TMP12]]) #[[ATTR3:[0-9]+]]
+// CHECK-64-NEXT: call void @__kmpc_omp_task_complete_if0(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i8* [[TMP11]])
+// CHECK-64-NEXT: [[TMP41:%.*]] = load i32, i32* @global, align 4
+// CHECK-64-NEXT: [[TMP42:%.*]] = load i32, i32* [[A]], align 4
+// CHECK-64-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP41]], [[TMP42]]
+// CHECK-64-NEXT: store i32 [[ADD3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-64-NEXT: [[TMP43:%.*]] = load i64*, i64** @_ZZ3fooiE6plocal, align 8
+// CHECK-64-NEXT: [[TMP44:%.*]] = load i32, i32* @global, align 4
+// CHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[GLOBAL_CASTED]] to i32*
+// CHECK-64-NEXT: store i32 [[TMP44]], i32* [[CONV]], align 4
+// CHECK-64-NEXT: [[TMP45:%.*]] = load i64, i64* [[GLOBAL_CASTED]], align 8
+// CHECK-64-NEXT: [[TMP46:%.*]] = load i32, i32* [[A]], align 4
+// CHECK-64-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP46]], 0
+// CHECK-64-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]]
+// CHECK-64: omp_if.then:
+// CHECK-64-NEXT: [[TMP47:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP48:%.*]] = bitcast i8** [[TMP47]] to i64**
+// CHECK-64-NEXT: store i64* [[TMP43]], i64** [[TMP48]], align 8
+// CHECK-64-NEXT: [[TMP49:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP50:%.*]] = bitcast i8** [[TMP49]] to i64**
+// CHECK-64-NEXT: store i64* [[TMP43]], i64** [[TMP50]], align 8
+// CHECK-64-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK-64-NEXT: store i8* null, i8** [[TMP51]], align 8
+// CHECK-64-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK-64-NEXT: [[TMP53:%.*]] = bitcast i8** [[TMP52]] to i64*
+// CHECK-64-NEXT: store i64 [[TMP45]], i64* [[TMP53]], align 8
+// CHECK-64-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK-64-NEXT: [[TMP55:%.*]] = bitcast i8** [[TMP54]] to i64*
+// CHECK-64-NEXT: store i64 [[TMP45]], i64* [[TMP55]], align 8
+// CHECK-64-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
+// CHECK-64-NEXT: store i8* null, i8** [[TMP56]], align 8
+// CHECK-64-NEXT: [[TMP57:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP59:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], %struct.anon.0* [[AGG_CAPTURED4]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP60:%.*]] = load i64*, i64** @_ZZ3fooiE6plocal, align 8
+// CHECK-64-NEXT: store i64* [[TMP60]], i64** [[TMP59]], align 8
+// CHECK-64-NEXT: [[TMP61:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], %struct.anon.0* [[AGG_CAPTURED4]], i32 0, i32 1
+// CHECK-64-NEXT: [[TMP62:%.*]] = load i32, i32* @global, align 4
+// CHECK-64-NEXT: store i32 [[TMP62]], i32* [[TMP61]], align 8
+// CHECK-64-NEXT: [[TMP63:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], %struct.anon.0* [[AGG_CAPTURED4]], i32 0, i32 2
+// CHECK-64-NEXT: [[TMP64:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-64-NEXT: store i32 [[TMP64]], i32* [[TMP63]], align 4
+// CHECK-64-NEXT: [[TMP65:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i64 104, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.1*)* @.omp_task_entry..6 to i32 (i32, i8*)*))
+// CHECK-64-NEXT: [[TMP66:%.*]] = bitcast i8* [[TMP65]] to %struct.kmp_task_t_with_privates.1*
+// CHECK-64-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_1:%.*]], %struct.kmp_task_t_with_privates.1* [[TMP66]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP68:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP67]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP69:%.*]] = load i8*, i8** [[TMP68]], align 8
+// CHECK-64-NEXT: [[TMP70:%.*]] = bitcast %struct.anon.0* [[AGG_CAPTURED4]] to i8*
+// CHECK-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP69]], i8* align 8 [[TMP70]], i64 16, i1 false)
+// CHECK-64-NEXT: [[TMP71:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_1]], %struct.kmp_task_t_with_privates.1* [[TMP66]], i32 0, i32 1
+// CHECK-64-NEXT: [[TMP72:%.*]] = bitcast i8* [[TMP69]] to %struct.anon.0*
+// CHECK-64-NEXT: [[TMP73:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP71]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP74:%.*]] = load i64*, i64** @_ZZ3fooiE6plocal, align 8
+// CHECK-64-NEXT: store i64* [[TMP74]], i64** [[TMP73]], align 8
+// CHECK-64-NEXT: [[TMP75:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP71]], i32 0, i32 1
+// CHECK-64-NEXT: [[TMP76:%.*]] = bitcast [2 x i8*]* [[TMP75]] to i8*
+// CHECK-64-NEXT: [[TMP77:%.*]] = bitcast i8** [[TMP57]] to i8*
+// CHECK-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP76]], i8* align 8 [[TMP77]], i64 16, i1 false)
+// CHECK-64-NEXT: [[TMP78:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP71]], i32 0, i32 2
+// CHECK-64-NEXT: [[TMP79:%.*]] = bitcast [2 x i8*]* [[TMP78]] to i8*
+// CHECK-64-NEXT: [[TMP80:%.*]] = bitcast i8** [[TMP58]] to i8*
+// CHECK-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP79]], i8* align 8 [[TMP80]], i64 16, i1 false)
+// CHECK-64-NEXT: [[TMP81:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP71]], i32 0, i32 3
+// CHECK-64-NEXT: [[TMP82:%.*]] = bitcast [2 x i64]* [[TMP81]] to i8*
+// CHECK-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP82]], i8* align 8 bitcast ([2 x i64]* @.offload_sizes to i8*), i64 16, i1 false)
+// CHECK-64-NEXT: [[TMP83:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP71]], i32 0, i32 4
+// CHECK-64-NEXT: [[TMP84:%.*]] = load i32, i32* @global, align 4
+// CHECK-64-NEXT: store i32 [[TMP84]], i32* [[TMP83]], align 8
+// CHECK-64-NEXT: [[TMP85:%.*]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* [[DOTDEP_ARR_ADDR5]], i64 0, i64 0
+// CHECK-64-NEXT: [[TMP86:%.*]] = getelementptr [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP85]], i64 0
+// CHECK-64-NEXT: [[TMP87:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP86]], i32 0, i32 0
+// CHECK-64-NEXT: store i64 ptrtoint (i32* @global to i64), i64* [[TMP87]], align 8
+// CHECK-64-NEXT: [[TMP88:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP86]], i32 0, i32 1
+// CHECK-64-NEXT: store i64 4, i64* [[TMP88]], align 8
+// CHECK-64-NEXT: [[TMP89:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP86]], i32 0, i32 2
+// CHECK-64-NEXT: store i8 3, i8* [[TMP89]], align 8
+// CHECK-64-NEXT: [[TMP90:%.*]] = ptrtoint i32* [[A]] to i64
+// CHECK-64-NEXT: [[TMP91:%.*]] = getelementptr [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP85]], i64 1
+// CHECK-64-NEXT: [[TMP92:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP91]], i32 0, i32 0
+// CHECK-64-NEXT: store i64 [[TMP90]], i64* [[TMP92]], align 8
+// CHECK-64-NEXT: [[TMP93:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP91]], i32 0, i32 1
+// CHECK-64-NEXT: store i64 4, i64* [[TMP93]], align 8
+// CHECK-64-NEXT: [[TMP94:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP91]], i32 0, i32 2
+// CHECK-64-NEXT: store i8 3, i8* [[TMP94]], align 8
+// CHECK-64-NEXT: [[TMP95:%.*]] = mul nuw i64 [[TMP2]], 4
+// CHECK-64-NEXT: [[TMP96:%.*]] = ptrtoint float* [[VLA]] to i64
+// CHECK-64-NEXT: [[TMP97:%.*]] = getelementptr [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP85]], i64 2
+// CHECK-64-NEXT: [[TMP98:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP97]], i32 0, i32 0
+// CHECK-64-NEXT: store i64 [[TMP96]], i64* [[TMP98]], align 8
+// CHECK-64-NEXT: [[TMP99:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP97]], i32 0, i32 1
+// CHECK-64-NEXT: store i64 [[TMP95]], i64* [[TMP99]], align 8
+// CHECK-64-NEXT: [[TMP100:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP97]], i32 0, i32 2
+// CHECK-64-NEXT: store i8 3, i8* [[TMP100]], align 8
+// CHECK-64-NEXT: store i64 3, i64* [[DEP_COUNTER_ADDR6]], align 8
+// CHECK-64-NEXT: [[TMP101:%.*]] = bitcast %struct.kmp_depend_info* [[TMP85]] to i8*
+// CHECK-64-NEXT: call void @__kmpc_omp_taskwait_deps_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 3, i8* [[TMP101]], i32 0, i8* null, i32 0)
+// CHECK-64-NEXT: call void @__kmpc_omp_task_begin_if0(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i8* [[TMP65]])
+// CHECK-64-NEXT: [[TMP102:%.*]] = call i32 @.omp_task_entry..6(i32 [[TMP0]], %struct.kmp_task_t_with_privates.1* [[TMP66]]) #[[ATTR3]]
+// CHECK-64-NEXT: call void @__kmpc_omp_task_complete_if0(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i8* [[TMP65]])
+// CHECK-64-NEXT: br label [[OMP_IF_END:%.*]]
+// CHECK-64: omp_if.else:
+// CHECK-64-NEXT: [[TMP103:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], %struct.anon.0* [[AGG_CAPTURED7]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP104:%.*]] = load i64*, i64** @_ZZ3fooiE6plocal, align 8
+// CHECK-64-NEXT: store i64* [[TMP104]], i64** [[TMP103]], align 8
+// CHECK-64-NEXT: [[TMP105:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], %struct.anon.0* [[AGG_CAPTURED7]], i32 0, i32 1
+// CHECK-64-NEXT: [[TMP106:%.*]] = load i32, i32* @global, align 4
+// CHECK-64-NEXT: store i32 [[TMP106]], i32* [[TMP105]], align 8
+// CHECK-64-NEXT: [[TMP107:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], %struct.anon.0* [[AGG_CAPTURED7]], i32 0, i32 2
+// CHECK-64-NEXT: [[TMP108:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-64-NEXT: store i32 [[TMP108]], i32* [[TMP107]], align 4
+// CHECK-64-NEXT: [[TMP109:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i64 56, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.2*)* @.omp_task_entry..9 to i32 (i32, i8*)*))
+// CHECK-64-NEXT: [[TMP110:%.*]] = bitcast i8* [[TMP109]] to %struct.kmp_task_t_with_privates.2*
+// CHECK-64-NEXT: [[TMP111:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_2:%.*]], %struct.kmp_task_t_with_privates.2* [[TMP110]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP112:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP111]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP113:%.*]] = load i8*, i8** [[TMP112]], align 8
+// CHECK-64-NEXT: [[TMP114:%.*]] = bitcast %struct.anon.0* [[AGG_CAPTURED7]] to i8*
+// CHECK-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP113]], i8* align 8 [[TMP114]], i64 16, i1 false)
+// CHECK-64-NEXT: [[TMP115:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_2]], %struct.kmp_task_t_with_privates.2* [[TMP110]], i32 0, i32 1
+// CHECK-64-NEXT: [[TMP116:%.*]] = bitcast i8* [[TMP113]] to %struct.anon.0*
+// CHECK-64-NEXT: [[TMP117:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3:%.*]], %struct..kmp_privates.t.3* [[TMP115]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP118:%.*]] = load i64*, i64** @_ZZ3fooiE6plocal, align 8
+// CHECK-64-NEXT: store i64* [[TMP118]], i64** [[TMP117]], align 8
+// CHECK-64-NEXT: [[TMP119:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3]], %struct..kmp_privates.t.3* [[TMP115]], i32 0, i32 1
+// CHECK-64-NEXT: [[TMP120:%.*]] = load i32, i32* @global, align 4
+// CHECK-64-NEXT: store i32 [[TMP120]], i32* [[TMP119]], align 8
+// CHECK-64-NEXT: [[TMP121:%.*]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* [[DOTDEP_ARR_ADDR8]], i64 0, i64 0
+// CHECK-64-NEXT: [[TMP122:%.*]] = getelementptr [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP121]], i64 0
+// CHECK-64-NEXT: [[TMP123:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP122]], i32 0, i32 0
+// CHECK-64-NEXT: store i64 ptrtoint (i32* @global to i64), i64* [[TMP123]], align 8
+// CHECK-64-NEXT: [[TMP124:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP122]], i32 0, i32 1
+// CHECK-64-NEXT: store i64 4, i64* [[TMP124]], align 8
+// CHECK-64-NEXT: [[TMP125:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP122]], i32 0, i32 2
+// CHECK-64-NEXT: store i8 3, i8* [[TMP125]], align 8
+// CHECK-64-NEXT: [[TMP126:%.*]] = ptrtoint i32* [[A]] to i64
+// CHECK-64-NEXT: [[TMP127:%.*]] = getelementptr [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP121]], i64 1
+// CHECK-64-NEXT: [[TMP128:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP127]], i32 0, i32 0
+// CHECK-64-NEXT: store i64 [[TMP126]], i64* [[TMP128]], align 8
+// CHECK-64-NEXT: [[TMP129:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP127]], i32 0, i32 1
+// CHECK-64-NEXT: store i64 4, i64* [[TMP129]], align 8
+// CHECK-64-NEXT: [[TMP130:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP127]], i32 0, i32 2
+// CHECK-64-NEXT: store i8 3, i8* [[TMP130]], align 8
+// CHECK-64-NEXT: [[TMP131:%.*]] = mul nuw i64 [[TMP2]], 4
+// CHECK-64-NEXT: [[TMP132:%.*]] = ptrtoint float* [[VLA]] to i64
+// CHECK-64-NEXT: [[TMP133:%.*]] = getelementptr [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP121]], i64 2
+// CHECK-64-NEXT: [[TMP134:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP133]], i32 0, i32 0
+// CHECK-64-NEXT: store i64 [[TMP132]], i64* [[TMP134]], align 8
+// CHECK-64-NEXT: [[TMP135:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP133]], i32 0, i32 1
+// CHECK-64-NEXT: store i64 [[TMP131]], i64* [[TMP135]], align 8
+// CHECK-64-NEXT: [[TMP136:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP133]], i32 0, i32 2
+// CHECK-64-NEXT: store i8 3, i8* [[TMP136]], align 8
+// CHECK-64-NEXT: store i64 3, i64* [[DEP_COUNTER_ADDR9]], align 8
+// CHECK-64-NEXT: [[TMP137:%.*]] = bitcast %struct.kmp_depend_info* [[TMP121]] to i8*
+// CHECK-64-NEXT: call void @__kmpc_omp_taskwait_deps_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 3, i8* [[TMP137]], i32 0, i8* null, i32 0)
+// CHECK-64-NEXT: call void @__kmpc_omp_task_begin_if0(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i8* [[TMP109]])
+// CHECK-64-NEXT: [[TMP138:%.*]] = call i32 @.omp_task_entry..9(i32 [[TMP0]], %struct.kmp_task_t_with_privates.2* [[TMP110]]) #[[ATTR3]]
+// CHECK-64-NEXT: call void @__kmpc_omp_task_complete_if0(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i8* [[TMP109]])
+// CHECK-64-NEXT: br label [[OMP_IF_END]]
+// CHECK-64: omp_if.end:
+// CHECK-64-NEXT: [[TMP139:%.*]] = load i32, i32* @global, align 4
+// CHECK-64-NEXT: [[CONV11:%.*]] = bitcast i64* [[GLOBAL_CASTED10]] to i32*
+// CHECK-64-NEXT: store i32 [[TMP139]], i32* [[CONV11]], align 4
+// CHECK-64-NEXT: [[TMP140:%.*]] = load i64, i64* [[GLOBAL_CASTED10]], align 8
+// CHECK-64-NEXT: [[TMP141:%.*]] = getelementptr inbounds [[STRUCT_ANON_4]], %struct.anon.4* [[AGG_CAPTURED12]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP142:%.*]] = load i32, i32* @global, align 4
+// CHECK-64-NEXT: store i32 [[TMP142]], i32* [[TMP141]], align 4
+// CHECK-64-NEXT: [[TMP143:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i64 48, i64 4, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.5*)* @.omp_task_entry..14 to i32 (i32, i8*)*))
+// CHECK-64-NEXT: [[TMP144:%.*]] = bitcast i8* [[TMP143]] to %struct.kmp_task_t_with_privates.5*
+// CHECK-64-NEXT: [[TMP145:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_5:%.*]], %struct.kmp_task_t_with_privates.5* [[TMP144]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP146:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP145]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP147:%.*]] = load i8*, i8** [[TMP146]], align 8
+// CHECK-64-NEXT: [[TMP148:%.*]] = bitcast %struct.anon.4* [[AGG_CAPTURED12]] to i8*
+// CHECK-64-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP147]], i8* align 4 [[TMP148]], i64 4, i1 false)
+// CHECK-64-NEXT: [[TMP149:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_5]], %struct.kmp_task_t_with_privates.5* [[TMP144]], i32 0, i32 1
+// CHECK-64-NEXT: [[TMP150:%.*]] = bitcast i8* [[TMP147]] to %struct.anon.4*
+// CHECK-64-NEXT: [[TMP151:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_6:%.*]], %struct..kmp_privates.t.6* [[TMP149]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP152:%.*]] = load i32, i32* @global, align 4
+// CHECK-64-NEXT: store i32 [[TMP152]], i32* [[TMP151]], align 8
+// CHECK-64-NEXT: [[TMP153:%.*]] = getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* [[DOTDEP_ARR_ADDR13]], i64 0, i64 0
+// CHECK-64-NEXT: [[TMP154:%.*]] = getelementptr [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP153]], i64 0
+// CHECK-64-NEXT: [[TMP155:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP154]], i32 0, i32 0
+// CHECK-64-NEXT: store i64 ptrtoint (i32* @global to i64), i64* [[TMP155]], align 8
+// CHECK-64-NEXT: [[TMP156:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP154]], i32 0, i32 1
+// CHECK-64-NEXT: store i64 4, i64* [[TMP156]], align 8
+// CHECK-64-NEXT: [[TMP157:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP154]], i32 0, i32 2
+// CHECK-64-NEXT: store i8 3, i8* [[TMP157]], align 8
+// CHECK-64-NEXT: store i64 1, i64* [[DEP_COUNTER_ADDR14]], align 8
+// CHECK-64-NEXT: [[TMP158:%.*]] = bitcast %struct.kmp_depend_info* [[TMP153]] to i8*
+// CHECK-64-NEXT: call void @__kmpc_omp_taskwait_deps_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i8* [[TMP158]], i32 0, i8* null, i32 0)
+// CHECK-64-NEXT: call void @__kmpc_omp_task_begin_if0(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i8* [[TMP143]])
+// CHECK-64-NEXT: [[TMP159:%.*]] = call i32 @.omp_task_entry..14(i32 [[TMP0]], %struct.kmp_task_t_with_privates.5* [[TMP144]]) #[[ATTR3]]
+// CHECK-64-NEXT: call void @__kmpc_omp_task_complete_if0(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i8* [[TMP143]])
+// CHECK-64-NEXT: [[TMP160:%.*]] = load i32, i32* [[A]], align 4
+// CHECK-64-NEXT: [[TMP161:%.*]] = load i8*, i8** [[SAVED_STACK]], align 8
+// CHECK-64-NEXT: call void @llvm.stackrestore(i8* [[TMP161]])
+// CHECK-64-NEXT: ret i32 [[TMP160]]
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l66
+// CHECK-64-SAME: () #[[ATTR2:[0-9]+]] {
+// CHECK-64-NEXT: entry:
+// CHECK-64-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @[[GLOB3]], i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*))
+// CHECK-64-NEXT: ret void
+// CHECK-64-LABEL: define {{[^@]+}}@.omp_outlined.
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] {
+// CHECK-64-NEXT: entry:
+// CHECK-64-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-64-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64: cond.true:
+// CHECK-64-NEXT: br label [[COND_END:%.*]]
+// CHECK-64: cond.false:
+// CHECK-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT: br label [[COND_END]]
+// CHECK-64: cond.end:
+// CHECK-64-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64: omp.inner.for.cond:
+// CHECK-64-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-64-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64: omp.inner.for.body:
+// CHECK-64-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-64-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-64-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB3]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i64 [[TMP8]], i64 [[TMP10]])
+// CHECK-64-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64: omp.inner.for.inc:
+// CHECK-64-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK-64-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-64: omp.inner.for.end:
+// CHECK-64-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64: omp.loop.exit:
+// CHECK-64-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]])
+// CHECK-64-NEXT: ret void
+// CHECK-64-LABEL: define {{[^@]+}}@.omp_outlined..1
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] {
+// CHECK-64-NEXT: entry:
+// CHECK-64-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT: [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-64-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
+// CHECK-64-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64: cond.true:
+// CHECK-64-NEXT: br label [[COND_END:%.*]]
+// CHECK-64: cond.false:
+// CHECK-64-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT: br label [[COND_END]]
+// CHECK-64: cond.end:
+// CHECK-64-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK-64-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64: omp.inner.for.cond:
+// CHECK-64-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-64-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64: omp.inner.for.body:
+// CHECK-64-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-64-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-64-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64: omp.body.continue:
+// CHECK-64-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64: omp.inner.for.inc:
+// CHECK-64-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-64: omp.inner.for.end:
+// CHECK-64-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64: omp.loop.exit:
+// CHECK-64-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-64-NEXT: ret void
+// CHECK-64-LABEL: define {{[^@]+}}@.omp_task_entry.
+// CHECK-64-SAME: (i32 noundef signext [[TMP0:%.*]], %struct.kmp_task_t_with_privates* noalias noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK-64-NEXT: entry:
+// CHECK-64-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK-64-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca void (i8*, ...)*, align 8
+// CHECK-64-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK-64-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca %struct.anon*, align 8
+// CHECK-64-NEXT: [[KERNEL_ARGS_I:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK-64-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates*, align 8
+// CHECK-64-NEXT: store i32 [[TMP0]], i32* [[DOTADDR]], align 4
+// CHECK-64-NEXT: store %struct.kmp_task_t_with_privates* [[TMP1]], %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8
+// CHECK-64-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTADDR]], align 4
+// CHECK-64-NEXT: [[TMP3:%.*]] = load %struct.kmp_task_t_with_privates*, %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 8
+// CHECK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP3]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 2
+// CHECK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP7:%.*]] = load i8*, i8** [[TMP6]], align 8
+// CHECK-64-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct.anon*
+// CHECK-64-NEXT: [[TMP9:%.*]] = bitcast %struct.kmp_task_t_with_privates* [[TMP3]] to i8*
+// CHECK-64-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META7:![0-9]+]])
+// CHECK-64-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]])
+// CHECK-64-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]])
+// CHECK-64-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META14:![0-9]+]])
+// CHECK-64-NEXT: store i32 [[TMP2]], i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !16
+// CHECK-64-NEXT: store i32* [[TMP5]], i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !16
+// CHECK-64-NEXT: store i8* null, i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !16
+// CHECK-64-NEXT: store void (i8*, ...)* null, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !16
+// CHECK-64-NEXT: store i8* [[TMP9]], i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !16
+// CHECK-64-NEXT: store %struct.anon* [[TMP8]], %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !16
+// CHECK-64-NEXT: [[TMP10:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !16
+// CHECK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP10]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4
+// CHECK-64-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
+// CHECK-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 0
+// CHECK-64-NEXT: store i32 2, i32* [[TMP14]], align 4, !noalias !16
+// CHECK-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 1
+// CHECK-64-NEXT: store i32 0, i32* [[TMP15]], align 4, !noalias !16
+// CHECK-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 2
+// CHECK-64-NEXT: store i8** null, i8*** [[TMP16]], align 8, !noalias !16
+// CHECK-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 3
+// CHECK-64-NEXT: store i8** null, i8*** [[TMP17]], align 8, !noalias !16
+// CHECK-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 4
+// CHECK-64-NEXT: store i64* null, i64** [[TMP18]], align 8, !noalias !16
+// CHECK-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 5
+// CHECK-64-NEXT: store i64* null, i64** [[TMP19]], align 8, !noalias !16
+// CHECK-64-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 6
+// CHECK-64-NEXT: store i8** null, i8*** [[TMP20]], align 8, !noalias !16
+// CHECK-64-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 7
+// CHECK-64-NEXT: store i8** null, i8*** [[TMP21]], align 8, !noalias !16
+// CHECK-64-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
+// CHECK-64-NEXT: store i64 10, i64* [[TMP22]], align 8, !noalias !16
+// CHECK-64-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 9
+// CHECK-64-NEXT: store i64 0, i64* [[TMP23]], align 8, !noalias !16
+// CHECK-64-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 10
+// CHECK-64-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP24]], align 4, !noalias !16
+// CHECK-64-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 11
+// CHECK-64-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP25]], align 4, !noalias !16
+// CHECK-64-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 12
+// CHECK-64-NEXT: store i32 0, i32* [[TMP26]], align 4, !noalias !16
+// CHECK-64-NEXT: [[TMP27:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB3]], i64 [[TMP13]], i32 0, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l66.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]])
+// CHECK-64-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0
+// CHECK-64-NEXT: br i1 [[TMP28]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__2_EXIT:%.*]]
+// CHECK-64: omp_offload.failed.i:
+// CHECK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l66() #[[ATTR3]]
+// CHECK-64-NEXT: br label [[DOTOMP_OUTLINED__2_EXIT]]
+// CHECK-64: .omp_outlined..2.exit:
+// CHECK-64-NEXT: ret i32 0
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l76
+// CHECK-64-SAME: (i64* noundef [[PLOCAL:%.*]], i64 noundef [[GLOBAL:%.*]]) #[[ATTR2]] {
+// CHECK-64-NEXT: entry:
+// CHECK-64-NEXT: [[PLOCAL_ADDR:%.*]] = alloca i64*, align 8
+// CHECK-64-NEXT: [[GLOBAL_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[GLOBAL_CASTED:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: store i64* [[PLOCAL]], i64** [[PLOCAL_ADDR]], align 8
+// CHECK-64-NEXT: store i64 [[GLOBAL]], i64* [[GLOBAL_ADDR]], align 8
+// CHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[GLOBAL_ADDR]] to i32*
+// CHECK-64-NEXT: [[TMP0:%.*]] = load i64*, i64** [[PLOCAL_ADDR]], align 8
+// CHECK-64-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 4
+// CHECK-64-NEXT: [[CONV1:%.*]] = bitcast i64* [[GLOBAL_CASTED]] to i32*
+// CHECK-64-NEXT: store i32 [[TMP1]], i32* [[CONV1]], align 4
+// CHECK-64-NEXT: [[TMP2:%.*]] = load i64, i64* [[GLOBAL_CASTED]], align 8
+// CHECK-64-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @[[GLOB3]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64*, i64)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i64* [[TMP0]], i64 [[TMP2]])
+// CHECK-64-NEXT: ret void
+// CHECK-64-LABEL: define {{[^@]+}}@.omp_outlined..3
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64* noundef [[PLOCAL:%.*]], i64 noundef [[GLOBAL:%.*]]) #[[ATTR2]] {
+// CHECK-64-NEXT: entry:
+// CHECK-64-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT: [[PLOCAL_ADDR:%.*]] = alloca i64*, align 8
+// CHECK-64-NEXT: [[GLOBAL_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[GLOBAL_CASTED:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT: store i64* [[PLOCAL]], i64** [[PLOCAL_ADDR]], align 8
+// CHECK-64-NEXT: store i64 [[GLOBAL]], i64* [[GLOBAL_ADDR]], align 8
+// CHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[GLOBAL_ADDR]] to i32*
+// CHECK-64-NEXT: [[TMP0:%.*]] = load i64*, i64** [[PLOCAL_ADDR]], align 8
+// CHECK-64-NEXT: [[TMP1:%.*]] = load i64, i64* [[TMP0]], align 8
+// CHECK-64-NEXT: store i64 [[TMP1]], i64* [[DOTCAPTURE_EXPR_]], align 8
+// CHECK-64-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_]], align 8
+// CHECK-64-NEXT: [[SUB:%.*]] = sub nsw i64 [[TMP2]], 0
+// CHECK-64-NEXT: [[DIV:%.*]] = sdiv i64 [[SUB]], 1
+// CHECK-64-NEXT: [[CONV2:%.*]] = trunc i64 [[DIV]] to i32
+// CHECK-64-NEXT: [[SUB3:%.*]] = sub nsw i32 [[CONV2]], 1
+// CHECK-64-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-64-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK-64-NEXT: [[TMP3:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_]], align 8
+// CHECK-64-NEXT: [[CMP:%.*]] = icmp slt i64 0, [[TMP3]]
+// CHECK-64-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK-64: omp.precond.then:
+// CHECK-64-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-64-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK-64-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-64-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK-64-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64: cond.true:
+// CHECK-64-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-64-NEXT: br label [[COND_END:%.*]]
+// CHECK-64: cond.false:
+// CHECK-64-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT: br label [[COND_END]]
+// CHECK-64: cond.end:
+// CHECK-64-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK-64-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64: omp.inner.for.cond:
+// CHECK-64-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// CHECK-64-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64: omp.inner.for.body:
+// CHECK-64-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+// CHECK-64-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK-64-NEXT: [[TMP18:%.*]] = load i64*, i64** [[PLOCAL_ADDR]], align 8
+// CHECK-64-NEXT: [[TMP19:%.*]] = load i32, i32* [[CONV]], align 4
+// CHECK-64-NEXT: [[CONV7:%.*]] = bitcast i64* [[GLOBAL_CASTED]] to i32*
+// CHECK-64-NEXT: store i32 [[TMP19]], i32* [[CONV7]], align 4
+// CHECK-64-NEXT: [[TMP20:%.*]] = load i64, i64* [[GLOBAL_CASTED]], align 8
+// CHECK-64-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB3]], i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i64*, i64)* @.omp_outlined..4 to void (i32*, i32*, ...)*), i64 [[TMP15]], i64 [[TMP17]], i64* [[TMP18]], i64 [[TMP20]])
+// CHECK-64-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64: omp.inner.for.inc:
+// CHECK-64-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// CHECK-64-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-64: omp.inner.for.end:
+// CHECK-64-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64: omp.loop.exit:
+// CHECK-64-NEXT: [[TMP23:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP23]], align 4
+// CHECK-64-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP24]])
+// CHECK-64-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK-64: omp.precond.end:
+// CHECK-64-NEXT: ret void
+// CHECK-64-LABEL: define {{[^@]+}}@.omp_outlined..4
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64* noundef [[PLOCAL:%.*]], i64 noundef [[GLOBAL:%.*]]) #[[ATTR2]] {
+// CHECK-64-NEXT: entry:
+// CHECK-64-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[PLOCAL_ADDR:%.*]] = alloca i64*, align 8
+// CHECK-64-NEXT: [[GLOBAL_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[I6:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT: store i64* [[PLOCAL]], i64** [[PLOCAL_ADDR]], align 8
+// CHECK-64-NEXT: store i64 [[GLOBAL]], i64* [[GLOBAL_ADDR]], align 8
+// CHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[GLOBAL_ADDR]] to i32*
+// CHECK-64-NEXT: [[TMP0:%.*]] = load i64*, i64** [[PLOCAL_ADDR]], align 8
+// CHECK-64-NEXT: [[TMP1:%.*]] = load i64, i64* [[TMP0]], align 8
+// CHECK-64-NEXT: store i64 [[TMP1]], i64* [[DOTCAPTURE_EXPR_]], align 8
+// CHECK-64-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_]], align 8
+// CHECK-64-NEXT: [[SUB:%.*]] = sub nsw i64 [[TMP2]], 0
+// CHECK-64-NEXT: [[DIV:%.*]] = sdiv i64 [[SUB]], 1
+// CHECK-64-NEXT: [[CONV2:%.*]] = trunc i64 [[DIV]] to i32
+// CHECK-64-NEXT: [[SUB3:%.*]] = sub nsw i32 [[CONV2]], 1
+// CHECK-64-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-64-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK-64-NEXT: [[TMP3:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_]], align 8
+// CHECK-64-NEXT: [[CMP:%.*]] = icmp slt i64 0, [[TMP3]]
+// CHECK-64-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK-64: omp.precond.then:
+// CHECK-64-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-64-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK-64-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK-64-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK-64-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-64-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+// CHECK-64-NEXT: br i1 [[CMP7]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64: cond.true:
+// CHECK-64-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-64-NEXT: br label [[COND_END:%.*]]
+// CHECK-64: cond.false:
+// CHECK-64-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT: br label [[COND_END]]
+// CHECK-64: cond.end:
+// CHECK-64-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK-64-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64: omp.inner.for.cond:
+// CHECK-64-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT: [[CMP8:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK-64-NEXT: br i1 [[CMP8]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64: omp.inner.for.body:
+// CHECK-64-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK-64-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT: store i32 [[ADD]], i32* [[I6]], align 4
+// CHECK-64-NEXT: [[TMP17:%.*]] = load i32, i32* [[CONV]], align 4
+// CHECK-64-NEXT: [[CONV9:%.*]] = sext i32 [[TMP17]] to i64
+// CHECK-64-NEXT: [[TMP18:%.*]] = load i64*, i64** [[PLOCAL_ADDR]], align 8
+// CHECK-64-NEXT: store i64 [[CONV9]], i64* [[TMP18]], align 8
+// CHECK-64-NEXT: [[TMP19:%.*]] = load i32, i32* [[CONV]], align 4
+// CHECK-64-NEXT: store i32 [[TMP19]], i32* @_ZZ3fooiE6local1, align 4
+// CHECK-64-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64: omp.body.continue:
+// CHECK-64-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64: omp.inner.for.inc:
+// CHECK-64-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP20]], 1
+// CHECK-64-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-64: omp.inner.for.end:
+// CHECK-64-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64: omp.loop.exit:
+// CHECK-64-NEXT: [[TMP21:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4
+// CHECK-64-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP22]])
+// CHECK-64-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK-64: omp.precond.end:
+// CHECK-64-NEXT: ret void
+// CHECK-64-LABEL: define {{[^@]+}}@.omp_task_privates_map.
+// CHECK-64-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], i64*** noalias noundef [[TMP1:%.*]], i32** noalias noundef [[TMP2:%.*]], [2 x i8*]** noalias noundef [[TMP3:%.*]], [2 x i8*]** noalias noundef [[TMP4:%.*]], [2 x i64]** noalias noundef [[TMP5:%.*]]) #[[ATTR7:[0-9]+]] {
+// CHECK-64-NEXT: entry:
+// CHECK-64-NEXT: [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t*, align 8
+// CHECK-64-NEXT: [[DOTADDR1:%.*]] = alloca i64***, align 8
+// CHECK-64-NEXT: [[DOTADDR2:%.*]] = alloca i32**, align 8
+// CHECK-64-NEXT: [[DOTADDR3:%.*]] = alloca [2 x i8*]**, align 8
+// CHECK-64-NEXT: [[DOTADDR4:%.*]] = alloca [2 x i8*]**, align 8
+// CHECK-64-NEXT: [[DOTADDR5:%.*]] = alloca [2 x i64]**, align 8
+// CHECK-64-NEXT: store %struct..kmp_privates.t* [[TMP0]], %struct..kmp_privates.t** [[DOTADDR]], align 8
+// CHECK-64-NEXT: store i64*** [[TMP1]], i64**** [[DOTADDR1]], align 8
+// CHECK-64-NEXT: store i32** [[TMP2]], i32*** [[DOTADDR2]], align 8
+// CHECK-64-NEXT: store [2 x i8*]** [[TMP3]], [2 x i8*]*** [[DOTADDR3]], align 8
+// CHECK-64-NEXT: store [2 x i8*]** [[TMP4]], [2 x i8*]*** [[DOTADDR4]], align 8
+// CHECK-64-NEXT: store [2 x i64]** [[TMP5]], [2 x i64]*** [[DOTADDR5]], align 8
+// CHECK-64-NEXT: [[TMP6:%.*]] = load %struct..kmp_privates.t*, %struct..kmp_privates.t** [[DOTADDR]], align 8
+// CHECK-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP6]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP8:%.*]] = load i64***, i64**** [[DOTADDR1]], align 8
+// CHECK-64-NEXT: store i64** [[TMP7]], i64*** [[TMP8]], align 8
+// CHECK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP6]], i32 0, i32 1
+// CHECK-64-NEXT: [[TMP10:%.*]] = load [2 x i8*]**, [2 x i8*]*** [[DOTADDR3]], align 8
+// CHECK-64-NEXT: store [2 x i8*]* [[TMP9]], [2 x i8*]** [[TMP10]], align 8
+// CHECK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP6]], i32 0, i32 2
+// CHECK-64-NEXT: [[TMP12:%.*]] = load [2 x i8*]**, [2 x i8*]*** [[DOTADDR4]], align 8
+// CHECK-64-NEXT: store [2 x i8*]* [[TMP11]], [2 x i8*]** [[TMP12]], align 8
+// CHECK-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP6]], i32 0, i32 3
+// CHECK-64-NEXT: [[TMP14:%.*]] = load [2 x i64]**, [2 x i64]*** [[DOTADDR5]], align 8
+// CHECK-64-NEXT: store [2 x i64]* [[TMP13]], [2 x i64]** [[TMP14]], align 8
+// CHECK-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP6]], i32 0, i32 4
+// CHECK-64-NEXT: [[TMP16:%.*]] = load i32**, i32*** [[DOTADDR2]], align 8
+// CHECK-64-NEXT: store i32* [[TMP15]], i32** [[TMP16]], align 8
+// CHECK-64-NEXT: ret void
+// CHECK-64-LABEL: define {{[^@]+}}@.omp_task_entry..6
+// CHECK-64-SAME: (i32 noundef signext [[TMP0:%.*]], %struct.kmp_task_t_with_privates.1* noalias noundef [[TMP1:%.*]]) #[[ATTR4]] {
+// CHECK-64-NEXT: entry:
+// CHECK-64-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK-64-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca void (i8*, ...)*, align 8
+// CHECK-64-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK-64-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca %struct.anon.0*, align 8
+// CHECK-64-NEXT: [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca i64**, align 8
+// CHECK-64-NEXT: [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT: [[DOTFIRSTPRIV_PTR_ADDR2_I:%.*]] = alloca [2 x i8*]*, align 8
+// CHECK-64-NEXT: [[DOTFIRSTPRIV_PTR_ADDR3_I:%.*]] = alloca [2 x i8*]*, align 8
+// CHECK-64-NEXT: [[DOTFIRSTPRIV_PTR_ADDR4_I:%.*]] = alloca [2 x i64]*, align 8
+// CHECK-64-NEXT: [[DOTCAPTURE_EXPR__I:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[DOTCAPTURE_EXPR_5_I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[GLOBAL_CASTED_I:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[KERNEL_ARGS_I:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK-64-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates.1*, align 8
+// CHECK-64-NEXT: store i32 [[TMP0]], i32* [[DOTADDR]], align 4
+// CHECK-64-NEXT: store %struct.kmp_task_t_with_privates.1* [[TMP1]], %struct.kmp_task_t_with_privates.1** [[DOTADDR1]], align 8
+// CHECK-64-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTADDR]], align 4
+// CHECK-64-NEXT: [[TMP3:%.*]] = load %struct.kmp_task_t_with_privates.1*, %struct.kmp_task_t_with_privates.1** [[DOTADDR1]], align 8
+// CHECK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_1:%.*]], %struct.kmp_task_t_with_privates.1* [[TMP3]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 2
+// CHECK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP7:%.*]] = load i8*, i8** [[TMP6]], align 8
+// CHECK-64-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct.anon.0*
+// CHECK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_1]], %struct.kmp_task_t_with_privates.1* [[TMP3]], i32 0, i32 1
+// CHECK-64-NEXT: [[TMP10:%.*]] = bitcast %struct..kmp_privates.t* [[TMP9]] to i8*
+// CHECK-64-NEXT: [[TMP11:%.*]] = bitcast %struct.kmp_task_t_with_privates.1* [[TMP3]] to i8*
+// CHECK-64-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META17:![0-9]+]])
+// CHECK-64-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META20:![0-9]+]])
+// CHECK-64-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META22:![0-9]+]])
+// CHECK-64-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META24:![0-9]+]])
+// CHECK-64-NEXT: store i32 [[TMP2]], i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !26
+// CHECK-64-NEXT: store i32* [[TMP5]], i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !26
+// CHECK-64-NEXT: store i8* [[TMP10]], i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !26
+// CHECK-64-NEXT: store void (i8*, ...)* bitcast (void (%struct..kmp_privates.t*, i64***, i32**, [2 x i8*]**, [2 x i8*]**, [2 x i64]**)* @.omp_task_privates_map. to void (i8*, ...)*), void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !26
+// CHECK-64-NEXT: store i8* [[TMP11]], i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !26
+// CHECK-64-NEXT: store %struct.anon.0* [[TMP8]], %struct.anon.0** [[__CONTEXT_ADDR_I]], align 8, !noalias !26
+// CHECK-64-NEXT: [[TMP12:%.*]] = load %struct.anon.0*, %struct.anon.0** [[__CONTEXT_ADDR_I]], align 8, !noalias !26
+// CHECK-64-NEXT: [[TMP13:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !26
+// CHECK-64-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !26
+// CHECK-64-NEXT: [[TMP15:%.*]] = bitcast void (i8*, ...)* [[TMP13]] to void (i8*, i64***, i32**, [2 x i8*]**, [2 x i8*]**, [2 x i64]**)*
+// CHECK-64-NEXT: call void [[TMP15]](i8* [[TMP14]], i64*** [[DOTFIRSTPRIV_PTR_ADDR_I]], i32** [[DOTFIRSTPRIV_PTR_ADDR1_I]], [2 x i8*]** [[DOTFIRSTPRIV_PTR_ADDR2_I]], [2 x i8*]** [[DOTFIRSTPRIV_PTR_ADDR3_I]], [2 x i64]** [[DOTFIRSTPRIV_PTR_ADDR4_I]]) #[[ATTR3]]
+// CHECK-64-NEXT: [[TMP16:%.*]] = load i64**, i64*** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !26
+// CHECK-64-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias !26
+// CHECK-64-NEXT: [[TMP18:%.*]] = load [2 x i8*]*, [2 x i8*]** [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias !26
+// CHECK-64-NEXT: [[TMP19:%.*]] = load [2 x i8*]*, [2 x i8*]** [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 8, !noalias !26
+// CHECK-64-NEXT: [[TMP20:%.*]] = load [2 x i64]*, [2 x i64]** [[DOTFIRSTPRIV_PTR_ADDR4_I]], align 8, !noalias !26
+// CHECK-64-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP18]], i64 0, i64 0
+// CHECK-64-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP19]], i64 0, i64 0
+// CHECK-64-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i64], [2 x i64]* [[TMP20]], i64 0, i64 0
+// CHECK-64-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_ANON_0:%.*]], %struct.anon.0* [[TMP12]], i32 0, i32 2
+// CHECK-64-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4
+// CHECK-64-NEXT: [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
+// CHECK-64-NEXT: [[TMP27:%.*]] = load i64*, i64** [[TMP16]], align 8
+// CHECK-64-NEXT: [[TMP28:%.*]] = load i64, i64* [[TMP27]], align 8
+// CHECK-64-NEXT: store i64 [[TMP28]], i64* [[DOTCAPTURE_EXPR__I]], align 8, !noalias !26
+// CHECK-64-NEXT: [[TMP29:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR__I]], align 8, !noalias !26
+// CHECK-64-NEXT: [[CONV_I:%.*]] = trunc i64 [[TMP29]] to i32
+// CHECK-64-NEXT: [[SUB6_I:%.*]] = sub nsw i32 [[CONV_I]], 1
+// CHECK-64-NEXT: store i32 [[SUB6_I]], i32* [[DOTCAPTURE_EXPR_5_I]], align 4, !noalias !26
+// CHECK-64-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_5_I]], align 4, !noalias !26
+// CHECK-64-NEXT: [[ADD_I:%.*]] = add nsw i32 [[TMP30]], 1
+// CHECK-64-NEXT: [[TMP31:%.*]] = zext i32 [[ADD_I]] to i64
+// CHECK-64-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 0
+// CHECK-64-NEXT: store i32 2, i32* [[TMP32]], align 4, !noalias !26
+// CHECK-64-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 1
+// CHECK-64-NEXT: store i32 2, i32* [[TMP33]], align 4, !noalias !26
+// CHECK-64-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 2
+// CHECK-64-NEXT: store i8** [[TMP21]], i8*** [[TMP34]], align 8, !noalias !26
+// CHECK-64-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 3
+// CHECK-64-NEXT: store i8** [[TMP22]], i8*** [[TMP35]], align 8, !noalias !26
+// CHECK-64-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 4
+// CHECK-64-NEXT: store i64* [[TMP23]], i64** [[TMP36]], align 8, !noalias !26
+// CHECK-64-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 5
+// CHECK-64-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes, i32 0, i32 0), i64** [[TMP37]], align 8, !noalias !26
+// CHECK-64-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 6
+// CHECK-64-NEXT: store i8** null, i8*** [[TMP38]], align 8, !noalias !26
+// CHECK-64-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 7
+// CHECK-64-NEXT: store i8** null, i8*** [[TMP39]], align 8, !noalias !26
+// CHECK-64-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
+// CHECK-64-NEXT: store i64 [[TMP31]], i64* [[TMP40]], align 8, !noalias !26
+// CHECK-64-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 9
+// CHECK-64-NEXT: store i64 0, i64* [[TMP41]], align 8, !noalias !26
+// CHECK-64-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 10
+// CHECK-64-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP42]], align 4, !noalias !26
+// CHECK-64-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 11
+// CHECK-64-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP43]], align 4, !noalias !26
+// CHECK-64-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 12
+// CHECK-64-NEXT: store i32 0, i32* [[TMP44]], align 4, !noalias !26
+// CHECK-64-NEXT: [[TMP45:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB3]], i64 [[TMP26]], i32 0, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l76.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]])
+// CHECK-64-NEXT: [[TMP46:%.*]] = icmp ne i32 [[TMP45]], 0
+// CHECK-64-NEXT: br i1 [[TMP46]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__5_EXIT:%.*]]
+// CHECK-64: omp_offload.failed.i:
+// CHECK-64-NEXT: [[TMP47:%.*]] = load i64*, i64** [[TMP16]], align 8
+// CHECK-64-NEXT: [[TMP48:%.*]] = load i32, i32* @global, align 4, !noalias !26
+// CHECK-64-NEXT: [[CONV7_I:%.*]] = bitcast i64* [[GLOBAL_CASTED_I]] to i32*
+// CHECK-64-NEXT: store i32 [[TMP48]], i32* [[CONV7_I]], align 4, !noalias !26
+// CHECK-64-NEXT: [[TMP49:%.*]] = load i64, i64* [[GLOBAL_CASTED_I]], align 8, !noalias !26
+// CHECK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l76(i64* [[TMP47]], i64 [[TMP49]]) #[[ATTR3]]
+// CHECK-64-NEXT: br label [[DOTOMP_OUTLINED__5_EXIT]]
+// CHECK-64: .omp_outlined..5.exit:
+// CHECK-64-NEXT: ret i32 0
+// CHECK-64-LABEL: define {{[^@]+}}@.omp_task_privates_map..8
+// CHECK-64-SAME: (%struct..kmp_privates.t.3* noalias noundef [[TMP0:%.*]], i64*** noalias noundef [[TMP1:%.*]], i32** noalias noundef [[TMP2:%.*]]) #[[ATTR7]] {
+// CHECK-64-NEXT: entry:
+// CHECK-64-NEXT: [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t.3*, align 8
+// CHECK-64-NEXT: [[DOTADDR1:%.*]] = alloca i64***, align 8
+// CHECK-64-NEXT: [[DOTADDR2:%.*]] = alloca i32**, align 8
+// CHECK-64-NEXT: store %struct..kmp_privates.t.3* [[TMP0]], %struct..kmp_privates.t.3** [[DOTADDR]], align 8
+// CHECK-64-NEXT: store i64*** [[TMP1]], i64**** [[DOTADDR1]], align 8
+// CHECK-64-NEXT: store i32** [[TMP2]], i32*** [[DOTADDR2]], align 8
+// CHECK-64-NEXT: [[TMP3:%.*]] = load %struct..kmp_privates.t.3*, %struct..kmp_privates.t.3** [[DOTADDR]], align 8
+// CHECK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3:%.*]], %struct..kmp_privates.t.3* [[TMP3]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP5:%.*]] = load i64***, i64**** [[DOTADDR1]], align 8
+// CHECK-64-NEXT: store i64** [[TMP4]], i64*** [[TMP5]], align 8
+// CHECK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3]], %struct..kmp_privates.t.3* [[TMP3]], i32 0, i32 1
+// CHECK-64-NEXT: [[TMP7:%.*]] = load i32**, i32*** [[DOTADDR2]], align 8
+// CHECK-64-NEXT: store i32* [[TMP6]], i32** [[TMP7]], align 8
+// CHECK-64-NEXT: ret void
+// CHECK-64-LABEL: define {{[^@]+}}@.omp_task_entry..9
+// CHECK-64-SAME: (i32 noundef signext [[TMP0:%.*]], %struct.kmp_task_t_with_privates.2* noalias noundef [[TMP1:%.*]]) #[[ATTR4]] {
+// CHECK-64-NEXT: entry:
+// CHECK-64-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK-64-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca void (i8*, ...)*, align 8
+// CHECK-64-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK-64-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca %struct.anon.0*, align 8
+// CHECK-64-NEXT: [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca i64**, align 8
+// CHECK-64-NEXT: [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT: [[GLOBAL_CASTED_I:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates.2*, align 8
+// CHECK-64-NEXT: store i32 [[TMP0]], i32* [[DOTADDR]], align 4
+// CHECK-64-NEXT: store %struct.kmp_task_t_with_privates.2* [[TMP1]], %struct.kmp_task_t_with_privates.2** [[DOTADDR1]], align 8
+// CHECK-64-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTADDR]], align 4
+// CHECK-64-NEXT: [[TMP3:%.*]] = load %struct.kmp_task_t_with_privates.2*, %struct.kmp_task_t_with_privates.2** [[DOTADDR1]], align 8
+// CHECK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_2:%.*]], %struct.kmp_task_t_with_privates.2* [[TMP3]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 2
+// CHECK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP7:%.*]] = load i8*, i8** [[TMP6]], align 8
+// CHECK-64-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct.anon.0*
+// CHECK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_2]], %struct.kmp_task_t_with_privates.2* [[TMP3]], i32 0, i32 1
+// CHECK-64-NEXT: [[TMP10:%.*]] = bitcast %struct..kmp_privates.t.3* [[TMP9]] to i8*
+// CHECK-64-NEXT: [[TMP11:%.*]] = bitcast %struct.kmp_task_t_with_privates.2* [[TMP3]] to i8*
+// CHECK-64-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META27:![0-9]+]])
+// CHECK-64-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META30:![0-9]+]])
+// CHECK-64-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META32:![0-9]+]])
+// CHECK-64-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META34:![0-9]+]])
+// CHECK-64-NEXT: store i32 [[TMP2]], i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !36
+// CHECK-64-NEXT: store i32* [[TMP5]], i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !36
+// CHECK-64-NEXT: store i8* [[TMP10]], i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !36
+// CHECK-64-NEXT: store void (i8*, ...)* bitcast (void (%struct..kmp_privates.t.3*, i64***, i32**)* @.omp_task_privates_map..8 to void (i8*, ...)*), void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !36
+// CHECK-64-NEXT: store i8* [[TMP11]], i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !36
+// CHECK-64-NEXT: store %struct.anon.0* [[TMP8]], %struct.anon.0** [[__CONTEXT_ADDR_I]], align 8, !noalias !36
+// CHECK-64-NEXT: [[TMP12:%.*]] = load %struct.anon.0*, %struct.anon.0** [[__CONTEXT_ADDR_I]], align 8, !noalias !36
+// CHECK-64-NEXT: [[TMP13:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !36
+// CHECK-64-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !36
+// CHECK-64-NEXT: [[TMP15:%.*]] = bitcast void (i8*, ...)* [[TMP13]] to void (i8*, i64***, i32**)*
+// CHECK-64-NEXT: call void [[TMP15]](i8* [[TMP14]], i64*** [[DOTFIRSTPRIV_PTR_ADDR_I]], i32** [[DOTFIRSTPRIV_PTR_ADDR1_I]]) #[[ATTR3]]
+// CHECK-64-NEXT: [[TMP16:%.*]] = load i64**, i64*** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !36
+// CHECK-64-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias !36
+// CHECK-64-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANON_0:%.*]], %struct.anon.0* [[TMP12]], i32 0, i32 2
+// CHECK-64-NEXT: [[TMP19:%.*]] = load i64*, i64** [[TMP16]], align 8
+// CHECK-64-NEXT: [[TMP20:%.*]] = load i32, i32* @global, align 4, !noalias !36
+// CHECK-64-NEXT: [[CONV_I:%.*]] = bitcast i64* [[GLOBAL_CASTED_I]] to i32*
+// CHECK-64-NEXT: store i32 [[TMP20]], i32* [[CONV_I]], align 4, !noalias !36
+// CHECK-64-NEXT: [[TMP21:%.*]] = load i64, i64* [[GLOBAL_CASTED_I]], align 8, !noalias !36
+// CHECK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l76(i64* [[TMP19]], i64 [[TMP21]]) #[[ATTR3]]
+// CHECK-64-NEXT: ret i32 0
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l83
+// CHECK-64-SAME: (i64 noundef [[GLOBAL:%.*]]) #[[ATTR2]] {
+// CHECK-64-NEXT: entry:
+// CHECK-64-NEXT: [[GLOBAL_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[GLOBAL_CASTED:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: store i64 [[GLOBAL]], i64* [[GLOBAL_ADDR]], align 8
+// CHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[GLOBAL_ADDR]] to i32*
+// CHECK-64-NEXT: [[TMP0:%.*]] = load i32, i32* [[CONV]], align 4
+// CHECK-64-NEXT: [[CONV1:%.*]] = bitcast i64* [[GLOBAL_CASTED]] to i32*
+// CHECK-64-NEXT: store i32 [[TMP0]], i32* [[CONV1]], align 4
+// CHECK-64-NEXT: [[TMP1:%.*]] = load i64, i64* [[GLOBAL_CASTED]], align 8
+// CHECK-64-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @[[GLOB3]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64)* @.omp_outlined..10 to void (i32*, i32*, ...)*), i64 [[TMP1]])
+// CHECK-64-NEXT: ret void
+// CHECK-64-LABEL: define {{[^@]+}}@.omp_outlined..10
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[GLOBAL:%.*]]) #[[ATTR2]] {
+// CHECK-64-NEXT: entry:
+// CHECK-64-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT: [[GLOBAL_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[GLOBAL_CASTED:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[DOTBOUND_ZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT: store i64 [[GLOBAL]], i64* [[GLOBAL_ADDR]], align 8
+// CHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[GLOBAL_ADDR]] to i32*
+// CHECK-64-NEXT: [[TMP0:%.*]] = load i32, i32* [[CONV]], align 4
+// CHECK-64-NEXT: store i32 [[TMP0]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-64-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-64-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
+// CHECK-64-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-64-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-64-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-64-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK-64-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-64-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
+// CHECK-64-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK-64: omp.precond.then:
+// CHECK-64-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-64-NEXT: store i32 [[TMP3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-64-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-64-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
+// CHECK-64-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64: cond.true:
+// CHECK-64-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-64-NEXT: br label [[COND_END:%.*]]
+// CHECK-64: cond.false:
+// CHECK-64-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT: br label [[COND_END]]
+// CHECK-64: cond.end:
+// CHECK-64-NEXT: [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ]
+// CHECK-64-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64: omp.inner.for.cond:
+// CHECK-64-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]]
+// CHECK-64-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64: omp.inner.for.body:
+// CHECK-64-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i64
+// CHECK-64-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64
+// CHECK-64-NEXT: [[TMP17:%.*]] = load i32, i32* [[CONV]], align 4
+// CHECK-64-NEXT: [[CONV6:%.*]] = bitcast i64* [[GLOBAL_CASTED]] to i32*
+// CHECK-64-NEXT: store i32 [[TMP17]], i32* [[CONV6]], align 4
+// CHECK-64-NEXT: [[TMP18:%.*]] = load i64, i64* [[GLOBAL_CASTED]], align 8
+// CHECK-64-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 4
+// CHECK-64-NEXT: call void @__kmpc_serialized_parallel(%struct.ident_t* @[[GLOB3]], i32 [[TMP20]])
+// CHECK-64-NEXT: [[TMP21:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT: store i32 0, i32* [[DOTBOUND_ZERO_ADDR]], align 4
+// CHECK-64-NEXT: call void @.omp_outlined..11(i32* [[TMP21]], i32* [[DOTBOUND_ZERO_ADDR]], i64 [[TMP14]], i64 [[TMP16]], i64 [[TMP18]]) #[[ATTR3]]
+// CHECK-64-NEXT: call void @__kmpc_end_serialized_parallel(%struct.ident_t* @[[GLOB3]], i32 [[TMP20]])
+// CHECK-64-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64: omp.inner.for.inc:
+// CHECK-64-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP22]], [[TMP23]]
+// CHECK-64-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-64: omp.inner.for.end:
+// CHECK-64-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64: omp.loop.exit:
+// CHECK-64-NEXT: [[TMP24:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4
+// CHECK-64-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP25]])
+// CHECK-64-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK-64: omp.precond.end:
+// CHECK-64-NEXT: ret void
+// CHECK-64-LABEL: define {{[^@]+}}@.omp_outlined..11
+// CHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[GLOBAL:%.*]]) #[[ATTR2]] {
+// CHECK-64-NEXT: entry:
+// CHECK-64-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[GLOBAL_ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[I5:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT: store i64 [[GLOBAL]], i64* [[GLOBAL_ADDR]], align 8
+// CHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[GLOBAL_ADDR]] to i32*
+// CHECK-64-NEXT: [[TMP0:%.*]] = load i32, i32* [[CONV]], align 4
+// CHECK-64-NEXT: store i32 [[TMP0]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-64-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-64-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
+// CHECK-64-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-64-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-64-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-64-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK-64-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-64-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
+// CHECK-64-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK-64: omp.precond.then:
+// CHECK-64-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-64-NEXT: store i32 [[TMP3]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT: [[TMP4:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP4]] to i32
+// CHECK-64-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK-64-NEXT: store i32 [[CONV3]], i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+// CHECK-64-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP7]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-64-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+// CHECK-64-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64: cond.true:
+// CHECK-64-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-64-NEXT: br label [[COND_END:%.*]]
+// CHECK-64: cond.false:
+// CHECK-64-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT: br label [[COND_END]]
+// CHECK-64: cond.end:
+// CHECK-64-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
+// CHECK-64-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64: omp.inner.for.cond:
+// CHECK-64-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT: [[CMP7:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
+// CHECK-64-NEXT: br i1 [[CMP7]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64: omp.inner.for.body:
+// CHECK-64-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
+// CHECK-64-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT: store i32 [[ADD]], i32* [[I5]], align 4
+// CHECK-64-NEXT: [[TMP16:%.*]] = load i32, i32* [[CONV]], align 4
+// CHECK-64-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP16]], 1
+// CHECK-64-NEXT: store i32 [[ADD8]], i32* [[CONV]], align 4
+// CHECK-64-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64: omp.body.continue:
+// CHECK-64-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64: omp.inner.for.inc:
+// CHECK-64-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP17]], 1
+// CHECK-64-NEXT: store i32 [[ADD9]], i32* [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-64: omp.inner.for.end:
+// CHECK-64-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64: omp.loop.exit:
+// CHECK-64-NEXT: [[TMP18:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4
+// CHECK-64-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP19]])
+// CHECK-64-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK-64: omp.precond.end:
+// CHECK-64-NEXT: ret void
+// CHECK-64-LABEL: define {{[^@]+}}@.omp_task_privates_map..13
+// CHECK-64-SAME: (%struct..kmp_privates.t.6* noalias noundef [[TMP0:%.*]], i32** noalias noundef [[TMP1:%.*]]) #[[ATTR7]] {
+// CHECK-64-NEXT: entry:
+// CHECK-64-NEXT: [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t.6*, align 8
+// CHECK-64-NEXT: [[DOTADDR1:%.*]] = alloca i32**, align 8
+// CHECK-64-NEXT: store %struct..kmp_privates.t.6* [[TMP0]], %struct..kmp_privates.t.6** [[DOTADDR]], align 8
+// CHECK-64-NEXT: store i32** [[TMP1]], i32*** [[DOTADDR1]], align 8
+// CHECK-64-NEXT: [[TMP2:%.*]] = load %struct..kmp_privates.t.6*, %struct..kmp_privates.t.6** [[DOTADDR]], align 8
+// CHECK-64-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_6:%.*]], %struct..kmp_privates.t.6* [[TMP2]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP4:%.*]] = load i32**, i32*** [[DOTADDR1]], align 8
+// CHECK-64-NEXT: store i32* [[TMP3]], i32** [[TMP4]], align 8
+// CHECK-64-NEXT: ret void
+// CHECK-64-LABEL: define {{[^@]+}}@.omp_task_entry..14
+// CHECK-64-SAME: (i32 noundef signext [[TMP0:%.*]], %struct.kmp_task_t_with_privates.5* noalias noundef [[TMP1:%.*]]) #[[ATTR4]] {
+// CHECK-64-NEXT: entry:
+// CHECK-64-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK-64-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca void (i8*, ...)*, align 8
+// CHECK-64-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK-64-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca %struct.anon.4*, align 8
+// CHECK-64-NEXT: [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca i32*, align 8
+// CHECK-64-NEXT: [[GLOBAL_CASTED_I:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates.5*, align 8
+// CHECK-64-NEXT: store i32 [[TMP0]], i32* [[DOTADDR]], align 4
+// CHECK-64-NEXT: store %struct.kmp_task_t_with_privates.5* [[TMP1]], %struct.kmp_task_t_with_privates.5** [[DOTADDR1]], align 8
+// CHECK-64-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTADDR]], align 4
+// CHECK-64-NEXT: [[TMP3:%.*]] = load %struct.kmp_task_t_with_privates.5*, %struct.kmp_task_t_with_privates.5** [[DOTADDR1]], align 8
+// CHECK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_5:%.*]], %struct.kmp_task_t_with_privates.5* [[TMP3]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 2
+// CHECK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP7:%.*]] = load i8*, i8** [[TMP6]], align 8
+// CHECK-64-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct.anon.4*
+// CHECK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_5]], %struct.kmp_task_t_with_privates.5* [[TMP3]], i32 0, i32 1
+// CHECK-64-NEXT: [[TMP10:%.*]] = bitcast %struct..kmp_privates.t.6* [[TMP9]] to i8*
+// CHECK-64-NEXT: [[TMP11:%.*]] = bitcast %struct.kmp_task_t_with_privates.5* [[TMP3]] to i8*
+// CHECK-64-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META37:![0-9]+]])
+// CHECK-64-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META40:![0-9]+]])
+// CHECK-64-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META42:![0-9]+]])
+// CHECK-64-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META44:![0-9]+]])
+// CHECK-64-NEXT: store i32 [[TMP2]], i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !46
+// CHECK-64-NEXT: store i32* [[TMP5]], i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !46
+// CHECK-64-NEXT: store i8* [[TMP10]], i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !46
+// CHECK-64-NEXT: store void (i8*, ...)* bitcast (void (%struct..kmp_privates.t.6*, i32**)* @.omp_task_privates_map..13 to void (i8*, ...)*), void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !46
+// CHECK-64-NEXT: store i8* [[TMP11]], i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !46
+// CHECK-64-NEXT: store %struct.anon.4* [[TMP8]], %struct.anon.4** [[__CONTEXT_ADDR_I]], align 8, !noalias !46
+// CHECK-64-NEXT: [[TMP12:%.*]] = load %struct.anon.4*, %struct.anon.4** [[__CONTEXT_ADDR_I]], align 8, !noalias !46
+// CHECK-64-NEXT: [[TMP13:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !46
+// CHECK-64-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 8, !noalias !46
+// CHECK-64-NEXT: [[TMP15:%.*]] = bitcast void (i8*, ...)* [[TMP13]] to void (i8*, i32**)*
+// CHECK-64-NEXT: call void [[TMP15]](i8* [[TMP14]], i32** [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR3]]
+// CHECK-64-NEXT: [[TMP16:%.*]] = load i32*, i32** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias !46
+// CHECK-64-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4
+// CHECK-64-NEXT: [[CONV_I:%.*]] = bitcast i64* [[GLOBAL_CASTED_I]] to i32*
+// CHECK-64-NEXT: store i32 [[TMP17]], i32* [[CONV_I]], align 4, !noalias !46
+// CHECK-64-NEXT: [[TMP18:%.*]] = load i64, i64* [[GLOBAL_CASTED_I]], align 8, !noalias !46
+// CHECK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l83(i64 [[TMP18]]) #[[ATTR3]]
+// CHECK-64-NEXT: ret i32 0
+// CHECK-64-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK-64-SAME: () #[[ATTR7]] {
+// CHECK-64-NEXT: entry:
+// CHECK-64-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK-64-NEXT: ret void
+// CHECK-32-LABEL: define {{[^@]+}}@_Z3fooi
+// CHECK-32-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-32-NEXT: entry:
+// CHECK-32-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[AA:%.*]] = alloca i16, align 2
+// CHECK-32-NEXT: [[B:%.*]] = alloca [10 x float], align 4
+// CHECK-32-NEXT: [[SAVED_STACK:%.*]] = alloca i8*, align 4
+// CHECK-32-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[C:%.*]] = alloca [5 x [10 x double]], align 8
+// CHECK-32-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[D:%.*]] = alloca [[STRUCT_TT:%.*]], align 4
+// CHECK-32-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 4
+// CHECK-32-NEXT: [[DOTDEP_ARR_ADDR:%.*]] = alloca [4 x %struct.kmp_depend_info], align 4
+// CHECK-32-NEXT: [[DEP_COUNTER_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[GLOBAL_CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [2 x i8*], align 4
+// CHECK-32-NEXT: [[AGG_CAPTURED4:%.*]] = alloca [[STRUCT_ANON_0:%.*]], align 4
+// CHECK-32-NEXT: [[DOTDEP_ARR_ADDR5:%.*]] = alloca [3 x %struct.kmp_depend_info], align 4
+// CHECK-32-NEXT: [[DEP_COUNTER_ADDR6:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[AGG_CAPTURED7:%.*]] = alloca [[STRUCT_ANON_0]], align 4
+// CHECK-32-NEXT: [[DOTDEP_ARR_ADDR8:%.*]] = alloca [3 x %struct.kmp_depend_info], align 4
+// CHECK-32-NEXT: [[DEP_COUNTER_ADDR9:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[GLOBAL_CASTED10:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[AGG_CAPTURED11:%.*]] = alloca [[STRUCT_ANON_4:%.*]], align 4
+// CHECK-32-NEXT: [[DOTDEP_ARR_ADDR12:%.*]] = alloca [1 x %struct.kmp_depend_info], align 4
+// CHECK-32-NEXT: [[DEP_COUNTER_ADDR13:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
+// CHECK-32-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
+// CHECK-32-NEXT: store i32 0, i32* [[A]], align 4
+// CHECK-32-NEXT: store i16 0, i16* [[AA]], align 2
+// CHECK-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK-32-NEXT: [[TMP2:%.*]] = call i8* @llvm.stacksave()
+// CHECK-32-NEXT: store i8* [[TMP2]], i8** [[SAVED_STACK]], align 4
+// CHECK-32-NEXT: [[VLA:%.*]] = alloca float, i32 [[TMP1]], align 4
+// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[__VLA_EXPR0]], align 4
+// CHECK-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[N_ADDR]], align 4
+// CHECK-32-NEXT: [[TMP4:%.*]] = mul nuw i32 5, [[TMP3]]
+// CHECK-32-NEXT: [[VLA1:%.*]] = alloca double, i32 [[TMP4]], align 8
+// CHECK-32-NEXT: store i32 [[TMP3]], i32* [[__VLA_EXPR1]], align 4
+// CHECK-32-NEXT: [[TMP5:%.*]] = load i32, i32* @global, align 4
+// CHECK-32-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4
+// CHECK-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP5]], [[TMP6]]
+// CHECK-32-NEXT: store i32 [[ADD]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[AGG_CAPTURED]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT: store i32 [[TMP8]], i32* [[TMP7]], align 4
+// CHECK-32-NEXT: [[TMP9:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 20, i32 4, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates*)* @.omp_task_entry. to i32 (i32, i8*)*))
+// CHECK-32-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to %struct.kmp_task_t_with_privates*
+// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP10]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP11]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
+// CHECK-32-NEXT: [[TMP14:%.*]] = bitcast %struct.anon* [[AGG_CAPTURED]] to i8*
+// CHECK-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP13]], i8* align 4 [[TMP14]], i32 4, i1 false)
+// CHECK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* [[DOTDEP_ARR_ADDR]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP16:%.*]] = getelementptr [[STRUCT_KMP_DEPEND_INFO:%.*]], %struct.kmp_depend_info* [[TMP15]], i32 0
+// CHECK-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP16]], i32 0, i32 0
+// CHECK-32-NEXT: store i32 ptrtoint (i32* @global to i32), i32* [[TMP17]], align 4
+// CHECK-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP16]], i32 0, i32 1
+// CHECK-32-NEXT: store i32 4, i32* [[TMP18]], align 4
+// CHECK-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP16]], i32 0, i32 2
+// CHECK-32-NEXT: store i8 1, i8* [[TMP19]], align 4
+// CHECK-32-NEXT: [[TMP20:%.*]] = ptrtoint i32* [[A]] to i32
+// CHECK-32-NEXT: [[TMP21:%.*]] = getelementptr [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP15]], i32 1
+// CHECK-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP21]], i32 0, i32 0
+// CHECK-32-NEXT: store i32 [[TMP20]], i32* [[TMP22]], align 4
+// CHECK-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP21]], i32 0, i32 1
+// CHECK-32-NEXT: store i32 4, i32* [[TMP23]], align 4
+// CHECK-32-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP21]], i32 0, i32 2
+// CHECK-32-NEXT: store i8 3, i8* [[TMP24]], align 4
+// CHECK-32-NEXT: [[TMP25:%.*]] = ptrtoint [10 x float]* [[B]] to i32
+// CHECK-32-NEXT: [[TMP26:%.*]] = getelementptr [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP15]], i32 2
+// CHECK-32-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP26]], i32 0, i32 0
+// CHECK-32-NEXT: store i32 [[TMP25]], i32* [[TMP27]], align 4
+// CHECK-32-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP26]], i32 0, i32 1
+// CHECK-32-NEXT: store i32 40, i32* [[TMP28]], align 4
+// CHECK-32-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP26]], i32 0, i32 2
+// CHECK-32-NEXT: store i8 3, i8* [[TMP29]], align 4
+// CHECK-32-NEXT: [[TMP30:%.*]] = mul nsw i32 4, [[TMP3]]
+// CHECK-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VLA1]], i32 [[TMP30]]
+// CHECK-32-NEXT: [[TMP31:%.*]] = mul nuw i32 [[TMP3]], 8
+// CHECK-32-NEXT: [[TMP32:%.*]] = ptrtoint double* [[ARRAYIDX]] to i32
+// CHECK-32-NEXT: [[TMP33:%.*]] = getelementptr [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP15]], i32 3
+// CHECK-32-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP33]], i32 0, i32 0
+// CHECK-32-NEXT: store i32 [[TMP32]], i32* [[TMP34]], align 4
+// CHECK-32-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP33]], i32 0, i32 1
+// CHECK-32-NEXT: store i32 [[TMP31]], i32* [[TMP35]], align 4
+// CHECK-32-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP33]], i32 0, i32 2
+// CHECK-32-NEXT: store i8 3, i8* [[TMP36]], align 4
+// CHECK-32-NEXT: store i32 4, i32* [[DEP_COUNTER_ADDR]], align 4
+// CHECK-32-NEXT: [[TMP37:%.*]] = bitcast %struct.kmp_depend_info* [[TMP15]] to i8*
+// CHECK-32-NEXT: call void @__kmpc_omp_taskwait_deps_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 4, i8* [[TMP37]], i32 0, i8* null, i32 0)
+// CHECK-32-NEXT: call void @__kmpc_omp_task_begin_if0(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i8* [[TMP9]])
+// CHECK-32-NEXT: [[TMP38:%.*]] = call i32 @.omp_task_entry.(i32 [[TMP0]], %struct.kmp_task_t_with_privates* [[TMP10]]) #[[ATTR3:[0-9]+]]
+// CHECK-32-NEXT: call void @__kmpc_omp_task_complete_if0(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i8* [[TMP9]])
+// CHECK-32-NEXT: [[TMP39:%.*]] = load i32, i32* @global, align 4
+// CHECK-32-NEXT: [[TMP40:%.*]] = load i32, i32* [[A]], align 4
+// CHECK-32-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP39]], [[TMP40]]
+// CHECK-32-NEXT: store i32 [[ADD3]], i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-32-NEXT: [[TMP41:%.*]] = load i32*, i32** @_ZZ3fooiE6plocal, align 4
+// CHECK-32-NEXT: [[TMP42:%.*]] = load i32, i32* @global, align 4
+// CHECK-32-NEXT: store i32 [[TMP42]], i32* [[GLOBAL_CASTED]], align 4
+// CHECK-32-NEXT: [[TMP43:%.*]] = load i32, i32* [[GLOBAL_CASTED]], align 4
+// CHECK-32-NEXT: [[TMP44:%.*]] = load i32, i32* [[A]], align 4
+// CHECK-32-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP44]], 0
+// CHECK-32-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]]
+// CHECK-32: omp_if.then:
+// CHECK-32-NEXT: [[TMP45:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP46:%.*]] = bitcast i8** [[TMP45]] to i32**
+// CHECK-32-NEXT: store i32* [[TMP41]], i32** [[TMP46]], align 4
+// CHECK-32-NEXT: [[TMP47:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP48:%.*]] = bitcast i8** [[TMP47]] to i32**
+// CHECK-32-NEXT: store i32* [[TMP41]], i32** [[TMP48]], align 4
+// CHECK-32-NEXT: [[TMP49:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK-32-NEXT: store i8* null, i8** [[TMP49]], align 4
+// CHECK-32-NEXT: [[TMP50:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK-32-NEXT: [[TMP51:%.*]] = bitcast i8** [[TMP50]] to i32*
+// CHECK-32-NEXT: store i32 [[TMP43]], i32* [[TMP51]], align 4
+// CHECK-32-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK-32-NEXT: [[TMP53:%.*]] = bitcast i8** [[TMP52]] to i32*
+// CHECK-32-NEXT: store i32 [[TMP43]], i32* [[TMP53]], align 4
+// CHECK-32-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
+// CHECK-32-NEXT: store i8* null, i8** [[TMP54]], align 4
+// CHECK-32-NEXT: [[TMP55:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], %struct.anon.0* [[AGG_CAPTURED4]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP58:%.*]] = load i32*, i32** @_ZZ3fooiE6plocal, align 4
+// CHECK-32-NEXT: store i32* [[TMP58]], i32** [[TMP57]], align 4
+// CHECK-32-NEXT: [[TMP59:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], %struct.anon.0* [[AGG_CAPTURED4]], i32 0, i32 1
+// CHECK-32-NEXT: [[TMP60:%.*]] = load i32, i32* @global, align 4
+// CHECK-32-NEXT: store i32 [[TMP60]], i32* [[TMP59]], align 4
+// CHECK-32-NEXT: [[TMP61:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], %struct.anon.0* [[AGG_CAPTURED4]], i32 0, i32 2
+// CHECK-32-NEXT: [[TMP62:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-32-NEXT: store i32 [[TMP62]], i32* [[TMP61]], align 4
+// CHECK-32-NEXT: [[TMP63:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 60, i32 12, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.1*)* @.omp_task_entry..6 to i32 (i32, i8*)*))
+// CHECK-32-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP63]] to %struct.kmp_task_t_with_privates.1*
+// CHECK-32-NEXT: [[TMP65:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_1:%.*]], %struct.kmp_task_t_with_privates.1* [[TMP64]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP66:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP65]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP67:%.*]] = load i8*, i8** [[TMP66]], align 4
+// CHECK-32-NEXT: [[TMP68:%.*]] = bitcast %struct.anon.0* [[AGG_CAPTURED4]] to i8*
+// CHECK-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP67]], i8* align 4 [[TMP68]], i32 12, i1 false)
+// CHECK-32-NEXT: [[TMP69:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_1]], %struct.kmp_task_t_with_privates.1* [[TMP64]], i32 0, i32 1
+// CHECK-32-NEXT: [[TMP70:%.*]] = bitcast i8* [[TMP67]] to %struct.anon.0*
+// CHECK-32-NEXT: [[TMP71:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP69]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP72:%.*]] = bitcast [2 x i64]* [[TMP71]] to i8*
+// CHECK-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP72]], i8* align 4 bitcast ([2 x i64]* @.offload_sizes to i8*), i32 16, i1 false)
+// CHECK-32-NEXT: [[TMP73:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP69]], i32 0, i32 1
+// CHECK-32-NEXT: [[TMP74:%.*]] = load i32*, i32** @_ZZ3fooiE6plocal, align 4
+// CHECK-32-NEXT: store i32* [[TMP74]], i32** [[TMP73]], align 4
+// CHECK-32-NEXT: [[TMP75:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP69]], i32 0, i32 2
+// CHECK-32-NEXT: [[TMP76:%.*]] = load i32, i32* @global, align 4
+// CHECK-32-NEXT: store i32 [[TMP76]], i32* [[TMP75]], align 4
+// CHECK-32-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP69]], i32 0, i32 3
+// CHECK-32-NEXT: [[TMP78:%.*]] = bitcast [2 x i8*]* [[TMP77]] to i8*
+// CHECK-32-NEXT: [[TMP79:%.*]] = bitcast i8** [[TMP55]] to i8*
+// CHECK-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP78]], i8* align 4 [[TMP79]], i32 8, i1 false)
+// CHECK-32-NEXT: [[TMP80:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP69]], i32 0, i32 4
+// CHECK-32-NEXT: [[TMP81:%.*]] = bitcast [2 x i8*]* [[TMP80]] to i8*
+// CHECK-32-NEXT: [[TMP82:%.*]] = bitcast i8** [[TMP56]] to i8*
+// CHECK-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP81]], i8* align 4 [[TMP82]], i32 8, i1 false)
+// CHECK-32-NEXT: [[TMP83:%.*]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* [[DOTDEP_ARR_ADDR5]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP84:%.*]] = getelementptr [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP83]], i32 0
+// CHECK-32-NEXT: [[TMP85:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP84]], i32 0, i32 0
+// CHECK-32-NEXT: store i32 ptrtoint (i32* @global to i32), i32* [[TMP85]], align 4
+// CHECK-32-NEXT: [[TMP86:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP84]], i32 0, i32 1
+// CHECK-32-NEXT: store i32 4, i32* [[TMP86]], align 4
+// CHECK-32-NEXT: [[TMP87:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP84]], i32 0, i32 2
+// CHECK-32-NEXT: store i8 3, i8* [[TMP87]], align 4
+// CHECK-32-NEXT: [[TMP88:%.*]] = ptrtoint i32* [[A]] to i32
+// CHECK-32-NEXT: [[TMP89:%.*]] = getelementptr [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP83]], i32 1
+// CHECK-32-NEXT: [[TMP90:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP89]], i32 0, i32 0
+// CHECK-32-NEXT: store i32 [[TMP88]], i32* [[TMP90]], align 4
+// CHECK-32-NEXT: [[TMP91:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP89]], i32 0, i32 1
+// CHECK-32-NEXT: store i32 4, i32* [[TMP91]], align 4
+// CHECK-32-NEXT: [[TMP92:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP89]], i32 0, i32 2
+// CHECK-32-NEXT: store i8 3, i8* [[TMP92]], align 4
+// CHECK-32-NEXT: [[TMP93:%.*]] = mul nuw i32 [[TMP1]], 4
+// CHECK-32-NEXT: [[TMP94:%.*]] = ptrtoint float* [[VLA]] to i32
+// CHECK-32-NEXT: [[TMP95:%.*]] = getelementptr [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP83]], i32 2
+// CHECK-32-NEXT: [[TMP96:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP95]], i32 0, i32 0
+// CHECK-32-NEXT: store i32 [[TMP94]], i32* [[TMP96]], align 4
+// CHECK-32-NEXT: [[TMP97:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP95]], i32 0, i32 1
+// CHECK-32-NEXT: store i32 [[TMP93]], i32* [[TMP97]], align 4
+// CHECK-32-NEXT: [[TMP98:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP95]], i32 0, i32 2
+// CHECK-32-NEXT: store i8 3, i8* [[TMP98]], align 4
+// CHECK-32-NEXT: store i32 3, i32* [[DEP_COUNTER_ADDR6]], align 4
+// CHECK-32-NEXT: [[TMP99:%.*]] = bitcast %struct.kmp_depend_info* [[TMP83]] to i8*
+// CHECK-32-NEXT: call void @__kmpc_omp_taskwait_deps_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 3, i8* [[TMP99]], i32 0, i8* null, i32 0)
+// CHECK-32-NEXT: call void @__kmpc_omp_task_begin_if0(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i8* [[TMP63]])
+// CHECK-32-NEXT: [[TMP100:%.*]] = call i32 @.omp_task_entry..6(i32 [[TMP0]], %struct.kmp_task_t_with_privates.1* [[TMP64]]) #[[ATTR3]]
+// CHECK-32-NEXT: call void @__kmpc_omp_task_complete_if0(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i8* [[TMP63]])
+// CHECK-32-NEXT: br label [[OMP_IF_END:%.*]]
+// CHECK-32: omp_if.else:
+// CHECK-32-NEXT: [[TMP101:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], %struct.anon.0* [[AGG_CAPTURED7]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP102:%.*]] = load i32*, i32** @_ZZ3fooiE6plocal, align 4
+// CHECK-32-NEXT: store i32* [[TMP102]], i32** [[TMP101]], align 4
+// CHECK-32-NEXT: [[TMP103:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], %struct.anon.0* [[AGG_CAPTURED7]], i32 0, i32 1
+// CHECK-32-NEXT: [[TMP104:%.*]] = load i32, i32* @global, align 4
+// CHECK-32-NEXT: store i32 [[TMP104]], i32* [[TMP103]], align 4
+// CHECK-32-NEXT: [[TMP105:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], %struct.anon.0* [[AGG_CAPTURED7]], i32 0, i32 2
+// CHECK-32-NEXT: [[TMP106:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-32-NEXT: store i32 [[TMP106]], i32* [[TMP105]], align 4
+// CHECK-32-NEXT: [[TMP107:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 28, i32 12, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.2*)* @.omp_task_entry..9 to i32 (i32, i8*)*))
+// CHECK-32-NEXT: [[TMP108:%.*]] = bitcast i8* [[TMP107]] to %struct.kmp_task_t_with_privates.2*
+// CHECK-32-NEXT: [[TMP109:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_2:%.*]], %struct.kmp_task_t_with_privates.2* [[TMP108]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP110:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP109]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP111:%.*]] = load i8*, i8** [[TMP110]], align 4
+// CHECK-32-NEXT: [[TMP112:%.*]] = bitcast %struct.anon.0* [[AGG_CAPTURED7]] to i8*
+// CHECK-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP111]], i8* align 4 [[TMP112]], i32 12, i1 false)
+// CHECK-32-NEXT: [[TMP113:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_2]], %struct.kmp_task_t_with_privates.2* [[TMP108]], i32 0, i32 1
+// CHECK-32-NEXT: [[TMP114:%.*]] = bitcast i8* [[TMP111]] to %struct.anon.0*
+// CHECK-32-NEXT: [[TMP115:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3:%.*]], %struct..kmp_privates.t.3* [[TMP113]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP116:%.*]] = load i32*, i32** @_ZZ3fooiE6plocal, align 4
+// CHECK-32-NEXT: store i32* [[TMP116]], i32** [[TMP115]], align 4
+// CHECK-32-NEXT: [[TMP117:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3]], %struct..kmp_privates.t.3* [[TMP113]], i32 0, i32 1
+// CHECK-32-NEXT: [[TMP118:%.*]] = load i32, i32* @global, align 4
+// CHECK-32-NEXT: store i32 [[TMP118]], i32* [[TMP117]], align 4
+// CHECK-32-NEXT: [[TMP119:%.*]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* [[DOTDEP_ARR_ADDR8]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP120:%.*]] = getelementptr [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP119]], i32 0
+// CHECK-32-NEXT: [[TMP121:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP120]], i32 0, i32 0
+// CHECK-32-NEXT: store i32 ptrtoint (i32* @global to i32), i32* [[TMP121]], align 4
+// CHECK-32-NEXT: [[TMP122:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP120]], i32 0, i32 1
+// CHECK-32-NEXT: store i32 4, i32* [[TMP122]], align 4
+// CHECK-32-NEXT: [[TMP123:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP120]], i32 0, i32 2
+// CHECK-32-NEXT: store i8 3, i8* [[TMP123]], align 4
+// CHECK-32-NEXT: [[TMP124:%.*]] = ptrtoint i32* [[A]] to i32
+// CHECK-32-NEXT: [[TMP125:%.*]] = getelementptr [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP119]], i32 1
+// CHECK-32-NEXT: [[TMP126:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP125]], i32 0, i32 0
+// CHECK-32-NEXT: store i32 [[TMP124]], i32* [[TMP126]], align 4
+// CHECK-32-NEXT: [[TMP127:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP125]], i32 0, i32 1
+// CHECK-32-NEXT: store i32 4, i32* [[TMP127]], align 4
+// CHECK-32-NEXT: [[TMP128:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP125]], i32 0, i32 2
+// CHECK-32-NEXT: store i8 3, i8* [[TMP128]], align 4
+// CHECK-32-NEXT: [[TMP129:%.*]] = mul nuw i32 [[TMP1]], 4
+// CHECK-32-NEXT: [[TMP130:%.*]] = ptrtoint float* [[VLA]] to i32
+// CHECK-32-NEXT: [[TMP131:%.*]] = getelementptr [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP119]], i32 2
+// CHECK-32-NEXT: [[TMP132:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP131]], i32 0, i32 0
+// CHECK-32-NEXT: store i32 [[TMP130]], i32* [[TMP132]], align 4
+// CHECK-32-NEXT: [[TMP133:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP131]], i32 0, i32 1
+// CHECK-32-NEXT: store i32 [[TMP129]], i32* [[TMP133]], align 4
+// CHECK-32-NEXT: [[TMP134:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP131]], i32 0, i32 2
+// CHECK-32-NEXT: store i8 3, i8* [[TMP134]], align 4
+// CHECK-32-NEXT: store i32 3, i32* [[DEP_COUNTER_ADDR9]], align 4
+// CHECK-32-NEXT: [[TMP135:%.*]] = bitcast %struct.kmp_depend_info* [[TMP119]] to i8*
+// CHECK-32-NEXT: call void @__kmpc_omp_taskwait_deps_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 3, i8* [[TMP135]], i32 0, i8* null, i32 0)
+// CHECK-32-NEXT: call void @__kmpc_omp_task_begin_if0(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i8* [[TMP107]])
+// CHECK-32-NEXT: [[TMP136:%.*]] = call i32 @.omp_task_entry..9(i32 [[TMP0]], %struct.kmp_task_t_with_privates.2* [[TMP108]]) #[[ATTR3]]
+// CHECK-32-NEXT: call void @__kmpc_omp_task_complete_if0(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i8* [[TMP107]])
+// CHECK-32-NEXT: br label [[OMP_IF_END]]
+// CHECK-32: omp_if.end:
+// CHECK-32-NEXT: [[TMP137:%.*]] = load i32, i32* @global, align 4
+// CHECK-32-NEXT: store i32 [[TMP137]], i32* [[GLOBAL_CASTED10]], align 4
+// CHECK-32-NEXT: [[TMP138:%.*]] = load i32, i32* [[GLOBAL_CASTED10]], align 4
+// CHECK-32-NEXT: [[TMP139:%.*]] = getelementptr inbounds [[STRUCT_ANON_4]], %struct.anon.4* [[AGG_CAPTURED11]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP140:%.*]] = load i32, i32* @global, align 4
+// CHECK-32-NEXT: store i32 [[TMP140]], i32* [[TMP139]], align 4
+// CHECK-32-NEXT: [[TMP141:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 24, i32 4, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.5*)* @.omp_task_entry..14 to i32 (i32, i8*)*))
+// CHECK-32-NEXT: [[TMP142:%.*]] = bitcast i8* [[TMP141]] to %struct.kmp_task_t_with_privates.5*
+// CHECK-32-NEXT: [[TMP143:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_5:%.*]], %struct.kmp_task_t_with_privates.5* [[TMP142]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP144:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP143]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP145:%.*]] = load i8*, i8** [[TMP144]], align 4
+// CHECK-32-NEXT: [[TMP146:%.*]] = bitcast %struct.anon.4* [[AGG_CAPTURED11]] to i8*
+// CHECK-32-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP145]], i8* align 4 [[TMP146]], i32 4, i1 false)
+// CHECK-32-NEXT: [[TMP147:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_5]], %struct.kmp_task_t_with_privates.5* [[TMP142]], i32 0, i32 1
+// CHECK-32-NEXT: [[TMP148:%.*]] = bitcast i8* [[TMP145]] to %struct.anon.4*
+// CHECK-32-NEXT: [[TMP149:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_6:%.*]], %struct..kmp_privates.t.6* [[TMP147]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP150:%.*]] = load i32, i32* @global, align 4
+// CHECK-32-NEXT: store i32 [[TMP150]], i32* [[TMP149]], align 4
+// CHECK-32-NEXT: [[TMP151:%.*]] = getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* [[DOTDEP_ARR_ADDR12]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP152:%.*]] = getelementptr [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP151]], i32 0
+// CHECK-32-NEXT: [[TMP153:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP152]], i32 0, i32 0
+// CHECK-32-NEXT: store i32 ptrtoint (i32* @global to i32), i32* [[TMP153]], align 4
+// CHECK-32-NEXT: [[TMP154:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP152]], i32 0, i32 1
+// CHECK-32-NEXT: store i32 4, i32* [[TMP154]], align 4
+// CHECK-32-NEXT: [[TMP155:%.*]] = getelementptr inbounds [[STRUCT_KMP_DEPEND_INFO]], %struct.kmp_depend_info* [[TMP152]], i32 0, i32 2
+// CHECK-32-NEXT: store i8 3, i8* [[TMP155]], align 4
+// CHECK-32-NEXT: store i32 1, i32* [[DEP_COUNTER_ADDR13]], align 4
+// CHECK-32-NEXT: [[TMP156:%.*]] = bitcast %struct.kmp_depend_info* [[TMP151]] to i8*
+// CHECK-32-NEXT: call void @__kmpc_omp_taskwait_deps_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i8* [[TMP156]], i32 0, i8* null, i32 0)
+// CHECK-32-NEXT: call void @__kmpc_omp_task_begin_if0(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i8* [[TMP141]])
+// CHECK-32-NEXT: [[TMP157:%.*]] = call i32 @.omp_task_entry..14(i32 [[TMP0]], %struct.kmp_task_t_with_privates.5* [[TMP142]]) #[[ATTR3]]
+// CHECK-32-NEXT: call void @__kmpc_omp_task_complete_if0(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i8* [[TMP141]])
+// CHECK-32-NEXT: [[TMP158:%.*]] = load i32, i32* [[A]], align 4
+// CHECK-32-NEXT: [[TMP159:%.*]] = load i8*, i8** [[SAVED_STACK]], align 4
+// CHECK-32-NEXT: call void @llvm.stackrestore(i8* [[TMP159]])
+// CHECK-32-NEXT: ret i32 [[TMP158]]
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l66
+// CHECK-32-SAME: () #[[ATTR2:[0-9]+]] {
+// CHECK-32-NEXT: entry:
+// CHECK-32-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @[[GLOB3]], i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*))
+// CHECK-32-NEXT: ret void
+// CHECK-32-LABEL: define {{[^@]+}}@.omp_outlined.
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR2]] {
+// CHECK-32-NEXT: entry:
+// CHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-32-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32: cond.true:
+// CHECK-32-NEXT: br label [[COND_END:%.*]]
+// CHECK-32: cond.false:
+// CHECK-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT: br label [[COND_END]]
+// CHECK-32: cond.end:
+// CHECK-32-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-32-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32: omp.inner.for.cond:
+// CHECK-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-32-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32: omp.inner.for.body:
+// CHECK-32-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB3]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32, i32)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i32 [[TMP7]], i32 [[TMP8]])
+// CHECK-32-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32: omp.inner.for.inc:
+// CHECK-32-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// CHECK-32-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-32: omp.inner.for.end:
+// CHECK-32-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32: omp.loop.exit:
+// CHECK-32-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]])
+// CHECK-32-NEXT: ret void
+// CHECK-32-LABEL: define {{[^@]+}}@.omp_outlined..1
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR2]] {
+// CHECK-32-NEXT: entry:
+// CHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT: store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// CHECK-32-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
+// CHECK-32-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32: cond.true:
+// CHECK-32-NEXT: br label [[COND_END:%.*]]
+// CHECK-32: cond.false:
+// CHECK-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT: br label [[COND_END]]
+// CHECK-32: cond.end:
+// CHECK-32-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK-32-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32: omp.inner.for.cond:
+// CHECK-32-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-32-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32: omp.inner.for.body:
+// CHECK-32-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-32-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// CHECK-32-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32: omp.body.continue:
+// CHECK-32-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32: omp.inner.for.inc:
+// CHECK-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-32-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-32: omp.inner.for.end:
+// CHECK-32-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32: omp.loop.exit:
+// CHECK-32-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// CHECK-32-NEXT: ret void
+// CHECK-32-LABEL: define {{[^@]+}}@.omp_task_entry.
+// CHECK-32-SAME: (i32 noundef [[TMP0:%.*]], %struct.kmp_task_t_with_privates* noalias noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK-32-NEXT: entry:
+// CHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca i8*, align 4
+// CHECK-32-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca void (i8*, ...)*, align 4
+// CHECK-32-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca i8*, align 4
+// CHECK-32-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca %struct.anon*, align 4
+// CHECK-32-NEXT: [[KERNEL_ARGS_I:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates*, align 4
+// CHECK-32-NEXT: store i32 [[TMP0]], i32* [[DOTADDR]], align 4
+// CHECK-32-NEXT: store %struct.kmp_task_t_with_privates* [[TMP1]], %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 4
+// CHECK-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTADDR]], align 4
+// CHECK-32-NEXT: [[TMP3:%.*]] = load %struct.kmp_task_t_with_privates*, %struct.kmp_task_t_with_privates** [[DOTADDR1]], align 4
+// CHECK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], %struct.kmp_task_t_with_privates* [[TMP3]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 2
+// CHECK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP7:%.*]] = load i8*, i8** [[TMP6]], align 4
+// CHECK-32-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct.anon*
+// CHECK-32-NEXT: [[TMP9:%.*]] = bitcast %struct.kmp_task_t_with_privates* [[TMP3]] to i8*
+// CHECK-32-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]])
+// CHECK-32-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META11:![0-9]+]])
+// CHECK-32-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META13:![0-9]+]])
+// CHECK-32-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META15:![0-9]+]])
+// CHECK-32-NEXT: store i32 [[TMP2]], i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !17
+// CHECK-32-NEXT: store i32* [[TMP5]], i32** [[DOTPART_ID__ADDR_I]], align 4, !noalias !17
+// CHECK-32-NEXT: store i8* null, i8** [[DOTPRIVATES__ADDR_I]], align 4, !noalias !17
+// CHECK-32-NEXT: store void (i8*, ...)* null, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !17
+// CHECK-32-NEXT: store i8* [[TMP9]], i8** [[DOTTASK_T__ADDR_I]], align 4, !noalias !17
+// CHECK-32-NEXT: store %struct.anon* [[TMP8]], %struct.anon** [[__CONTEXT_ADDR_I]], align 4, !noalias !17
+// CHECK-32-NEXT: [[TMP10:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR_I]], align 4, !noalias !17
+// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP10]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4
+// CHECK-32-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
+// CHECK-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 0
+// CHECK-32-NEXT: store i32 2, i32* [[TMP14]], align 4, !noalias !17
+// CHECK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 1
+// CHECK-32-NEXT: store i32 0, i32* [[TMP15]], align 4, !noalias !17
+// CHECK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 2
+// CHECK-32-NEXT: store i8** null, i8*** [[TMP16]], align 4, !noalias !17
+// CHECK-32-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 3
+// CHECK-32-NEXT: store i8** null, i8*** [[TMP17]], align 4, !noalias !17
+// CHECK-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 4
+// CHECK-32-NEXT: store i64* null, i64** [[TMP18]], align 4, !noalias !17
+// CHECK-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 5
+// CHECK-32-NEXT: store i64* null, i64** [[TMP19]], align 4, !noalias !17
+// CHECK-32-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 6
+// CHECK-32-NEXT: store i8** null, i8*** [[TMP20]], align 4, !noalias !17
+// CHECK-32-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 7
+// CHECK-32-NEXT: store i8** null, i8*** [[TMP21]], align 4, !noalias !17
+// CHECK-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
+// CHECK-32-NEXT: store i64 10, i64* [[TMP22]], align 8, !noalias !17
+// CHECK-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 9
+// CHECK-32-NEXT: store i64 0, i64* [[TMP23]], align 8, !noalias !17
+// CHECK-32-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 10
+// CHECK-32-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP24]], align 4, !noalias !17
+// CHECK-32-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 11
+// CHECK-32-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP25]], align 4, !noalias !17
+// CHECK-32-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 12
+// CHECK-32-NEXT: store i32 0, i32* [[TMP26]], align 4, !noalias !17
+// CHECK-32-NEXT: [[TMP27:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB3]], i64 [[TMP13]], i32 0, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l66.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]])
+// CHECK-32-NEXT: [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0
+// CHECK-32-NEXT: br i1 [[TMP28]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__2_EXIT:%.*]]
+// CHECK-32: omp_offload.failed.i:
+// CHECK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l66() #[[ATTR3]]
+// CHECK-32-NEXT: br label [[DOTOMP_OUTLINED__2_EXIT]]
+// CHECK-32: .omp_outlined..2.exit:
+// CHECK-32-NEXT: ret i32 0
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l76
+// CHECK-32-SAME: (i32* noundef [[PLOCAL:%.*]], i32 noundef [[GLOBAL:%.*]]) #[[ATTR2]] {
+// CHECK-32-NEXT: entry:
+// CHECK-32-NEXT: [[PLOCAL_ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT: [[GLOBAL_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[GLOBAL_CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: store i32* [[PLOCAL]], i32** [[PLOCAL_ADDR]], align 4
+// CHECK-32-NEXT: store i32 [[GLOBAL]], i32* [[GLOBAL_ADDR]], align 4
+// CHECK-32-NEXT: [[TMP0:%.*]] = load i32*, i32** [[PLOCAL_ADDR]], align 4
+// CHECK-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[GLOBAL_ADDR]], align 4
+// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[GLOBAL_CASTED]], align 4
+// CHECK-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[GLOBAL_CASTED]], align 4
+// CHECK-32-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @[[GLOB3]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32* [[TMP0]], i32 [[TMP2]])
+// CHECK-32-NEXT: ret void
+// CHECK-32-LABEL: define {{[^@]+}}@.omp_outlined..3
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32* noundef [[PLOCAL:%.*]], i32 noundef [[GLOBAL:%.*]]) #[[ATTR2]] {
+// CHECK-32-NEXT: entry:
+// CHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT: [[PLOCAL_ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT: [[GLOBAL_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[GLOBAL_CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT: store i32* [[PLOCAL]], i32** [[PLOCAL_ADDR]], align 4
+// CHECK-32-NEXT: store i32 [[GLOBAL]], i32* [[GLOBAL_ADDR]], align 4
+// CHECK-32-NEXT: [[TMP0:%.*]] = load i32*, i32** [[PLOCAL_ADDR]], align 4
+// CHECK-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK-32-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-32-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-32-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK-32-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK-32: omp.precond.then:
+// CHECK-32-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// CHECK-32-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// CHECK-32-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32: cond.true:
+// CHECK-32-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-NEXT: br label [[COND_END:%.*]]
+// CHECK-32: cond.false:
+// CHECK-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT: br label [[COND_END]]
+// CHECK-32: cond.end:
+// CHECK-32-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// CHECK-32-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32: omp.inner.for.cond:
+// CHECK-32-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// CHECK-32-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32: omp.inner.for.body:
+// CHECK-32-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT: [[TMP16:%.*]] = load i32*, i32** [[PLOCAL_ADDR]], align 4
+// CHECK-32-NEXT: [[TMP17:%.*]] = load i32, i32* [[GLOBAL_ADDR]], align 4
+// CHECK-32-NEXT: store i32 [[TMP17]], i32* [[GLOBAL_CASTED]], align 4
+// CHECK-32-NEXT: [[TMP18:%.*]] = load i32, i32* [[GLOBAL_CASTED]], align 4
+// CHECK-32-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB3]], i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32, i32, i32*, i32)* @.omp_outlined..4 to void (i32*, i32*, ...)*), i32 [[TMP14]], i32 [[TMP15]], i32* [[TMP16]], i32 [[TMP18]])
+// CHECK-32-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32: omp.inner.for.inc:
+// CHECK-32-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK-32-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-32: omp.inner.for.end:
+// CHECK-32-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32: omp.loop.exit:
+// CHECK-32-NEXT: [[TMP21:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4
+// CHECK-32-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP22]])
+// CHECK-32-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK-32: omp.precond.end:
+// CHECK-32-NEXT: ret void
+// CHECK-32-LABEL: define {{[^@]+}}@.omp_outlined..4
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32* noundef [[PLOCAL:%.*]], i32 noundef [[GLOBAL:%.*]]) #[[ATTR2]] {
+// CHECK-32-NEXT: entry:
+// CHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[PLOCAL_ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT: [[GLOBAL_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT: store i32* [[PLOCAL]], i32** [[PLOCAL_ADDR]], align 4
+// CHECK-32-NEXT: store i32 [[GLOBAL]], i32* [[GLOBAL_ADDR]], align 4
+// CHECK-32-NEXT: [[TMP0:%.*]] = load i32*, i32** [[PLOCAL_ADDR]], align 4
+// CHECK-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// CHECK-32-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-32-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-32-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// CHECK-32-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK-32: omp.precond.then:
+// CHECK-32-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// CHECK-32-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+// CHECK-32-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32: cond.true:
+// CHECK-32-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-NEXT: br label [[COND_END:%.*]]
+// CHECK-32: cond.false:
+// CHECK-32-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT: br label [[COND_END]]
+// CHECK-32: cond.end:
+// CHECK-32-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK-32-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32: omp.inner.for.cond:
+// CHECK-32-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK-32-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32: omp.inner.for.body:
+// CHECK-32-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// CHECK-32-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK-32-NEXT: [[TMP17:%.*]] = load i32, i32* [[GLOBAL_ADDR]], align 4
+// CHECK-32-NEXT: [[TMP18:%.*]] = load i32*, i32** [[PLOCAL_ADDR]], align 4
+// CHECK-32-NEXT: store i32 [[TMP17]], i32* [[TMP18]], align 4
+// CHECK-32-NEXT: [[TMP19:%.*]] = load i32, i32* [[GLOBAL_ADDR]], align 4
+// CHECK-32-NEXT: store i32 [[TMP19]], i32* @_ZZ3fooiE6local1, align 4
+// CHECK-32-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32: omp.body.continue:
+// CHECK-32-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32: omp.inner.for.inc:
+// CHECK-32-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP20]], 1
+// CHECK-32-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-32: omp.inner.for.end:
+// CHECK-32-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32: omp.loop.exit:
+// CHECK-32-NEXT: [[TMP21:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4
+// CHECK-32-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP22]])
+// CHECK-32-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK-32: omp.precond.end:
+// CHECK-32-NEXT: ret void
+// CHECK-32-LABEL: define {{[^@]+}}@.omp_task_privates_map.
+// CHECK-32-SAME: (%struct..kmp_privates.t* noalias noundef [[TMP0:%.*]], i32*** noalias noundef [[TMP1:%.*]], i32** noalias noundef [[TMP2:%.*]], [2 x i8*]** noalias noundef [[TMP3:%.*]], [2 x i8*]** noalias noundef [[TMP4:%.*]], [2 x i64]** noalias noundef [[TMP5:%.*]]) #[[ATTR7:[0-9]+]] {
+// CHECK-32-NEXT: entry:
+// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t*, align 4
+// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i32***, align 4
+// CHECK-32-NEXT: [[DOTADDR2:%.*]] = alloca i32**, align 4
+// CHECK-32-NEXT: [[DOTADDR3:%.*]] = alloca [2 x i8*]**, align 4
+// CHECK-32-NEXT: [[DOTADDR4:%.*]] = alloca [2 x i8*]**, align 4
+// CHECK-32-NEXT: [[DOTADDR5:%.*]] = alloca [2 x i64]**, align 4
+// CHECK-32-NEXT: store %struct..kmp_privates.t* [[TMP0]], %struct..kmp_privates.t** [[DOTADDR]], align 4
+// CHECK-32-NEXT: store i32*** [[TMP1]], i32**** [[DOTADDR1]], align 4
+// CHECK-32-NEXT: store i32** [[TMP2]], i32*** [[DOTADDR2]], align 4
+// CHECK-32-NEXT: store [2 x i8*]** [[TMP3]], [2 x i8*]*** [[DOTADDR3]], align 4
+// CHECK-32-NEXT: store [2 x i8*]** [[TMP4]], [2 x i8*]*** [[DOTADDR4]], align 4
+// CHECK-32-NEXT: store [2 x i64]** [[TMP5]], [2 x i64]*** [[DOTADDR5]], align 4
+// CHECK-32-NEXT: [[TMP6:%.*]] = load %struct..kmp_privates.t*, %struct..kmp_privates.t** [[DOTADDR]], align 4
+// CHECK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP6]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP8:%.*]] = load [2 x i64]**, [2 x i64]*** [[DOTADDR5]], align 4
+// CHECK-32-NEXT: store [2 x i64]* [[TMP7]], [2 x i64]** [[TMP8]], align 4
+// CHECK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP6]], i32 0, i32 1
+// CHECK-32-NEXT: [[TMP10:%.*]] = load i32***, i32**** [[DOTADDR1]], align 4
+// CHECK-32-NEXT: store i32** [[TMP9]], i32*** [[TMP10]], align 4
+// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP6]], i32 0, i32 2
+// CHECK-32-NEXT: [[TMP12:%.*]] = load i32**, i32*** [[DOTADDR2]], align 4
+// CHECK-32-NEXT: store i32* [[TMP11]], i32** [[TMP12]], align 4
+// CHECK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP6]], i32 0, i32 3
+// CHECK-32-NEXT: [[TMP14:%.*]] = load [2 x i8*]**, [2 x i8*]*** [[DOTADDR3]], align 4
+// CHECK-32-NEXT: store [2 x i8*]* [[TMP13]], [2 x i8*]** [[TMP14]], align 4
+// CHECK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T]], %struct..kmp_privates.t* [[TMP6]], i32 0, i32 4
+// CHECK-32-NEXT: [[TMP16:%.*]] = load [2 x i8*]**, [2 x i8*]*** [[DOTADDR4]], align 4
+// CHECK-32-NEXT: store [2 x i8*]* [[TMP15]], [2 x i8*]** [[TMP16]], align 4
+// CHECK-32-NEXT: ret void
+// CHECK-32-LABEL: define {{[^@]+}}@.omp_task_entry..6
+// CHECK-32-SAME: (i32 noundef [[TMP0:%.*]], %struct.kmp_task_t_with_privates.1* noalias noundef [[TMP1:%.*]]) #[[ATTR4]] {
+// CHECK-32-NEXT: entry:
+// CHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca i8*, align 4
+// CHECK-32-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca void (i8*, ...)*, align 4
+// CHECK-32-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca i8*, align 4
+// CHECK-32-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca %struct.anon.0*, align 4
+// CHECK-32-NEXT: [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca i32**, align 4
+// CHECK-32-NEXT: [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT: [[DOTFIRSTPRIV_PTR_ADDR2_I:%.*]] = alloca [2 x i8*]*, align 4
+// CHECK-32-NEXT: [[DOTFIRSTPRIV_PTR_ADDR3_I:%.*]] = alloca [2 x i8*]*, align 4
+// CHECK-32-NEXT: [[DOTFIRSTPRIV_PTR_ADDR4_I:%.*]] = alloca [2 x i64]*, align 4
+// CHECK-32-NEXT: [[DOTCAPTURE_EXPR__I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTCAPTURE_EXPR_5_I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[GLOBAL_CASTED_I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[KERNEL_ARGS_I:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates.1*, align 4
+// CHECK-32-NEXT: store i32 [[TMP0]], i32* [[DOTADDR]], align 4
+// CHECK-32-NEXT: store %struct.kmp_task_t_with_privates.1* [[TMP1]], %struct.kmp_task_t_with_privates.1** [[DOTADDR1]], align 4
+// CHECK-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTADDR]], align 4
+// CHECK-32-NEXT: [[TMP3:%.*]] = load %struct.kmp_task_t_with_privates.1*, %struct.kmp_task_t_with_privates.1** [[DOTADDR1]], align 4
+// CHECK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_1:%.*]], %struct.kmp_task_t_with_privates.1* [[TMP3]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 2
+// CHECK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP7:%.*]] = load i8*, i8** [[TMP6]], align 4
+// CHECK-32-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct.anon.0*
+// CHECK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_1]], %struct.kmp_task_t_with_privates.1* [[TMP3]], i32 0, i32 1
+// CHECK-32-NEXT: [[TMP10:%.*]] = bitcast %struct..kmp_privates.t* [[TMP9]] to i8*
+// CHECK-32-NEXT: [[TMP11:%.*]] = bitcast %struct.kmp_task_t_with_privates.1* [[TMP3]] to i8*
+// CHECK-32-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META18:![0-9]+]])
+// CHECK-32-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META21:![0-9]+]])
+// CHECK-32-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META23:![0-9]+]])
+// CHECK-32-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META25:![0-9]+]])
+// CHECK-32-NEXT: store i32 [[TMP2]], i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !27
+// CHECK-32-NEXT: store i32* [[TMP5]], i32** [[DOTPART_ID__ADDR_I]], align 4, !noalias !27
+// CHECK-32-NEXT: store i8* [[TMP10]], i8** [[DOTPRIVATES__ADDR_I]], align 4, !noalias !27
+// CHECK-32-NEXT: store void (i8*, ...)* bitcast (void (%struct..kmp_privates.t*, i32***, i32**, [2 x i8*]**, [2 x i8*]**, [2 x i64]**)* @.omp_task_privates_map. to void (i8*, ...)*), void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !27
+// CHECK-32-NEXT: store i8* [[TMP11]], i8** [[DOTTASK_T__ADDR_I]], align 4, !noalias !27
+// CHECK-32-NEXT: store %struct.anon.0* [[TMP8]], %struct.anon.0** [[__CONTEXT_ADDR_I]], align 4, !noalias !27
+// CHECK-32-NEXT: [[TMP12:%.*]] = load %struct.anon.0*, %struct.anon.0** [[__CONTEXT_ADDR_I]], align 4, !noalias !27
+// CHECK-32-NEXT: [[TMP13:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !27
+// CHECK-32-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 4, !noalias !27
+// CHECK-32-NEXT: [[TMP15:%.*]] = bitcast void (i8*, ...)* [[TMP13]] to void (i8*, i32***, i32**, [2 x i8*]**, [2 x i8*]**, [2 x i64]**)*
+// CHECK-32-NEXT: call void [[TMP15]](i8* [[TMP14]], i32*** [[DOTFIRSTPRIV_PTR_ADDR_I]], i32** [[DOTFIRSTPRIV_PTR_ADDR1_I]], [2 x i8*]** [[DOTFIRSTPRIV_PTR_ADDR2_I]], [2 x i8*]** [[DOTFIRSTPRIV_PTR_ADDR3_I]], [2 x i64]** [[DOTFIRSTPRIV_PTR_ADDR4_I]]) #[[ATTR3]]
+// CHECK-32-NEXT: [[TMP16:%.*]] = load i32**, i32*** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias !27
+// CHECK-32-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 4, !noalias !27
+// CHECK-32-NEXT: [[TMP18:%.*]] = load [2 x i8*]*, [2 x i8*]** [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 4, !noalias !27
+// CHECK-32-NEXT: [[TMP19:%.*]] = load [2 x i8*]*, [2 x i8*]** [[DOTFIRSTPRIV_PTR_ADDR3_I]], align 4, !noalias !27
+// CHECK-32-NEXT: [[TMP20:%.*]] = load [2 x i64]*, [2 x i64]** [[DOTFIRSTPRIV_PTR_ADDR4_I]], align 4, !noalias !27
+// CHECK-32-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP18]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP19]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i64], [2 x i64]* [[TMP20]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_ANON_0:%.*]], %struct.anon.0* [[TMP12]], i32 0, i32 2
+// CHECK-32-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4
+// CHECK-32-NEXT: [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
+// CHECK-32-NEXT: [[TMP27:%.*]] = load i32*, i32** [[TMP16]], align 4
+// CHECK-32-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP27]], align 4
+// CHECK-32-NEXT: store i32 [[TMP28]], i32* [[DOTCAPTURE_EXPR__I]], align 4, !noalias !27
+// CHECK-32-NEXT: [[TMP29:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__I]], align 4, !noalias !27
+// CHECK-32-NEXT: [[SUB6_I:%.*]] = sub nsw i32 [[TMP29]], 1
+// CHECK-32-NEXT: store i32 [[SUB6_I]], i32* [[DOTCAPTURE_EXPR_5_I]], align 4, !noalias !27
+// CHECK-32-NEXT: [[TMP30:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_5_I]], align 4, !noalias !27
+// CHECK-32-NEXT: [[ADD_I:%.*]] = add nsw i32 [[TMP30]], 1
+// CHECK-32-NEXT: [[TMP31:%.*]] = zext i32 [[ADD_I]] to i64
+// CHECK-32-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 0
+// CHECK-32-NEXT: store i32 2, i32* [[TMP32]], align 4, !noalias !27
+// CHECK-32-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 1
+// CHECK-32-NEXT: store i32 2, i32* [[TMP33]], align 4, !noalias !27
+// CHECK-32-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 2
+// CHECK-32-NEXT: store i8** [[TMP21]], i8*** [[TMP34]], align 4, !noalias !27
+// CHECK-32-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 3
+// CHECK-32-NEXT: store i8** [[TMP22]], i8*** [[TMP35]], align 4, !noalias !27
+// CHECK-32-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 4
+// CHECK-32-NEXT: store i64* [[TMP23]], i64** [[TMP36]], align 4, !noalias !27
+// CHECK-32-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 5
+// CHECK-32-NEXT: store i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes, i32 0, i32 0), i64** [[TMP37]], align 4, !noalias !27
+// CHECK-32-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 6
+// CHECK-32-NEXT: store i8** null, i8*** [[TMP38]], align 4, !noalias !27
+// CHECK-32-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 7
+// CHECK-32-NEXT: store i8** null, i8*** [[TMP39]], align 4, !noalias !27
+// CHECK-32-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
+// CHECK-32-NEXT: store i64 [[TMP31]], i64* [[TMP40]], align 8, !noalias !27
+// CHECK-32-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 9
+// CHECK-32-NEXT: store i64 0, i64* [[TMP41]], align 8, !noalias !27
+// CHECK-32-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 10
+// CHECK-32-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP42]], align 4, !noalias !27
+// CHECK-32-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 11
+// CHECK-32-NEXT: store [3 x i32] zeroinitializer, [3 x i32]* [[TMP43]], align 4, !noalias !27
+// CHECK-32-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 12
+// CHECK-32-NEXT: store i32 0, i32* [[TMP44]], align 4, !noalias !27
+// CHECK-32-NEXT: [[TMP45:%.*]] = call i32 @__tgt_target_kernel(%struct.ident_t* @[[GLOB3]], i64 [[TMP26]], i32 0, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l76.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]])
+// CHECK-32-NEXT: [[TMP46:%.*]] = icmp ne i32 [[TMP45]], 0
+// CHECK-32-NEXT: br i1 [[TMP46]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__5_EXIT:%.*]]
+// CHECK-32: omp_offload.failed.i:
+// CHECK-32-NEXT: [[TMP47:%.*]] = load i32*, i32** [[TMP16]], align 4
+// CHECK-32-NEXT: [[TMP48:%.*]] = load i32, i32* @global, align 4, !noalias !27
+// CHECK-32-NEXT: store i32 [[TMP48]], i32* [[GLOBAL_CASTED_I]], align 4, !noalias !27
+// CHECK-32-NEXT: [[TMP49:%.*]] = load i32, i32* [[GLOBAL_CASTED_I]], align 4, !noalias !27
+// CHECK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l76(i32* [[TMP47]], i32 [[TMP49]]) #[[ATTR3]]
+// CHECK-32-NEXT: br label [[DOTOMP_OUTLINED__5_EXIT]]
+// CHECK-32: .omp_outlined..5.exit:
+// CHECK-32-NEXT: ret i32 0
+// CHECK-32-LABEL: define {{[^@]+}}@.omp_task_privates_map..8
+// CHECK-32-SAME: (%struct..kmp_privates.t.3* noalias noundef [[TMP0:%.*]], i32*** noalias noundef [[TMP1:%.*]], i32** noalias noundef [[TMP2:%.*]]) #[[ATTR7]] {
+// CHECK-32-NEXT: entry:
+// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t.3*, align 4
+// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i32***, align 4
+// CHECK-32-NEXT: [[DOTADDR2:%.*]] = alloca i32**, align 4
+// CHECK-32-NEXT: store %struct..kmp_privates.t.3* [[TMP0]], %struct..kmp_privates.t.3** [[DOTADDR]], align 4
+// CHECK-32-NEXT: store i32*** [[TMP1]], i32**** [[DOTADDR1]], align 4
+// CHECK-32-NEXT: store i32** [[TMP2]], i32*** [[DOTADDR2]], align 4
+// CHECK-32-NEXT: [[TMP3:%.*]] = load %struct..kmp_privates.t.3*, %struct..kmp_privates.t.3** [[DOTADDR]], align 4
+// CHECK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3:%.*]], %struct..kmp_privates.t.3* [[TMP3]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP5:%.*]] = load i32***, i32**** [[DOTADDR1]], align 4
+// CHECK-32-NEXT: store i32** [[TMP4]], i32*** [[TMP5]], align 4
+// CHECK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_3]], %struct..kmp_privates.t.3* [[TMP3]], i32 0, i32 1
+// CHECK-32-NEXT: [[TMP7:%.*]] = load i32**, i32*** [[DOTADDR2]], align 4
+// CHECK-32-NEXT: store i32* [[TMP6]], i32** [[TMP7]], align 4
+// CHECK-32-NEXT: ret void
+// CHECK-32-LABEL: define {{[^@]+}}@.omp_task_entry..9
+// CHECK-32-SAME: (i32 noundef [[TMP0:%.*]], %struct.kmp_task_t_with_privates.2* noalias noundef [[TMP1:%.*]]) #[[ATTR4]] {
+// CHECK-32-NEXT: entry:
+// CHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca i8*, align 4
+// CHECK-32-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca void (i8*, ...)*, align 4
+// CHECK-32-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca i8*, align 4
+// CHECK-32-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca %struct.anon.0*, align 4
+// CHECK-32-NEXT: [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca i32**, align 4
+// CHECK-32-NEXT: [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT: [[GLOBAL_CASTED_I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates.2*, align 4
+// CHECK-32-NEXT: store i32 [[TMP0]], i32* [[DOTADDR]], align 4
+// CHECK-32-NEXT: store %struct.kmp_task_t_with_privates.2* [[TMP1]], %struct.kmp_task_t_with_privates.2** [[DOTADDR1]], align 4
+// CHECK-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTADDR]], align 4
+// CHECK-32-NEXT: [[TMP3:%.*]] = load %struct.kmp_task_t_with_privates.2*, %struct.kmp_task_t_with_privates.2** [[DOTADDR1]], align 4
+// CHECK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_2:%.*]], %struct.kmp_task_t_with_privates.2* [[TMP3]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 2
+// CHECK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP7:%.*]] = load i8*, i8** [[TMP6]], align 4
+// CHECK-32-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct.anon.0*
+// CHECK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_2]], %struct.kmp_task_t_with_privates.2* [[TMP3]], i32 0, i32 1
+// CHECK-32-NEXT: [[TMP10:%.*]] = bitcast %struct..kmp_privates.t.3* [[TMP9]] to i8*
+// CHECK-32-NEXT: [[TMP11:%.*]] = bitcast %struct.kmp_task_t_with_privates.2* [[TMP3]] to i8*
+// CHECK-32-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META28:![0-9]+]])
+// CHECK-32-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META31:![0-9]+]])
+// CHECK-32-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META33:![0-9]+]])
+// CHECK-32-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META35:![0-9]+]])
+// CHECK-32-NEXT: store i32 [[TMP2]], i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !37
+// CHECK-32-NEXT: store i32* [[TMP5]], i32** [[DOTPART_ID__ADDR_I]], align 4, !noalias !37
+// CHECK-32-NEXT: store i8* [[TMP10]], i8** [[DOTPRIVATES__ADDR_I]], align 4, !noalias !37
+// CHECK-32-NEXT: store void (i8*, ...)* bitcast (void (%struct..kmp_privates.t.3*, i32***, i32**)* @.omp_task_privates_map..8 to void (i8*, ...)*), void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !37
+// CHECK-32-NEXT: store i8* [[TMP11]], i8** [[DOTTASK_T__ADDR_I]], align 4, !noalias !37
+// CHECK-32-NEXT: store %struct.anon.0* [[TMP8]], %struct.anon.0** [[__CONTEXT_ADDR_I]], align 4, !noalias !37
+// CHECK-32-NEXT: [[TMP12:%.*]] = load %struct.anon.0*, %struct.anon.0** [[__CONTEXT_ADDR_I]], align 4, !noalias !37
+// CHECK-32-NEXT: [[TMP13:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !37
+// CHECK-32-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 4, !noalias !37
+// CHECK-32-NEXT: [[TMP15:%.*]] = bitcast void (i8*, ...)* [[TMP13]] to void (i8*, i32***, i32**)*
+// CHECK-32-NEXT: call void [[TMP15]](i8* [[TMP14]], i32*** [[DOTFIRSTPRIV_PTR_ADDR_I]], i32** [[DOTFIRSTPRIV_PTR_ADDR1_I]]) #[[ATTR3]]
+// CHECK-32-NEXT: [[TMP16:%.*]] = load i32**, i32*** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias !37
+// CHECK-32-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 4, !noalias !37
+// CHECK-32-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_ANON_0:%.*]], %struct.anon.0* [[TMP12]], i32 0, i32 2
+// CHECK-32-NEXT: [[TMP19:%.*]] = load i32*, i32** [[TMP16]], align 4
+// CHECK-32-NEXT: [[TMP20:%.*]] = load i32, i32* @global, align 4, !noalias !37
+// CHECK-32-NEXT: store i32 [[TMP20]], i32* [[GLOBAL_CASTED_I]], align 4, !noalias !37
+// CHECK-32-NEXT: [[TMP21:%.*]] = load i32, i32* [[GLOBAL_CASTED_I]], align 4, !noalias !37
+// CHECK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l76(i32* [[TMP19]], i32 [[TMP21]]) #[[ATTR3]]
+// CHECK-32-NEXT: ret i32 0
+// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l83
+// CHECK-32-SAME: (i32 noundef [[GLOBAL:%.*]]) #[[ATTR2]] {
+// CHECK-32-NEXT: entry:
+// CHECK-32-NEXT: [[GLOBAL_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[GLOBAL_CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: store i32 [[GLOBAL]], i32* [[GLOBAL_ADDR]], align 4
+// CHECK-32-NEXT: [[TMP0:%.*]] = load i32, i32* [[GLOBAL_ADDR]], align 4
+// CHECK-32-NEXT: store i32 [[TMP0]], i32* [[GLOBAL_CASTED]], align 4
+// CHECK-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[GLOBAL_CASTED]], align 4
+// CHECK-32-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @[[GLOB3]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32)* @.omp_outlined..10 to void (i32*, i32*, ...)*), i32 [[TMP1]])
+// CHECK-32-NEXT: ret void
+// CHECK-32-LABEL: define {{[^@]+}}@.omp_outlined..10
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[GLOBAL:%.*]]) #[[ATTR2]] {
+// CHECK-32-NEXT: entry:
+// CHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT: [[GLOBAL_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[GLOBAL_CASTED:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTBOUND_ZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT: store i32 [[GLOBAL]], i32* [[GLOBAL_ADDR]], align 4
+// CHECK-32-NEXT: [[TMP0:%.*]] = load i32, i32* [[GLOBAL_ADDR]], align 4
+// CHECK-32-NEXT: store i32 [[TMP0]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
+// CHECK-32-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-32-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-32-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
+// CHECK-32-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK-32: omp.precond.then:
+// CHECK-32-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-NEXT: store i32 [[TMP3]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
+// CHECK-32-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
+// CHECK-32-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32: cond.true:
+// CHECK-32-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-NEXT: br label [[COND_END:%.*]]
+// CHECK-32: cond.false:
+// CHECK-32-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT: br label [[COND_END]]
+// CHECK-32: cond.end:
+// CHECK-32-NEXT: [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ]
+// CHECK-32-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT: store i32 [[TMP10]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32: omp.inner.for.cond:
+// CHECK-32-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]]
+// CHECK-32-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32: omp.inner.for.body:
+// CHECK-32-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// CHECK-32-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// CHECK-32-NEXT: [[TMP15:%.*]] = load i32, i32* [[GLOBAL_ADDR]], align 4
+// CHECK-32-NEXT: store i32 [[TMP15]], i32* [[GLOBAL_CASTED]], align 4
+// CHECK-32-NEXT: [[TMP16:%.*]] = load i32, i32* [[GLOBAL_CASTED]], align 4
+// CHECK-32-NEXT: [[TMP17:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
+// CHECK-32-NEXT: call void @__kmpc_serialized_parallel(%struct.ident_t* @[[GLOB3]], i32 [[TMP18]])
+// CHECK-32-NEXT: [[TMP19:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT: store i32 0, i32* [[DOTBOUND_ZERO_ADDR]], align 4
+// CHECK-32-NEXT: call void @.omp_outlined..11(i32* [[TMP19]], i32* [[DOTBOUND_ZERO_ADDR]], i32 [[TMP13]], i32 [[TMP14]], i32 [[TMP16]]) #[[ATTR3]]
+// CHECK-32-NEXT: call void @__kmpc_end_serialized_parallel(%struct.ident_t* @[[GLOB3]], i32 [[TMP18]])
+// CHECK-32-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32: omp.inner.for.inc:
+// CHECK-32-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK-32-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-32: omp.inner.for.end:
+// CHECK-32-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32: omp.loop.exit:
+// CHECK-32-NEXT: [[TMP22:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4
+// CHECK-32-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP23]])
+// CHECK-32-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK-32: omp.precond.end:
+// CHECK-32-NEXT: ret void
+// CHECK-32-LABEL: define {{[^@]+}}@.omp_outlined..11
+// CHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[GLOBAL:%.*]]) #[[ATTR2]] {
+// CHECK-32-NEXT: entry:
+// CHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[GLOBAL_ADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// CHECK-32-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT: store i32 [[GLOBAL]], i32* [[GLOBAL_ADDR]], align 4
+// CHECK-32-NEXT: [[TMP0:%.*]] = load i32, i32* [[GLOBAL_ADDR]], align 4
+// CHECK-32-NEXT: store i32 [[TMP0]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
+// CHECK-32-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-32-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-32-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-NEXT: store i32 0, i32* [[I]], align 4
+// CHECK-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-32-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
+// CHECK-32-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK-32: omp.precond.then:
+// CHECK-32-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-NEXT: store i32 [[TMP3]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK-32-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// CHECK-32-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// CHECK-32-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+// CHECK-32-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP7]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-32-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+// CHECK-32-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-32: cond.true:
+// CHECK-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK-32-NEXT: br label [[COND_END:%.*]]
+// CHECK-32: cond.false:
+// CHECK-32-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT: br label [[COND_END]]
+// CHECK-32: cond.end:
+// CHECK-32-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
+// CHECK-32-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// CHECK-32-NEXT: store i32 [[TMP12]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-32: omp.inner.for.cond:
+// CHECK-32-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// CHECK-32-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
+// CHECK-32-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-32: omp.inner.for.body:
+// CHECK-32-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
+// CHECK-32-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-32-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// CHECK-32-NEXT: [[TMP16:%.*]] = load i32, i32* [[GLOBAL_ADDR]], align 4
+// CHECK-32-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP16]], 1
+// CHECK-32-NEXT: store i32 [[ADD6]], i32* [[GLOBAL_ADDR]], align 4
+// CHECK-32-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-32: omp.body.continue:
+// CHECK-32-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-32: omp.inner.for.inc:
+// CHECK-32-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP17]], 1
+// CHECK-32-NEXT: store i32 [[ADD7]], i32* [[DOTOMP_IV]], align 4
+// CHECK-32-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-32: omp.inner.for.end:
+// CHECK-32-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-32: omp.loop.exit:
+// CHECK-32-NEXT: [[TMP18:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK-32-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4
+// CHECK-32-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP19]])
+// CHECK-32-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK-32: omp.precond.end:
+// CHECK-32-NEXT: ret void
+// CHECK-32-LABEL: define {{[^@]+}}@.omp_task_privates_map..13
+// CHECK-32-SAME: (%struct..kmp_privates.t.6* noalias noundef [[TMP0:%.*]], i32** noalias noundef [[TMP1:%.*]]) #[[ATTR7]] {
+// CHECK-32-NEXT: entry:
+// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca %struct..kmp_privates.t.6*, align 4
+// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i32**, align 4
+// CHECK-32-NEXT: store %struct..kmp_privates.t.6* [[TMP0]], %struct..kmp_privates.t.6** [[DOTADDR]], align 4
+// CHECK-32-NEXT: store i32** [[TMP1]], i32*** [[DOTADDR1]], align 4
+// CHECK-32-NEXT: [[TMP2:%.*]] = load %struct..kmp_privates.t.6*, %struct..kmp_privates.t.6** [[DOTADDR]], align 4
+// CHECK-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T_6:%.*]], %struct..kmp_privates.t.6* [[TMP2]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP4:%.*]] = load i32**, i32*** [[DOTADDR1]], align 4
+// CHECK-32-NEXT: store i32* [[TMP3]], i32** [[TMP4]], align 4
+// CHECK-32-NEXT: ret void
+// CHECK-32-LABEL: define {{[^@]+}}@.omp_task_entry..14
+// CHECK-32-SAME: (i32 noundef [[TMP0:%.*]], %struct.kmp_task_t_with_privates.5* noalias noundef [[TMP1:%.*]]) #[[ATTR4]] {
+// CHECK-32-NEXT: entry:
+// CHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca i8*, align 4
+// CHECK-32-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca void (i8*, ...)*, align 4
+// CHECK-32-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca i8*, align 4
+// CHECK-32-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca %struct.anon.4*, align 4
+// CHECK-32-NEXT: [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca i32*, align 4
+// CHECK-32-NEXT: [[GLOBAL_CASTED_I:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca %struct.kmp_task_t_with_privates.5*, align 4
+// CHECK-32-NEXT: store i32 [[TMP0]], i32* [[DOTADDR]], align 4
+// CHECK-32-NEXT: store %struct.kmp_task_t_with_privates.5* [[TMP1]], %struct.kmp_task_t_with_privates.5** [[DOTADDR1]], align 4
+// CHECK-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTADDR]], align 4
+// CHECK-32-NEXT: [[TMP3:%.*]] = load %struct.kmp_task_t_with_privates.5*, %struct.kmp_task_t_with_privates.5** [[DOTADDR1]], align 4
+// CHECK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_5:%.*]], %struct.kmp_task_t_with_privates.5* [[TMP3]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 2
+// CHECK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], %struct.kmp_task_t* [[TMP4]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP7:%.*]] = load i8*, i8** [[TMP6]], align 4
+// CHECK-32-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to %struct.anon.4*
+// CHECK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_5]], %struct.kmp_task_t_with_privates.5* [[TMP3]], i32 0, i32 1
+// CHECK-32-NEXT: [[TMP10:%.*]] = bitcast %struct..kmp_privates.t.6* [[TMP9]] to i8*
+// CHECK-32-NEXT: [[TMP11:%.*]] = bitcast %struct.kmp_task_t_with_privates.5* [[TMP3]] to i8*
+// CHECK-32-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META38:![0-9]+]])
+// CHECK-32-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META41:![0-9]+]])
+// CHECK-32-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META43:![0-9]+]])
+// CHECK-32-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META45:![0-9]+]])
+// CHECK-32-NEXT: store i32 [[TMP2]], i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !47
+// CHECK-32-NEXT: store i32* [[TMP5]], i32** [[DOTPART_ID__ADDR_I]], align 4, !noalias !47
+// CHECK-32-NEXT: store i8* [[TMP10]], i8** [[DOTPRIVATES__ADDR_I]], align 4, !noalias !47
+// CHECK-32-NEXT: store void (i8*, ...)* bitcast (void (%struct..kmp_privates.t.6*, i32**)* @.omp_task_privates_map..13 to void (i8*, ...)*), void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !47
+// CHECK-32-NEXT: store i8* [[TMP11]], i8** [[DOTTASK_T__ADDR_I]], align 4, !noalias !47
+// CHECK-32-NEXT: store %struct.anon.4* [[TMP8]], %struct.anon.4** [[__CONTEXT_ADDR_I]], align 4, !noalias !47
+// CHECK-32-NEXT: [[TMP12:%.*]] = load %struct.anon.4*, %struct.anon.4** [[__CONTEXT_ADDR_I]], align 4, !noalias !47
+// CHECK-32-NEXT: [[TMP13:%.*]] = load void (i8*, ...)*, void (i8*, ...)** [[DOTCOPY_FN__ADDR_I]], align 4, !noalias !47
+// CHECK-32-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTPRIVATES__ADDR_I]], align 4, !noalias !47
+// CHECK-32-NEXT: [[TMP15:%.*]] = bitcast void (i8*, ...)* [[TMP13]] to void (i8*, i32**)*
+// CHECK-32-NEXT: call void [[TMP15]](i8* [[TMP14]], i32** [[DOTFIRSTPRIV_PTR_ADDR_I]]) #[[ATTR3]]
+// CHECK-32-NEXT: [[TMP16:%.*]] = load i32*, i32** [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias !47
+// CHECK-32-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4
+// CHECK-32-NEXT: store i32 [[TMP17]], i32* [[GLOBAL_CASTED_I]], align 4, !noalias !47
+// CHECK-32-NEXT: [[TMP18:%.*]] = load i32, i32* [[GLOBAL_CASTED_I]], align 4, !noalias !47
+// CHECK-32-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l83(i32 [[TMP18]]) #[[ATTR3]]
+// CHECK-32-NEXT: ret i32 0
+// CHECK-32-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK-32-SAME: () #[[ATTR7]] {
+// CHECK-32-NEXT: entry:
+// CHECK-32-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK-32-NEXT: ret void
+// TCHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l66
+// TCHECK-64-SAME: () #[[ATTR0:[0-9]+]] {
+// TCHECK-64-NEXT: entry:
+// TCHECK-64-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @[[GLOB3:[0-9]+]], i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*))
+// TCHECK-64-NEXT: ret void
+// TCHECK-64-LABEL: define {{[^@]+}}@.omp_outlined.
+// TCHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
+// TCHECK-64-NEXT: entry:
+// TCHECK-64-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// TCHECK-64-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// TCHECK-64-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[I:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// TCHECK-64-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// TCHECK-64-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// TCHECK-64-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// TCHECK-64-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// TCHECK-64-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// TCHECK-64-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// TCHECK-64-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// TCHECK-64-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// TCHECK-64-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// TCHECK-64-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// TCHECK-64-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// TCHECK-64: cond.true:
+// TCHECK-64-NEXT: br label [[COND_END:%.*]]
+// TCHECK-64: cond.false:
+// TCHECK-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// TCHECK-64-NEXT: br label [[COND_END]]
+// TCHECK-64: cond.end:
+// TCHECK-64-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// TCHECK-64-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// TCHECK-64-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// TCHECK-64-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// TCHECK-64-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// TCHECK-64: omp.inner.for.cond:
+// TCHECK-64-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// TCHECK-64-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// TCHECK-64-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// TCHECK-64-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// TCHECK-64: omp.inner.for.body:
+// TCHECK-64-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// TCHECK-64-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// TCHECK-64-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// TCHECK-64-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// TCHECK-64-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB3]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i64 [[TMP8]], i64 [[TMP10]])
+// TCHECK-64-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// TCHECK-64: omp.inner.for.inc:
+// TCHECK-64-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// TCHECK-64-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// TCHECK-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// TCHECK-64-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// TCHECK-64-NEXT: br label [[OMP_INNER_FOR_COND]]
+// TCHECK-64: omp.inner.for.end:
+// TCHECK-64-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// TCHECK-64: omp.loop.exit:
+// TCHECK-64-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]])
+// TCHECK-64-NEXT: ret void
+// TCHECK-64-LABEL: define {{[^@]+}}@.omp_outlined..1
+// TCHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR0]] {
+// TCHECK-64-NEXT: entry:
+// TCHECK-64-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// TCHECK-64-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// TCHECK-64-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// TCHECK-64-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// TCHECK-64-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[I:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// TCHECK-64-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// TCHECK-64-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// TCHECK-64-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// TCHECK-64-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// TCHECK-64-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
+// TCHECK-64-NEXT: [[TMP0:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// TCHECK-64-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// TCHECK-64-NEXT: [[TMP1:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// TCHECK-64-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// TCHECK-64-NEXT: store i32 [[CONV]], i32* [[DOTOMP_LB]], align 4
+// TCHECK-64-NEXT: store i32 [[CONV1]], i32* [[DOTOMP_UB]], align 4
+// TCHECK-64-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// TCHECK-64-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// TCHECK-64-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// TCHECK-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// TCHECK-64-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// TCHECK-64-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// TCHECK-64-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
+// TCHECK-64-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// TCHECK-64: cond.true:
+// TCHECK-64-NEXT: br label [[COND_END:%.*]]
+// TCHECK-64: cond.false:
+// TCHECK-64-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// TCHECK-64-NEXT: br label [[COND_END]]
+// TCHECK-64: cond.end:
+// TCHECK-64-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// TCHECK-64-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// TCHECK-64-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// TCHECK-64-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4
+// TCHECK-64-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// TCHECK-64: omp.inner.for.cond:
+// TCHECK-64-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// TCHECK-64-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// TCHECK-64-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// TCHECK-64-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// TCHECK-64: omp.inner.for.body:
+// TCHECK-64-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// TCHECK-64-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// TCHECK-64-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// TCHECK-64-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// TCHECK-64-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// TCHECK-64: omp.body.continue:
+// TCHECK-64-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// TCHECK-64: omp.inner.for.inc:
+// TCHECK-64-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// TCHECK-64-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// TCHECK-64-NEXT: store i32 [[ADD3]], i32* [[DOTOMP_IV]], align 4
+// TCHECK-64-NEXT: br label [[OMP_INNER_FOR_COND]]
+// TCHECK-64: omp.inner.for.end:
+// TCHECK-64-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// TCHECK-64: omp.loop.exit:
+// TCHECK-64-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// TCHECK-64-NEXT: ret void
+// TCHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l76
+// TCHECK-64-SAME: (i64* noundef [[PLOCAL:%.*]], i64 noundef [[GLOBAL:%.*]]) #[[ATTR0]] {
+// TCHECK-64-NEXT: entry:
+// TCHECK-64-NEXT: [[PLOCAL_ADDR:%.*]] = alloca i64*, align 8
+// TCHECK-64-NEXT: [[GLOBAL_ADDR:%.*]] = alloca i64, align 8
+// TCHECK-64-NEXT: [[GLOBAL_CASTED:%.*]] = alloca i64, align 8
+// TCHECK-64-NEXT: store i64* [[PLOCAL]], i64** [[PLOCAL_ADDR]], align 8
+// TCHECK-64-NEXT: store i64 [[GLOBAL]], i64* [[GLOBAL_ADDR]], align 8
+// TCHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[GLOBAL_ADDR]] to i32*
+// TCHECK-64-NEXT: [[TMP0:%.*]] = load i64*, i64** [[PLOCAL_ADDR]], align 8
+// TCHECK-64-NEXT: [[TMP1:%.*]] = load i32, i32* [[CONV]], align 4
+// TCHECK-64-NEXT: [[CONV1:%.*]] = bitcast i64* [[GLOBAL_CASTED]] to i32*
+// TCHECK-64-NEXT: store i32 [[TMP1]], i32* [[CONV1]], align 4
+// TCHECK-64-NEXT: [[TMP2:%.*]] = load i64, i64* [[GLOBAL_CASTED]], align 8
+// TCHECK-64-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @[[GLOB3]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64*, i64)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i64* [[TMP0]], i64 [[TMP2]])
+// TCHECK-64-NEXT: ret void
+// TCHECK-64-LABEL: define {{[^@]+}}@.omp_outlined..2
+// TCHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64* noundef [[PLOCAL:%.*]], i64 noundef [[GLOBAL:%.*]]) #[[ATTR0]] {
+// TCHECK-64-NEXT: entry:
+// TCHECK-64-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// TCHECK-64-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// TCHECK-64-NEXT: [[PLOCAL_ADDR:%.*]] = alloca i64*, align 8
+// TCHECK-64-NEXT: [[GLOBAL_ADDR:%.*]] = alloca i64, align 8
+// TCHECK-64-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i64, align 8
+// TCHECK-64-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[I:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[I4:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[GLOBAL_CASTED:%.*]] = alloca i64, align 8
+// TCHECK-64-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// TCHECK-64-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// TCHECK-64-NEXT: store i64* [[PLOCAL]], i64** [[PLOCAL_ADDR]], align 8
+// TCHECK-64-NEXT: store i64 [[GLOBAL]], i64* [[GLOBAL_ADDR]], align 8
+// TCHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[GLOBAL_ADDR]] to i32*
+// TCHECK-64-NEXT: [[TMP0:%.*]] = load i64*, i64** [[PLOCAL_ADDR]], align 8
+// TCHECK-64-NEXT: [[TMP1:%.*]] = load i64, i64* [[TMP0]], align 8
+// TCHECK-64-NEXT: store i64 [[TMP1]], i64* [[DOTCAPTURE_EXPR_]], align 8
+// TCHECK-64-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_]], align 8
+// TCHECK-64-NEXT: [[SUB:%.*]] = sub nsw i64 [[TMP2]], 0
+// TCHECK-64-NEXT: [[DIV:%.*]] = sdiv i64 [[SUB]], 1
+// TCHECK-64-NEXT: [[CONV2:%.*]] = trunc i64 [[DIV]] to i32
+// TCHECK-64-NEXT: [[SUB3:%.*]] = sub nsw i32 [[CONV2]], 1
+// TCHECK-64-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// TCHECK-64-NEXT: store i32 0, i32* [[I]], align 4
+// TCHECK-64-NEXT: [[TMP3:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_]], align 8
+// TCHECK-64-NEXT: [[CMP:%.*]] = icmp slt i64 0, [[TMP3]]
+// TCHECK-64-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// TCHECK-64: omp.precond.then:
+// TCHECK-64-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// TCHECK-64-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// TCHECK-64-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// TCHECK-64-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// TCHECK-64-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// TCHECK-64-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// TCHECK-64-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// TCHECK-64-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// TCHECK-64-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// TCHECK-64-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// TCHECK-64-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// TCHECK-64-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// TCHECK-64: cond.true:
+// TCHECK-64-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// TCHECK-64-NEXT: br label [[COND_END:%.*]]
+// TCHECK-64: cond.false:
+// TCHECK-64-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// TCHECK-64-NEXT: br label [[COND_END]]
+// TCHECK-64: cond.end:
+// TCHECK-64-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// TCHECK-64-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// TCHECK-64-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// TCHECK-64-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// TCHECK-64-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// TCHECK-64: omp.inner.for.cond:
+// TCHECK-64-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// TCHECK-64-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// TCHECK-64-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// TCHECK-64-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// TCHECK-64: omp.inner.for.body:
+// TCHECK-64-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// TCHECK-64-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+// TCHECK-64-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// TCHECK-64-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// TCHECK-64-NEXT: [[TMP18:%.*]] = load i64*, i64** [[PLOCAL_ADDR]], align 8
+// TCHECK-64-NEXT: [[TMP19:%.*]] = load i32, i32* [[CONV]], align 4
+// TCHECK-64-NEXT: [[CONV7:%.*]] = bitcast i64* [[GLOBAL_CASTED]] to i32*
+// TCHECK-64-NEXT: store i32 [[TMP19]], i32* [[CONV7]], align 4
+// TCHECK-64-NEXT: [[TMP20:%.*]] = load i64, i64* [[GLOBAL_CASTED]], align 8
+// TCHECK-64-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB3]], i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i64*, i64)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i64 [[TMP15]], i64 [[TMP17]], i64* [[TMP18]], i64 [[TMP20]])
+// TCHECK-64-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// TCHECK-64: omp.inner.for.inc:
+// TCHECK-64-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// TCHECK-64-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// TCHECK-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP21]], [[TMP22]]
+// TCHECK-64-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// TCHECK-64-NEXT: br label [[OMP_INNER_FOR_COND]]
+// TCHECK-64: omp.inner.for.end:
+// TCHECK-64-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// TCHECK-64: omp.loop.exit:
+// TCHECK-64-NEXT: [[TMP23:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// TCHECK-64-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP23]], align 4
+// TCHECK-64-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP24]])
+// TCHECK-64-NEXT: br label [[OMP_PRECOND_END]]
+// TCHECK-64: omp.precond.end:
+// TCHECK-64-NEXT: ret void
+// TCHECK-64-LABEL: define {{[^@]+}}@.omp_outlined..3
+// TCHECK-64-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64* noundef [[PLOCAL:%.*]], i64 noundef [[GLOBAL:%.*]]) #[[ATTR0]] {
+// TCHECK-64-NEXT: entry:
+// TCHECK-64-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// TCHECK-64-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// TCHECK-64-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// TCHECK-64-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// TCHECK-64-NEXT: [[PLOCAL_ADDR:%.*]] = alloca i64*, align 8
+// TCHECK-64-NEXT: [[GLOBAL_ADDR:%.*]] = alloca i64, align 8
+// TCHECK-64-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i64, align 8
+// TCHECK-64-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[I:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: [[I6:%.*]] = alloca i32, align 4
+// TCHECK-64-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// TCHECK-64-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// TCHECK-64-NEXT: store i64 [[DOTPREVIOUS_LB_]], i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// TCHECK-64-NEXT: store i64 [[DOTPREVIOUS_UB_]], i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// TCHECK-64-NEXT: store i64* [[PLOCAL]], i64** [[PLOCAL_ADDR]], align 8
+// TCHECK-64-NEXT: store i64 [[GLOBAL]], i64* [[GLOBAL_ADDR]], align 8
+// TCHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[GLOBAL_ADDR]] to i32*
+// TCHECK-64-NEXT: [[TMP0:%.*]] = load i64*, i64** [[PLOCAL_ADDR]], align 8
+// TCHECK-64-NEXT: [[TMP1:%.*]] = load i64, i64* [[TMP0]], align 8
+// TCHECK-64-NEXT: store i64 [[TMP1]], i64* [[DOTCAPTURE_EXPR_]], align 8
+// TCHECK-64-NEXT: [[TMP2:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_]], align 8
+// TCHECK-64-NEXT: [[SUB:%.*]] = sub nsw i64 [[TMP2]], 0
+// TCHECK-64-NEXT: [[DIV:%.*]] = sdiv i64 [[SUB]], 1
+// TCHECK-64-NEXT: [[CONV2:%.*]] = trunc i64 [[DIV]] to i32
+// TCHECK-64-NEXT: [[SUB3:%.*]] = sub nsw i32 [[CONV2]], 1
+// TCHECK-64-NEXT: store i32 [[SUB3]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// TCHECK-64-NEXT: store i32 0, i32* [[I]], align 4
+// TCHECK-64-NEXT: [[TMP3:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR_]], align 8
+// TCHECK-64-NEXT: [[CMP:%.*]] = icmp slt i64 0, [[TMP3]]
+// TCHECK-64-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// TCHECK-64: omp.precond.then:
+// TCHECK-64-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// TCHECK-64-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// TCHECK-64-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// TCHECK-64-NEXT: [[TMP5:%.*]] = load i64, i64* [[DOTPREVIOUS_LB__ADDR]], align 8
+// TCHECK-64-NEXT: [[CONV4:%.*]] = trunc i64 [[TMP5]] to i32
+// TCHECK-64-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTPREVIOUS_UB__ADDR]], align 8
+// TCHECK-64-NEXT: [[CONV5:%.*]] = trunc i64 [[TMP6]] to i32
+// TCHECK-64-NEXT: store i32 [[CONV4]], i32* [[DOTOMP_LB]], align 4
+// TCHECK-64-NEXT: store i32 [[CONV5]], i32* [[DOTOMP_UB]], align 4
+// TCHECK-64-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// TCHECK-64-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// TCHECK-64-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// TCHECK-64-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// TCHECK-64-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// TCHECK-64-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// TCHECK-64-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// TCHECK-64-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+// TCHECK-64-NEXT: br i1 [[CMP7]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// TCHECK-64: cond.true:
+// TCHECK-64-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// TCHECK-64-NEXT: br label [[COND_END:%.*]]
+// TCHECK-64: cond.false:
+// TCHECK-64-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// TCHECK-64-NEXT: br label [[COND_END]]
+// TCHECK-64: cond.end:
+// TCHECK-64-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// TCHECK-64-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// TCHECK-64-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// TCHECK-64-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// TCHECK-64-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// TCHECK-64: omp.inner.for.cond:
+// TCHECK-64-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// TCHECK-64-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// TCHECK-64-NEXT: [[CMP8:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// TCHECK-64-NEXT: br i1 [[CMP8]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// TCHECK-64: omp.inner.for.body:
+// TCHECK-64-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// TCHECK-64-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// TCHECK-64-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// TCHECK-64-NEXT: store i32 [[ADD]], i32* [[I6]], align 4
+// TCHECK-64-NEXT: [[TMP17:%.*]] = load i32, i32* [[CONV]], align 4
+// TCHECK-64-NEXT: [[CONV9:%.*]] = sext i32 [[TMP17]] to i64
+// TCHECK-64-NEXT: [[TMP18:%.*]] = load i64*, i64** [[PLOCAL_ADDR]], align 8
+// TCHECK-64-NEXT: store i64 [[CONV9]], i64* [[TMP18]], align 8
+// TCHECK-64-NEXT: [[TMP19:%.*]] = load i32, i32* [[CONV]], align 4
+// TCHECK-64-NEXT: store i32 [[TMP19]], i32* @_ZZ3fooiE6local1, align 4
+// TCHECK-64-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// TCHECK-64: omp.body.continue:
+// TCHECK-64-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// TCHECK-64: omp.inner.for.inc:
+// TCHECK-64-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// TCHECK-64-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP20]], 1
+// TCHECK-64-NEXT: store i32 [[ADD10]], i32* [[DOTOMP_IV]], align 4
+// TCHECK-64-NEXT: br label [[OMP_INNER_FOR_COND]]
+// TCHECK-64: omp.inner.for.end:
+// TCHECK-64-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// TCHECK-64: omp.loop.exit:
+// TCHECK-64-NEXT: [[TMP21:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// TCHECK-64-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4
+// TCHECK-64-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP22]])
+// TCHECK-64-NEXT: br label [[OMP_PRECOND_END]]
+// TCHECK-64: omp.precond.end:
+// TCHECK-64-NEXT: ret void
+// TCHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l66
+// TCHECK-32-SAME: () #[[ATTR0:[0-9]+]] {
+// TCHECK-32-NEXT: entry:
+// TCHECK-32-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @[[GLOB3:[0-9]+]], i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*))
+// TCHECK-32-NEXT: ret void
+// TCHECK-32-LABEL: define {{[^@]+}}@.omp_outlined.
+// TCHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
+// TCHECK-32-NEXT: entry:
+// TCHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// TCHECK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// TCHECK-32-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[I:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// TCHECK-32-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// TCHECK-32-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// TCHECK-32-NEXT: store i32 9, i32* [[DOTOMP_COMB_UB]], align 4
+// TCHECK-32-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// TCHECK-32-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// TCHECK-32-NEXT: [[TMP0:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// TCHECK-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// TCHECK-32-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// TCHECK-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// TCHECK-32-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// TCHECK-32-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// TCHECK-32: cond.true:
+// TCHECK-32-NEXT: br label [[COND_END:%.*]]
+// TCHECK-32: cond.false:
+// TCHECK-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// TCHECK-32-NEXT: br label [[COND_END]]
+// TCHECK-32: cond.end:
+// TCHECK-32-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// TCHECK-32-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// TCHECK-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// TCHECK-32-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_IV]], align 4
+// TCHECK-32-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// TCHECK-32: omp.inner.for.cond:
+// TCHECK-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// TCHECK-32-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// TCHECK-32-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// TCHECK-32-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// TCHECK-32: omp.inner.for.body:
+// TCHECK-32-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// TCHECK-32-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// TCHECK-32-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB3]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32, i32)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i32 [[TMP7]], i32 [[TMP8]])
+// TCHECK-32-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// TCHECK-32: omp.inner.for.inc:
+// TCHECK-32-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// TCHECK-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// TCHECK-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// TCHECK-32-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// TCHECK-32-NEXT: br label [[OMP_INNER_FOR_COND]]
+// TCHECK-32: omp.inner.for.end:
+// TCHECK-32-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// TCHECK-32: omp.loop.exit:
+// TCHECK-32-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP1]])
+// TCHECK-32-NEXT: ret void
+// TCHECK-32-LABEL: define {{[^@]+}}@.omp_outlined..1
+// TCHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR0]] {
+// TCHECK-32-NEXT: entry:
+// TCHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// TCHECK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// TCHECK-32-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[I:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// TCHECK-32-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// TCHECK-32-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// TCHECK-32-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// TCHECK-32-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// TCHECK-32-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
+// TCHECK-32-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// TCHECK-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// TCHECK-32-NEXT: store i32 [[TMP0]], i32* [[DOTOMP_LB]], align 4
+// TCHECK-32-NEXT: store i32 [[TMP1]], i32* [[DOTOMP_UB]], align 4
+// TCHECK-32-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// TCHECK-32-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// TCHECK-32-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// TCHECK-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
+// TCHECK-32-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// TCHECK-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// TCHECK-32-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
+// TCHECK-32-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// TCHECK-32: cond.true:
+// TCHECK-32-NEXT: br label [[COND_END:%.*]]
+// TCHECK-32: cond.false:
+// TCHECK-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// TCHECK-32-NEXT: br label [[COND_END]]
+// TCHECK-32: cond.end:
+// TCHECK-32-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// TCHECK-32-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// TCHECK-32-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// TCHECK-32-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4
+// TCHECK-32-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// TCHECK-32: omp.inner.for.cond:
+// TCHECK-32-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// TCHECK-32-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// TCHECK-32-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// TCHECK-32-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// TCHECK-32: omp.inner.for.body:
+// TCHECK-32-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// TCHECK-32-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// TCHECK-32-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// TCHECK-32-NEXT: store i32 [[ADD]], i32* [[I]], align 4
+// TCHECK-32-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// TCHECK-32: omp.body.continue:
+// TCHECK-32-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// TCHECK-32: omp.inner.for.inc:
+// TCHECK-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// TCHECK-32-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP10]], 1
+// TCHECK-32-NEXT: store i32 [[ADD2]], i32* [[DOTOMP_IV]], align 4
+// TCHECK-32-NEXT: br label [[OMP_INNER_FOR_COND]]
+// TCHECK-32: omp.inner.for.end:
+// TCHECK-32-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// TCHECK-32: omp.loop.exit:
+// TCHECK-32-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]])
+// TCHECK-32-NEXT: ret void
+// TCHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l76
+// TCHECK-32-SAME: (i32* noundef [[PLOCAL:%.*]], i32 noundef [[GLOBAL:%.*]]) #[[ATTR0]] {
+// TCHECK-32-NEXT: entry:
+// TCHECK-32-NEXT: [[PLOCAL_ADDR:%.*]] = alloca i32*, align 4
+// TCHECK-32-NEXT: [[GLOBAL_ADDR:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[GLOBAL_CASTED:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: store i32* [[PLOCAL]], i32** [[PLOCAL_ADDR]], align 4
+// TCHECK-32-NEXT: store i32 [[GLOBAL]], i32* [[GLOBAL_ADDR]], align 4
+// TCHECK-32-NEXT: [[TMP0:%.*]] = load i32*, i32** [[PLOCAL_ADDR]], align 4
+// TCHECK-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[GLOBAL_ADDR]], align 4
+// TCHECK-32-NEXT: store i32 [[TMP1]], i32* [[GLOBAL_CASTED]], align 4
+// TCHECK-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[GLOBAL_CASTED]], align 4
+// TCHECK-32-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @[[GLOB3]], i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i32* [[TMP0]], i32 [[TMP2]])
+// TCHECK-32-NEXT: ret void
+// TCHECK-32-LABEL: define {{[^@]+}}@.omp_outlined..2
+// TCHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32* noundef [[PLOCAL:%.*]], i32 noundef [[GLOBAL:%.*]]) #[[ATTR0]] {
+// TCHECK-32-NEXT: entry:
+// TCHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// TCHECK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// TCHECK-32-NEXT: [[PLOCAL_ADDR:%.*]] = alloca i32*, align 4
+// TCHECK-32-NEXT: [[GLOBAL_ADDR:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[I:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[I3:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[GLOBAL_CASTED:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// TCHECK-32-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// TCHECK-32-NEXT: store i32* [[PLOCAL]], i32** [[PLOCAL_ADDR]], align 4
+// TCHECK-32-NEXT: store i32 [[GLOBAL]], i32* [[GLOBAL_ADDR]], align 4
+// TCHECK-32-NEXT: [[TMP0:%.*]] = load i32*, i32** [[PLOCAL_ADDR]], align 4
+// TCHECK-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// TCHECK-32-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// TCHECK-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// TCHECK-32-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// TCHECK-32-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// TCHECK-32-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// TCHECK-32-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// TCHECK-32-NEXT: store i32 0, i32* [[I]], align 4
+// TCHECK-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// TCHECK-32-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// TCHECK-32-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// TCHECK-32: omp.precond.then:
+// TCHECK-32-NEXT: store i32 0, i32* [[DOTOMP_COMB_LB]], align 4
+// TCHECK-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// TCHECK-32-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_COMB_UB]], align 4
+// TCHECK-32-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// TCHECK-32-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// TCHECK-32-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// TCHECK-32-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
+// TCHECK-32-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 92, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_COMB_LB]], i32* [[DOTOMP_COMB_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// TCHECK-32-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// TCHECK-32-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// TCHECK-32-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP7]], [[TMP8]]
+// TCHECK-32-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// TCHECK-32: cond.true:
+// TCHECK-32-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// TCHECK-32-NEXT: br label [[COND_END:%.*]]
+// TCHECK-32: cond.false:
+// TCHECK-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// TCHECK-32-NEXT: br label [[COND_END]]
+// TCHECK-32: cond.end:
+// TCHECK-32-NEXT: [[COND:%.*]] = phi i32 [ [[TMP9]], [[COND_TRUE]] ], [ [[TMP10]], [[COND_FALSE]] ]
+// TCHECK-32-NEXT: store i32 [[COND]], i32* [[DOTOMP_COMB_UB]], align 4
+// TCHECK-32-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// TCHECK-32-NEXT: store i32 [[TMP11]], i32* [[DOTOMP_IV]], align 4
+// TCHECK-32-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// TCHECK-32: omp.inner.for.cond:
+// TCHECK-32-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// TCHECK-32-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// TCHECK-32-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP12]], [[TMP13]]
+// TCHECK-32-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// TCHECK-32: omp.inner.for.body:
+// TCHECK-32-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_COMB_LB]], align 4
+// TCHECK-32-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_COMB_UB]], align 4
+// TCHECK-32-NEXT: [[TMP16:%.*]] = load i32*, i32** [[PLOCAL_ADDR]], align 4
+// TCHECK-32-NEXT: [[TMP17:%.*]] = load i32, i32* [[GLOBAL_ADDR]], align 4
+// TCHECK-32-NEXT: store i32 [[TMP17]], i32* [[GLOBAL_CASTED]], align 4
+// TCHECK-32-NEXT: [[TMP18:%.*]] = load i32, i32* [[GLOBAL_CASTED]], align 4
+// TCHECK-32-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @[[GLOB3]], i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32, i32, i32*, i32)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32 [[TMP14]], i32 [[TMP15]], i32* [[TMP16]], i32 [[TMP18]])
+// TCHECK-32-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// TCHECK-32: omp.inner.for.inc:
+// TCHECK-32-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// TCHECK-32-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
+// TCHECK-32-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// TCHECK-32-NEXT: store i32 [[ADD]], i32* [[DOTOMP_IV]], align 4
+// TCHECK-32-NEXT: br label [[OMP_INNER_FOR_COND]]
+// TCHECK-32: omp.inner.for.end:
+// TCHECK-32-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// TCHECK-32: omp.loop.exit:
+// TCHECK-32-NEXT: [[TMP21:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// TCHECK-32-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4
+// TCHECK-32-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP22]])
+// TCHECK-32-NEXT: br label [[OMP_PRECOND_END]]
+// TCHECK-32: omp.precond.end:
+// TCHECK-32-NEXT: ret void
+// TCHECK-32-LABEL: define {{[^@]+}}@.omp_outlined..3
+// TCHECK-32-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32* noundef [[PLOCAL:%.*]], i32 noundef [[GLOBAL:%.*]]) #[[ATTR0]] {
+// TCHECK-32-NEXT: entry:
+// TCHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
+// TCHECK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
+// TCHECK-32-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[PLOCAL_ADDR:%.*]] = alloca i32*, align 4
+// TCHECK-32-NEXT: [[GLOBAL_ADDR:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[I:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: [[I3:%.*]] = alloca i32, align 4
+// TCHECK-32-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// TCHECK-32-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
+// TCHECK-32-NEXT: store i32 [[DOTPREVIOUS_LB_]], i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// TCHECK-32-NEXT: store i32 [[DOTPREVIOUS_UB_]], i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// TCHECK-32-NEXT: store i32* [[PLOCAL]], i32** [[PLOCAL_ADDR]], align 4
+// TCHECK-32-NEXT: store i32 [[GLOBAL]], i32* [[GLOBAL_ADDR]], align 4
+// TCHECK-32-NEXT: [[TMP0:%.*]] = load i32*, i32** [[PLOCAL_ADDR]], align 4
+// TCHECK-32-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// TCHECK-32-NEXT: store i32 [[TMP1]], i32* [[DOTCAPTURE_EXPR_]], align 4
+// TCHECK-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// TCHECK-32-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP2]], 0
+// TCHECK-32-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// TCHECK-32-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// TCHECK-32-NEXT: store i32 [[SUB2]], i32* [[DOTCAPTURE_EXPR_1]], align 4
+// TCHECK-32-NEXT: store i32 0, i32* [[I]], align 4
+// TCHECK-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_]], align 4
+// TCHECK-32-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP3]]
+// TCHECK-32-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// TCHECK-32: omp.precond.then:
+// TCHECK-32-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
+// TCHECK-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// TCHECK-32-NEXT: store i32 [[TMP4]], i32* [[DOTOMP_UB]], align 4
+// TCHECK-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTPREVIOUS_LB__ADDR]], align 4
+// TCHECK-32-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTPREVIOUS_UB__ADDR]], align 4
+// TCHECK-32-NEXT: store i32 [[TMP5]], i32* [[DOTOMP_LB]], align 4
+// TCHECK-32-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_UB]], align 4
+// TCHECK-32-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
+// TCHECK-32-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
+// TCHECK-32-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// TCHECK-32-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
+// TCHECK-32-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB2]], i32 [[TMP8]], i32 34, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
+// TCHECK-32-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// TCHECK-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// TCHECK-32-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+// TCHECK-32-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// TCHECK-32: cond.true:
+// TCHECK-32-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR_1]], align 4
+// TCHECK-32-NEXT: br label [[COND_END:%.*]]
+// TCHECK-32: cond.false:
+// TCHECK-32-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// TCHECK-32-NEXT: br label [[COND_END]]
+// TCHECK-32: cond.end:
+// TCHECK-32-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// TCHECK-32-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
+// TCHECK-32-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
+// TCHECK-32-NEXT: store i32 [[TMP13]], i32* [[DOTOMP_IV]], align 4
+// TCHECK-32-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// TCHECK-32: omp.inner.for.cond:
+// TCHECK-32-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// TCHECK-32-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
+// TCHECK-32-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// TCHECK-32-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// TCHECK-32: omp.inner.for.body:
+// TCHECK-32-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// TCHECK-32-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP16]], 1
+// TCHECK-32-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// TCHECK-32-NEXT: store i32 [[ADD]], i32* [[I3]], align 4
+// TCHECK-32-NEXT: [[TMP17:%.*]] = load i32, i32* [[GLOBAL_ADDR]], align 4
+// TCHECK-32-NEXT: [[TMP18:%.*]] = load i32*, i32** [[PLOCAL_ADDR]], align 4
+// TCHECK-32-NEXT: store i32 [[TMP17]], i32* [[TMP18]], align 4
+// TCHECK-32-NEXT: [[TMP19:%.*]] = load i32, i32* [[GLOBAL_ADDR]], align 4
+// TCHECK-32-NEXT: store i32 [[TMP19]], i32* @_ZZ3fooiE6local1, align 4
+// TCHECK-32-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// TCHECK-32: omp.body.continue:
+// TCHECK-32-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// TCHECK-32: omp.inner.for.inc:
+// TCHECK-32-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
+// TCHECK-32-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP20]], 1
+// TCHECK-32-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_IV]], align 4
+// TCHECK-32-NEXT: br label [[OMP_INNER_FOR_COND]]
+// TCHECK-32: omp.inner.for.end:
+// TCHECK-32-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// TCHECK-32: omp.loop.exit:
+// TCHECK-32-NEXT: [[TMP21:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// TCHECK-32-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4
+// TCHECK-32-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB2]], i32 [[TMP22]])
+// TCHECK-32-NEXT: br label [[OMP_PRECOND_END]]
+// TCHECK-32: omp.precond.end:
+// TCHECK-32-NEXT: ret void
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l66
+// CHECK-SAME: () #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3:[0-9]+]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l66.omp_outlined)
+// CHECK-NEXT: ret void
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK-SAME: () #[[ATTR7:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK-NEXT: ret void
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+// TCHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l66
+// TCHECK-SAME: () #[[ATTR0:[0-9]+]] {
+// TCHECK-NEXT: entry:
+// TCHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3:[0-9]+]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l66.omp_outlined)
+// TCHECK-NEXT: ret void
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//
+//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+// SIMD-ONLY0: {{.*}}
+// SIMD-ONLY1: {{.*}}
diff --git a/clang/test/OpenMP/target_teams_generic_loop_if_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_if_codegen.cpp
new file mode 100644
index 00000000000000..f00b324b9148d1
--- /dev/null
+++ b/clang/test/OpenMP/target_teams_generic_loop_if_codegen.cpp
@@ -0,0 +1,1576 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK1
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK1
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple aarch64-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple aarch64-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple aarch64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK1
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple aarch64-unknown-unknown -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple aarch64-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple aarch64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple aarch64-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple aarch64-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple aarch64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK1
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple aarch64-unknown-unknown -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple aarch64-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple aarch64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+void fn1();
+void fn2();
+void fn3();
+void fn4();
+void fn5();
+void fn6();
+
+int Arg;
+
+void gtid_test() {
+#pragma omp target teams loop
+ for(int i = 0 ; i < 100; i++) {}
+
+#pragma omp target teams loop if (parallel: false)
+ for(int i = 0 ; i < 100; i++) {
+ gtid_test();
+ }
+}
+
+
+template <typename T>
+int tmain(T Arg) {
+#pragma omp target teams loop if (true)
+ for(int i = 0 ; i < 100; i++) {
+ fn1();
+ }
+#pragma omp target teams loop if (false)
+ for(int i = 0 ; i < 100; i++) {
+ fn2();
+ }
+#pragma omp target teams loop if (parallel: Arg)
+ for(int i = 0 ; i < 100; i++) {
+ fn3();
+ }
+ return 0;
+}
+
+int main() {
+#pragma omp target teams loop if (true)
+ for(int i = 0 ; i < 100; i++) {
+
+
+ fn4();
+ }
+
+#pragma omp target teams loop if (false)
+ for(int i = 0 ; i < 100; i++) {
+
+
+ fn5();
+ }
+
+#pragma omp target teams loop if (Arg)
+ for(int i = 0 ; i < 100; i++) {
+
+
+ fn6();
+ }
+
+ return tmain(Arg);
+}
+
+
+
+
+
+
+// call void [[T_OUTLINE_FUN_3:@.+]](
+
+#endif
+// CHECK1-LABEL: define {{[^@]+}}@_Z9gtid_testv
+// CHECK1-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[KERNEL_ARGS2:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK1-NEXT: store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK1-NEXT: store i32 0, ptr [[TMP1]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK1-NEXT: store ptr null, ptr [[TMP2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK1-NEXT: store ptr null, ptr [[TMP3]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK1-NEXT: store ptr null, ptr [[TMP4]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK1-NEXT: store ptr null, ptr [[TMP5]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP7]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK1-NEXT: store i64 100, ptr [[TMP8]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK1-NEXT: store i64 0, ptr [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP10]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP11]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK1-NEXT: store i32 0, ptr [[TMP12]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l48.region_id, ptr [[KERNEL_ARGS]])
+// CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+// CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK1: omp_offload.failed:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l48() #[[ATTR2:[0-9]+]]
+// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK1: omp_offload.cont:
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 0
+// CHECK1-NEXT: store i32 2, ptr [[TMP15]], align 4
+// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 1
+// CHECK1-NEXT: store i32 0, ptr [[TMP16]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 2
+// CHECK1-NEXT: store ptr null, ptr [[TMP17]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 3
+// CHECK1-NEXT: store ptr null, ptr [[TMP18]], align 8
+// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 4
+// CHECK1-NEXT: store ptr null, ptr [[TMP19]], align 8
+// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 5
+// CHECK1-NEXT: store ptr null, ptr [[TMP20]], align 8
+// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP21]], align 8
+// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP22]], align 8
+// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 8
+// CHECK1-NEXT: store i64 100, ptr [[TMP23]], align 8
+// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 9
+// CHECK1-NEXT: store i64 0, ptr [[TMP24]], align 8
+// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP25]], align 4
+// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] [i32 1, i32 0, i32 0], ptr [[TMP26]], align 4
+// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS2]], i32 0, i32 12
+// CHECK1-NEXT: store i32 0, ptr [[TMP27]], align 4
+// CHECK1-NEXT: [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 1, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l51.region_id, ptr [[KERNEL_ARGS2]])
+// CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
+// CHECK1-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED3:%.*]], label [[OMP_OFFLOAD_CONT4:%.*]]
+// CHECK1: omp_offload.failed3:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l51() #[[ATTR2]]
+// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT4]]
+// CHECK1: omp_offload.cont4:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l48
+// CHECK1-SAME: () #[[ATTR1:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l48.omp_outlined)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l48.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 99
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l48.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]])
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP1]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l48.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK1-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP3]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l51
+// CHECK1-SAME: () #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l51.omp_outlined)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l51.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTBOUND_ZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 99
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK1-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]])
+// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l51.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]]
+// CHECK1-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]])
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP13]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP1]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l51.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK1-NEXT: call void @_Z9gtid_testv()
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK1-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP3]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@main
+// CHECK1-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i8, align 1
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[_TMP5:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK1-NEXT: store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK1-NEXT: store i32 0, ptr [[TMP1]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK1-NEXT: store ptr null, ptr [[TMP2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK1-NEXT: store ptr null, ptr [[TMP3]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK1-NEXT: store ptr null, ptr [[TMP4]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK1-NEXT: store ptr null, ptr [[TMP5]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP7]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK1-NEXT: store i64 100, ptr [[TMP8]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK1-NEXT: store i64 0, ptr [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP10]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP11]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK1-NEXT: store i32 0, ptr [[TMP12]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l76.region_id, ptr [[KERNEL_ARGS]])
+// CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+// CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK1: omp_offload.failed:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l76() #[[ATTR2]]
+// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK1: omp_offload.cont:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83() #[[ATTR2]]
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr @Arg, align 4
+// CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP15]], 0
+// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8
+// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1
+// CHECK1-NEXT: [[TMP16:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1
+// CHECK1-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP16]] to i1
+// CHECK1-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8
+// CHECK1-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1
+// CHECK1-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1
+// CHECK1-NEXT: [[TOBOOL3:%.*]] = trunc i8 [[TMP18]] to i1
+// CHECK1-NEXT: br i1 [[TOBOOL3]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]]
+// CHECK1: omp_if.then:
+// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT: store i64 [[TMP17]], ptr [[TMP19]], align 8
+// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT: store i64 [[TMP17]], ptr [[TMP20]], align 8
+// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK1-NEXT: store ptr null, ptr [[TMP21]], align 8
+// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP24:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1
+// CHECK1-NEXT: [[TOBOOL4:%.*]] = trunc i8 [[TMP24]] to i1
+// CHECK1-NEXT: [[TMP25:%.*]] = select i1 [[TOBOOL4]], i32 0, i32 1
+// CHECK1-NEXT: [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP25]], 0
+// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
+// CHECK1-NEXT: store i32 2, ptr [[TMP27]], align 4
+// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
+// CHECK1-NEXT: store i32 1, ptr [[TMP28]], align 4
+// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
+// CHECK1-NEXT: store ptr [[TMP22]], ptr [[TMP29]], align 8
+// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3
+// CHECK1-NEXT: store ptr [[TMP23]], ptr [[TMP30]], align 8
+// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4
+// CHECK1-NEXT: store ptr @.offload_sizes, ptr [[TMP31]], align 8
+// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5
+// CHECK1-NEXT: store ptr @.offload_maptypes, ptr [[TMP32]], align 8
+// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP33]], align 8
+// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP34]], align 8
+// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8
+// CHECK1-NEXT: store i64 100, ptr [[TMP35]], align 8
+// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9
+// CHECK1-NEXT: store i64 0, ptr [[TMP36]], align 8
+// CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP37]], align 4
+// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] [[TMP26]], ptr [[TMP38]], align 4
+// CHECK1-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12
+// CHECK1-NEXT: store i32 0, ptr [[TMP39]], align 4
+// CHECK1-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP25]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l90.region_id, ptr [[KERNEL_ARGS6]])
+// CHECK1-NEXT: [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0
+// CHECK1-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]]
+// CHECK1: omp_offload.failed7:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l90(i64 [[TMP17]]) #[[ATTR2]]
+// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT8]]
+// CHECK1: omp_offload.cont8:
+// CHECK1-NEXT: br label [[OMP_IF_END:%.*]]
+// CHECK1: omp_if.else:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l90(i64 [[TMP17]]) #[[ATTR2]]
+// CHECK1-NEXT: br label [[OMP_IF_END]]
+// CHECK1: omp_if.end:
+// CHECK1-NEXT: [[TMP42:%.*]] = load i32, ptr @Arg, align 4
+// CHECK1-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiEiT_(i32 noundef [[TMP42]])
+// CHECK1-NEXT: ret i32 [[CALL]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l76
+// CHECK1-SAME: () #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l76.omp_outlined)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l76.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 99
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l76.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]])
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP1]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l76.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK1-NEXT: call void @_Z3fn4v()
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK1-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP3]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83
+// CHECK1-SAME: () #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83.omp_outlined)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTBOUND_ZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 99
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK1-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]])
+// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]]
+// CHECK1-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]])
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP13]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP1]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK1-NEXT: call void @_Z3fn5v()
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK1-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP3]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l90
+// CHECK1-SAME: (i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1
+// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1
+// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8
+// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l90.omp_outlined, i64 [[TMP1]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l90.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTBOUND_ZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 99
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK1-NEXT: [[TMP11:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1
+// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP11]] to i1
+// CHECK1-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]]
+// CHECK1: omp_if.then:
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l90.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]])
+// CHECK1-NEXT: br label [[OMP_IF_END:%.*]]
+// CHECK1: omp_if.else:
+// CHECK1-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]])
+// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l90.omp_outlined.omp_outlined(ptr [[TMP12]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]]
+// CHECK1-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]])
+// CHECK1-NEXT: br label [[OMP_IF_END]]
+// CHECK1: omp_if.end:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP1]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l90.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK1-NEXT: call void @_Z3fn6v()
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK1-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP3]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiEiT_
+// CHECK1-SAME: (i32 noundef [[ARG:%.*]]) #[[ATTR0]] comdat {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[ARG_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i8, align 1
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[_TMP4:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
+// CHECK1-NEXT: store i32 [[ARG]], ptr [[ARG_ADDR]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK1-NEXT: store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK1-NEXT: store i32 0, ptr [[TMP1]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK1-NEXT: store ptr null, ptr [[TMP2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK1-NEXT: store ptr null, ptr [[TMP3]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK1-NEXT: store ptr null, ptr [[TMP4]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK1-NEXT: store ptr null, ptr [[TMP5]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP7]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK1-NEXT: store i64 100, ptr [[TMP8]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK1-NEXT: store i64 0, ptr [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP10]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP11]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK1-NEXT: store i32 0, ptr [[TMP12]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l60.region_id, ptr [[KERNEL_ARGS]])
+// CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+// CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK1: omp_offload.failed:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l60() #[[ATTR2]]
+// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK1: omp_offload.cont:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l64() #[[ATTR2]]
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARG_ADDR]], align 4
+// CHECK1-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP15]], 0
+// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8
+// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR_]], align 1
+// CHECK1-NEXT: [[TMP16:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1
+// CHECK1-NEXT: [[TOBOOL1:%.*]] = trunc i8 [[TMP16]] to i1
+// CHECK1-NEXT: [[FROMBOOL2:%.*]] = zext i1 [[TOBOOL1]] to i8
+// CHECK1-NEXT: store i8 [[FROMBOOL2]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1
+// CHECK1-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT: store i64 [[TMP17]], ptr [[TMP18]], align 8
+// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT: store i64 [[TMP17]], ptr [[TMP19]], align 8
+// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK1-NEXT: store ptr null, ptr [[TMP20]], align 8
+// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP23:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR_]], align 1
+// CHECK1-NEXT: [[TOBOOL3:%.*]] = trunc i8 [[TMP23]] to i1
+// CHECK1-NEXT: [[TMP24:%.*]] = select i1 [[TOBOOL3]], i32 0, i32 1
+// CHECK1-NEXT: [[TMP25:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP24]], 0
+// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0
+// CHECK1-NEXT: store i32 2, ptr [[TMP26]], align 4
+// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1
+// CHECK1-NEXT: store i32 1, ptr [[TMP27]], align 4
+// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2
+// CHECK1-NEXT: store ptr [[TMP21]], ptr [[TMP28]], align 8
+// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 3
+// CHECK1-NEXT: store ptr [[TMP22]], ptr [[TMP29]], align 8
+// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 4
+// CHECK1-NEXT: store ptr @.offload_sizes.1, ptr [[TMP30]], align 8
+// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 5
+// CHECK1-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP31]], align 8
+// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP32]], align 8
+// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP33]], align 8
+// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 8
+// CHECK1-NEXT: store i64 100, ptr [[TMP34]], align 8
+// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 9
+// CHECK1-NEXT: store i64 0, ptr [[TMP35]], align 8
+// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP36]], align 4
+// CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] [[TMP25]], ptr [[TMP37]], align 4
+// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 12
+// CHECK1-NEXT: store i32 0, ptr [[TMP38]], align 4
+// CHECK1-NEXT: [[TMP39:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 [[TMP24]], ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l68.region_id, ptr [[KERNEL_ARGS5]])
+// CHECK1-NEXT: [[TMP40:%.*]] = icmp ne i32 [[TMP39]], 0
+// CHECK1-NEXT: br i1 [[TMP40]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]]
+// CHECK1: omp_offload.failed6:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l68(i64 [[TMP17]]) #[[ATTR2]]
+// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT7]]
+// CHECK1: omp_offload.cont7:
+// CHECK1-NEXT: ret i32 0
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l60
+// CHECK1-SAME: () #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l60.omp_outlined)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l60.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 99
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l60.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]])
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP1]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l60.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK1-NEXT: call void @_Z3fn1v()
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK1-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP3]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l64
+// CHECK1-SAME: () #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l64.omp_outlined)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l64.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTBOUND_ZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 99
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK1-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]])
+// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l64.omp_outlined.omp_outlined(ptr [[TMP11]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]]
+// CHECK1-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]])
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP13]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP1]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l64.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK1-NEXT: call void @_Z3fn2v()
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK1-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP3]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l68
+// CHECK1-SAME: (i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1
+// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP0]] to i1
+// CHECK1-NEXT: [[FROMBOOL:%.*]] = zext i1 [[TOBOOL]] to i8
+// CHECK1-NEXT: store i8 [[FROMBOOL]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 1
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l68.omp_outlined, i64 [[TMP1]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l68.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTBOUND_ZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 99
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK1-NEXT: [[TMP11:%.*]] = load i8, ptr [[DOTCAPTURE_EXPR__ADDR]], align 1
+// CHECK1-NEXT: [[TOBOOL:%.*]] = trunc i8 [[TMP11]] to i1
+// CHECK1-NEXT: br i1 [[TOBOOL]], label [[OMP_IF_THEN:%.*]], label [[OMP_IF_ELSE:%.*]]
+// CHECK1: omp_if.then:
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l68.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]])
+// CHECK1-NEXT: br label [[OMP_IF_END:%.*]]
+// CHECK1: omp_if.else:
+// CHECK1-NEXT: call void @__kmpc_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]])
+// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTBOUND_ZERO_ADDR]], align 4
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l68.omp_outlined.omp_outlined(ptr [[TMP12]], ptr [[DOTBOUND_ZERO_ADDR]], i64 [[TMP8]], i64 [[TMP10]]) #[[ATTR2]]
+// CHECK1-NEXT: call void @__kmpc_end_serialized_parallel(ptr @[[GLOB3]], i32 [[TMP1]])
+// CHECK1-NEXT: br label [[OMP_IF_END]]
+// CHECK1: omp_if.end:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP14]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP1]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiEiT__l68.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK1-NEXT: call void @_Z3fn3v()
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK1-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP3]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK1-SAME: () #[[ATTR5:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK1-NEXT: ret void
+//
diff --git a/clang/test/OpenMP/target_teams_generic_loop_order_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_order_codegen.cpp
new file mode 100644
index 00000000000000..fb93d58b6bd17a
--- /dev/null
+++ b/clang/test/OpenMP/target_teams_generic_loop_order_codegen.cpp
@@ -0,0 +1,207 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK1
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// REQUIRES: powerpc-registered-target
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+void gtid_test() {
+#pragma omp target teams loop order(concurrent)
+ for(int i = 0 ; i < 100; i++) {}
+}
+
+
+
+
+#endif
+// CHECK1-LABEL: define {{[^@]+}}@_Z9gtid_testv
+// CHECK1-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK1-NEXT: store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK1-NEXT: store i32 0, ptr [[TMP1]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK1-NEXT: store ptr null, ptr [[TMP2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK1-NEXT: store ptr null, ptr [[TMP3]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK1-NEXT: store ptr null, ptr [[TMP4]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK1-NEXT: store ptr null, ptr [[TMP5]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP7]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK1-NEXT: store i64 100, ptr [[TMP8]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK1-NEXT: store i64 0, ptr [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP10]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP11]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK1-NEXT: store i32 0, ptr [[TMP12]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l16.region_id, ptr [[KERNEL_ARGS]])
+// CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+// CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK1: omp_offload.failed:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l16() #[[ATTR2:[0-9]+]]
+// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK1: omp_offload.cont:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l16
+// CHECK1-SAME: () #[[ATTR1:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l16.omp_outlined)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l16.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 99
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l16.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]])
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP1]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9gtid_testv_l16.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP4:![0-9]+]]
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP4]]
+// CHECK1-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP4]]
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP4]]
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP4]]
+// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK1-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP4]]
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP3]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK1-SAME: () #[[ATTR3:[0-9]+]] section ".text.startup" {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK1-NEXT: ret void
+//
diff --git a/clang/test/OpenMP/target_teams_generic_loop_private_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_private_codegen.cpp
new file mode 100644
index 00000000000000..58a0b5062281af
--- /dev/null
+++ b/clang/test/OpenMP/target_teams_generic_loop_private_codegen.cpp
@@ -0,0 +1,3124 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix=CHECK3
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix=CHECK3
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix=CHECK5
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix=CHECK5
+
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+
+// Test target codegen - host bc file has to be created first. (no significant differences with host version of target region)
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix=CHECK13
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix=CHECK13
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix=CHECK15
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix=CHECK15
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix=CHECK17
+
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+struct St {
+ int a, b;
+ St() : a(0), b(0) {}
+ St(const St &st) : a(st.a + st.b), b(0) {}
+ ~St() {}
+};
+
+volatile int g = 1212;
+volatile int &g1 = g;
+
+template <class T>
+struct S {
+ T f;
+ S(T a) : f(a + g) {}
+ S() : f(g) {}
+ S(const S &s, St t = St()) : f(s.f + t.a) {}
+ operator T() { return T(); }
+ ~S() {}
+};
+
+
+template <typename T>
+T tmain() {
+ S<T> test;
+ T t_var = T();
+ T vec[] = {1, 2};
+ S<T> s_arr[] = {1, 2};
+ S<T> &var = test;
+#pragma omp target teams loop private(t_var, vec, s_arr, var)
+ for (int i = 0; i < 2; ++i) {
+ vec[i] = t_var;
+ s_arr[i] = var;
+ }
+ return T();
+}
+
+// HCHECK-DAG: [[TEST:@.+]] ={{.*}} global [[S_FLOAT_TY]] zeroinitializer,
+S<float> test;
+// HCHECK-DAG: [[T_VAR:@.+]] ={{.+}} global i{{[0-9]+}} 333,
+int t_var = 333;
+// HCHECK-DAG: [[VEC:@.+]] ={{.+}} global [2 x i{{[0-9]+}}] [i{{[0-9]+}} 1, i{{[0-9]+}} 2],
+int vec[] = {1, 2};
+// HCHECK-DAG: [[S_ARR:@.+]] ={{.+}} global [2 x [[S_FLOAT_TY]]] zeroinitializer,
+S<float> s_arr[] = {1, 2};
+// HCHECK-DAG: [[VAR:@.+]] ={{.+}} global [[S_FLOAT_TY]] zeroinitializer,
+S<float> var(3);
+// HCHECK-DAG: [[SIVAR:@.+]] = internal global i{{[0-9]+}} 0,
+
+int main() {
+ static int sivar;
+#ifdef LAMBDA
+ [&]() {
+#pragma omp target teams loop private(g, g1, sivar)
+ for (int i = 0; i < 2; ++i) {
+
+ // Skip global, bound tid and loop vars
+
+ g = 1;
+ g1 = 1;
+ sivar = 2;
+
+ // Skip global, bound tid and loop vars
+ [&]() {
+ g = 2;
+ g1 = 2;
+ sivar = 4;
+
+ }();
+ }
+ }();
+ return 0;
+#else
+#pragma omp target teams loop private(t_var, vec, s_arr, var, sivar)
+ for (int i = 0; i < 2; ++i) {
+ vec[i] = t_var;
+ s_arr[i] = var;
+ sivar += i;
+ }
+ return tmain<int>();
+#endif
+}
+
+// HCHECK: define {{.*}}i{{[0-9]+}} @main()
+// HCHECK: call i32 @__tgt_target_teams_mapper(ptr @{{.+}}, i64 -1, ptr @{{[^,]+}}, i32 0, ptr null, ptr null, {{.+}} null, {{.+}} null, ptr null, ptr null, i32 0, i32 0)
+// HCHECK: call void @[[OFFL1:.+]]()
+// HCHECK: {{%.+}} = call{{.*}} i32 @[[TMAIN_INT:.+]]()
+// HCHECK: ret
+
+// HCHECK: define{{.*}} void @[[OFFL1]]()
+
+// Skip global, bound tid and loop vars
+
+// private(s_arr)
+
+// private(var)
+
+
+// Skip global, bound tid and loop vars
+
+// private(s_arr)
+
+// private(var)
+
+
+// HCHECK: define{{.*}} i{{[0-9]+}} @[[TMAIN_INT]]()
+// HCHECK: call i32 @__tgt_target_teams_mapper(ptr @{{.+}}, i64 -1, ptr @{{[^,]+}}, i32 0,
+// HCHECK: call void @[[TOFFL1:.+]]()
+// HCHECK: ret
+
+// HCHECK: define {{.*}}void @[[TOFFL1]]()
+
+// Skip global, bound tid and loop vars
+
+// private(s_arr)
+
+
+// private(var)
+
+
+// Skip global, bound tid and loop vars
+// prev lb and ub
+// iter variables
+
+// private(s_arr)
+
+
+// private(var)
+
+
+
+#endif
+// CHECK1-LABEL: define {{[^@]+}}@__cxx_global_var_init
+// CHECK1-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) @test)
+// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN1SIfED1Ev, ptr @test, ptr @__dso_handle) #[[ATTR2:[0-9]+]]
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SIfEC1Ev
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: call void @_ZN1SIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SIfED1Ev
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR2]]
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SIfEC2Ev
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @g, align 4
+// CHECK1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
+// CHECK1-NEXT: store float [[CONV]], ptr [[F]], align 4
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SIfED2Ev
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__cxx_global_var_init.1
+// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) @s_arr, float noundef 1.000000e+00)
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) getelementptr inbounds ([[STRUCT_S:%.*]], ptr @s_arr, i64 1), float noundef 2.000000e+00)
+// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @__cxx_global_array_dtor, ptr null, ptr @__dso_handle) #[[ATTR2]]
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SIfEC1Ef
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], float noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca float, align 4
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: store float [[A]], ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: call void @_ZN1SIfEC2Ef(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]], float noundef [[TMP0]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__cxx_global_array_dtor
+// CHECK1-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK1: arraydestroy.body:
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ getelementptr inbounds ([[STRUCT_S:%.*]], ptr @s_arr, i64 2), [[ENTRY:%.*]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1
+// CHECK1-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK1-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], @s_arr
+// CHECK1-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE1:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK1: arraydestroy.done1:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SIfEC2Ef
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], float noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca float, align 4
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: store float [[A]], ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load volatile i32, ptr @g, align 4
+// CHECK1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to float
+// CHECK1-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[CONV]]
+// CHECK1-NEXT: store float [[ADD]], ptr [[F]], align 4
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__cxx_global_var_init.2
+// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) @var, float noundef 3.000000e+00)
+// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN1SIfED1Ev, ptr @var, ptr @__dso_handle) #[[ATTR2]]
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@main
+// CHECK1-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK1-NEXT: store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK1-NEXT: store i32 0, ptr [[TMP1]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK1-NEXT: store ptr null, ptr [[TMP2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK1-NEXT: store ptr null, ptr [[TMP3]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK1-NEXT: store ptr null, ptr [[TMP4]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK1-NEXT: store ptr null, ptr [[TMP5]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP7]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK1-NEXT: store i64 2, ptr [[TMP8]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK1-NEXT: store i64 0, ptr [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP10]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP11]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK1-NEXT: store i32 0, ptr [[TMP12]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124.region_id, ptr [[KERNEL_ARGS]])
+// CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+// CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK1: omp_offload.failed:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124() #[[ATTR2]]
+// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK1: omp_offload.cont:
+// CHECK1-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z5tmainIiET_v()
+// CHECK1-NEXT: ret i32 [[CALL]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124
+// CHECK1-SAME: () #[[ATTR4:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124.omp_outlined)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK1-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S], align 4
+// CHECK1-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S:%.*]], align 4
+// CHECK1-NEXT: [[SIVAR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
+// CHECK1-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i64 2
+// CHECK1-NEXT: br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK1: arrayctor.loop:
+// CHECK1-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]])
+// CHECK1-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYCTOR_CUR]], i64 1
+// CHECK1-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK1-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK1: arrayctor.cont:
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]])
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 1
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// CHECK1: omp.inner.for.cond.cleanup:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]])
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP14]])
+// CHECK1-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR2]]
+// CHECK1-NEXT: [[ARRAY_BEGIN2:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN2]], i64 2
+// CHECK1-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK1: arraydestroy.body:
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP15]], [[OMP_LOOP_EXIT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1
+// CHECK1-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK1-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN2]]
+// CHECK1-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE3:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK1: arraydestroy.done3:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK1-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S], align 4
+// CHECK1-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S:%.*]], align 4
+// CHECK1-NEXT: [[SIVAR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
+// CHECK1-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i64 2
+// CHECK1-NEXT: br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK1: arrayctor.loop:
+// CHECK1-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]])
+// CHECK1-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYCTOR_CUR]], i64 1
+// CHECK1-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK1-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK1: arrayctor.cont:
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]])
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 1
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// CHECK1: omp.inner.for.cond.cleanup:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[T_VAR]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP11]] to i64
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC]], i64 0, i64 [[IDXPROM]]
+// CHECK1-NEXT: store i32 [[TMP10]], ptr [[ARRAYIDX]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[IDXPROM3:%.*]] = sext i32 [[TMP12]] to i64
+// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 [[IDXPROM3]]
+// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ARRAYIDX4]], ptr align 4 [[VAR]], i64 4, i1 false)
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[SIVAR]], align 4
+// CHECK1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP14]], [[TMP13]]
+// CHECK1-NEXT: store i32 [[ADD5]], ptr [[SIVAR]], align 4
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK1-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP17]])
+// CHECK1-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR2]]
+// CHECK1-NEXT: [[ARRAY_BEGIN7:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN7]], i64 2
+// CHECK1-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK1: arraydestroy.body:
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP18]], [[OMP_LOOP_EXIT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1
+// CHECK1-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK1-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN7]]
+// CHECK1-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE8:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK1: arraydestroy.done8:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v
+// CHECK1-SAME: () #[[ATTR6:[0-9]+]] comdat {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4
+// CHECK1-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK1-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 4
+// CHECK1-NEXT: [[VAR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
+// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
+// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
+// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
+// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
+// CHECK1-NEXT: store ptr undef, ptr [[_TMP1]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK1-NEXT: store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK1-NEXT: store i32 0, ptr [[TMP1]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK1-NEXT: store ptr null, ptr [[TMP2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK1-NEXT: store ptr null, ptr [[TMP3]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK1-NEXT: store ptr null, ptr [[TMP4]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK1-NEXT: store ptr null, ptr [[TMP5]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP7]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK1-NEXT: store i64 2, ptr [[TMP8]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK1-NEXT: store i64 0, ptr [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP10]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP11]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK1-NEXT: store i32 0, ptr [[TMP12]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l80.region_id, ptr [[KERNEL_ARGS]])
+// CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+// CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK1: omp_offload.failed:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l80() #[[ATTR2]]
+// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK1: omp_offload.cont:
+// CHECK1-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i64 2
+// CHECK1-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK1: arraydestroy.body:
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP15]], [[OMP_OFFLOAD_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1
+// CHECK1-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK1-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]]
+// CHECK1-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK1: arraydestroy.done2:
+// CHECK1-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR2]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[RETVAL]], align 4
+// CHECK1-NEXT: ret i32 [[TMP16]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SIiEC1Ev
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: call void @_ZN1SIiEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SIiEC1Ei
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef signext [[A:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: call void @_ZN1SIiEC2Ei(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef signext [[TMP0]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l80
+// CHECK1-SAME: () #[[ATTR4]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l80.omp_outlined)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l80.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK1-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 4
+// CHECK1-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4
+// CHECK1-NEXT: [[_TMP2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr undef, ptr [[_TMP1]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK1-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i64 2
+// CHECK1-NEXT: br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK1: arrayctor.loop:
+// CHECK1-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]])
+// CHECK1-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYCTOR_CUR]], i64 1
+// CHECK1-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK1-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK1: arrayctor.cont:
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]])
+// CHECK1-NEXT: store ptr [[VAR]], ptr [[_TMP2]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 1
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK1-NEXT: br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// CHECK1: omp.inner.for.cond.cleanup:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l80.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]])
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP14]])
+// CHECK1-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR2]]
+// CHECK1-NEXT: [[ARRAY_BEGIN4:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN4]], i64 2
+// CHECK1-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK1: arraydestroy.body:
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP15]], [[OMP_LOOP_EXIT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1
+// CHECK1-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK1-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN4]]
+// CHECK1-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE5:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK1: arraydestroy.done5:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l80.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK1-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 4
+// CHECK1-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4
+// CHECK1-NEXT: [[_TMP3:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store ptr undef, ptr [[_TMP1]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK1-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i64 2
+// CHECK1-NEXT: br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK1: arrayctor.loop:
+// CHECK1-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]])
+// CHECK1-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYCTOR_CUR]], i64 1
+// CHECK1-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK1-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK1: arrayctor.cont:
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]])
+// CHECK1-NEXT: store ptr [[VAR]], ptr [[_TMP3]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 1
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP4:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK1-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// CHECK1: omp.inner.for.cond.cleanup:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[T_VAR]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP11]] to i64
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC]], i64 0, i64 [[IDXPROM]]
+// CHECK1-NEXT: store i32 [[TMP10]], ptr [[ARRAYIDX]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[_TMP3]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[IDXPROM5:%.*]] = sext i32 [[TMP13]] to i64
+// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 [[IDXPROM5]]
+// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ARRAYIDX6]], ptr align 4 [[TMP12]], i64 4, i1 false)
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP14]], 1
+// CHECK1-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP16]])
+// CHECK1-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR2]]
+// CHECK1-NEXT: [[ARRAY_BEGIN8:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN8]], i64 2
+// CHECK1-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK1: arraydestroy.body:
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP17]], [[OMP_LOOP_EXIT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1
+// CHECK1-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK1-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN8]]
+// CHECK1-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE9:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK1: arraydestroy.done9:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SIiED1Ev
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR2]]
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SIiEC2Ev
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @g, align 4
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[F]], align 4
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SIiEC2Ei
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef signext [[A:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load volatile i32, ptr @g, align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[F]], align 4
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SIiED2Ev
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_target_teams_generic_loop_private_codegen.cpp
+// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void @__cxx_global_var_init()
+// CHECK1-NEXT: call void @__cxx_global_var_init.1()
+// CHECK1-NEXT: call void @__cxx_global_var_init.2()
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__cxx_global_var_init
+// CHECK3-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) @test)
+// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN1SIfED1Ev, ptr @test, ptr @__dso_handle) #[[ATTR2:[0-9]+]]
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN1SIfEC1Ev
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: call void @_ZN1SIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN1SIfED1Ev
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR2]]
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN1SIfEC2Ev
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @g, align 4
+// CHECK3-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
+// CHECK3-NEXT: store float [[CONV]], ptr [[F]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN1SIfED2Ev
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__cxx_global_var_init.1
+// CHECK3-SAME: () #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) @s_arr, float noundef 1.000000e+00)
+// CHECK3-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) getelementptr inbounds ([[STRUCT_S:%.*]], ptr @s_arr, i32 1), float noundef 2.000000e+00)
+// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @__cxx_global_array_dtor, ptr null, ptr @__dso_handle) #[[ATTR2]]
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN1SIfEC1Ef
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], float noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca float, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: store float [[A]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: call void @_ZN1SIfEC2Ef(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]], float noundef [[TMP0]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__cxx_global_array_dtor
+// CHECK3-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
+// CHECK3-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK3: arraydestroy.body:
+// CHECK3-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ getelementptr inbounds ([[STRUCT_S:%.*]], ptr @s_arr, i32 2), [[ENTRY:%.*]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK3-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1
+// CHECK3-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK3-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], @s_arr
+// CHECK3-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE1:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK3: arraydestroy.done1:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN1SIfEC2Ef
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], float noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca float, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: store float [[A]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load volatile i32, ptr @g, align 4
+// CHECK3-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to float
+// CHECK3-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[CONV]]
+// CHECK3-NEXT: store float [[ADD]], ptr [[F]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__cxx_global_var_init.2
+// CHECK3-SAME: () #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) @var, float noundef 3.000000e+00)
+// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN1SIfED1Ev, ptr @var, ptr @__dso_handle) #[[ATTR2]]
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@main
+// CHECK3-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK3-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 0, ptr [[TMP1]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr null, ptr [[TMP2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK3-NEXT: store ptr null, ptr [[TMP3]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK3-NEXT: store ptr null, ptr [[TMP4]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK3-NEXT: store ptr null, ptr [[TMP5]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK3-NEXT: store ptr null, ptr [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK3-NEXT: store ptr null, ptr [[TMP7]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK3-NEXT: store i64 2, ptr [[TMP8]], align 8
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK3-NEXT: store i64 0, ptr [[TMP9]], align 8
+// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP10]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP11]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK3-NEXT: store i32 0, ptr [[TMP12]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124.region_id, ptr [[KERNEL_ARGS]])
+// CHECK3-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+// CHECK3-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK3: omp_offload.failed:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124() #[[ATTR2]]
+// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK3: omp_offload.cont:
+// CHECK3-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiET_v()
+// CHECK3-NEXT: ret i32 [[CALL]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124
+// CHECK3-SAME: () #[[ATTR4:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124.omp_outlined)
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK3-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S], align 4
+// CHECK3-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S:%.*]], align 4
+// CHECK3-NEXT: [[SIVAR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
+// CHECK3-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i32 2
+// CHECK3-NEXT: br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK3: arrayctor.loop:
+// CHECK3-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK3-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]])
+// CHECK3-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYCTOR_CUR]], i32 1
+// CHECK3-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK3-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK3: arrayctor.cont:
+// CHECK3-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]])
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 1
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK3-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// CHECK3: omp.inner.for.cond.cleanup:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124.omp_outlined.omp_outlined, i32 [[TMP7]], i32 [[TMP8]])
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP12]])
+// CHECK3-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR2]]
+// CHECK3-NEXT: [[ARRAY_BEGIN2:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN2]], i32 2
+// CHECK3-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK3: arraydestroy.body:
+// CHECK3-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP13]], [[OMP_LOOP_EXIT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK3-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1
+// CHECK3-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK3-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN2]]
+// CHECK3-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE3:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK3: arraydestroy.done3:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124.omp_outlined.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK3-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S], align 4
+// CHECK3-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S:%.*]], align 4
+// CHECK3-NEXT: [[SIVAR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
+// CHECK3-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i32 2
+// CHECK3-NEXT: br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK3: arrayctor.loop:
+// CHECK3-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK3-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]])
+// CHECK3-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYCTOR_CUR]], i32 1
+// CHECK3-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK3-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK3: arrayctor.cont:
+// CHECK3-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]])
+// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 1
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK3-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// CHECK3: omp.inner.for.cond.cleanup:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[T_VAR]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC]], i32 0, i32 [[TMP11]]
+// CHECK3-NEXT: store i32 [[TMP10]], ptr [[ARRAYIDX]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[I]], align 4
+// CHECK3-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 [[TMP12]]
+// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX2]], ptr align 4 [[VAR]], i32 4, i1 false)
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[SIVAR]], align 4
+// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP14]], [[TMP13]]
+// CHECK3-NEXT: store i32 [[ADD3]], ptr [[SIVAR]], align 4
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK3-NEXT: store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP16:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP17]])
+// CHECK3-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR2]]
+// CHECK3-NEXT: [[ARRAY_BEGIN5:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN5]], i32 2
+// CHECK3-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK3: arraydestroy.body:
+// CHECK3-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP18]], [[OMP_LOOP_EXIT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK3-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1
+// CHECK3-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK3-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN5]]
+// CHECK3-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE6:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK3: arraydestroy.done6:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIiET_v
+// CHECK3-SAME: () #[[ATTR6:[0-9]+]] comdat {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4
+// CHECK3-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK3-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 4
+// CHECK3-NEXT: [[VAR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP1:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
+// CHECK3-NEXT: store i32 0, ptr [[T_VAR]], align 4
+// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
+// CHECK3-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
+// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
+// CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
+// CHECK3-NEXT: store ptr undef, ptr [[_TMP1]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 0, ptr [[TMP1]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr null, ptr [[TMP2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK3-NEXT: store ptr null, ptr [[TMP3]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK3-NEXT: store ptr null, ptr [[TMP4]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK3-NEXT: store ptr null, ptr [[TMP5]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK3-NEXT: store ptr null, ptr [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK3-NEXT: store ptr null, ptr [[TMP7]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK3-NEXT: store i64 2, ptr [[TMP8]], align 8
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK3-NEXT: store i64 0, ptr [[TMP9]], align 8
+// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP10]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP11]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK3-NEXT: store i32 0, ptr [[TMP12]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l80.region_id, ptr [[KERNEL_ARGS]])
+// CHECK3-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+// CHECK3-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK3: omp_offload.failed:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l80() #[[ATTR2]]
+// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK3: omp_offload.cont:
+// CHECK3-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK3-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i32 2
+// CHECK3-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK3: arraydestroy.body:
+// CHECK3-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP15]], [[OMP_OFFLOAD_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK3-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1
+// CHECK3-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK3-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]]
+// CHECK3-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK3: arraydestroy.done2:
+// CHECK3-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR2]]
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[RETVAL]], align 4
+// CHECK3-NEXT: ret i32 [[TMP16]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN1SIiEC1Ev
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: call void @_ZN1SIiEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN1SIiEC1Ei
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: call void @_ZN1SIiEC2Ei(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef [[TMP0]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l80
+// CHECK3-SAME: () #[[ATTR4]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l80.omp_outlined)
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l80.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP1:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK3-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 4
+// CHECK3-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4
+// CHECK3-NEXT: [[_TMP2:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr undef, ptr [[_TMP1]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK3-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i32 2
+// CHECK3-NEXT: br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK3: arrayctor.loop:
+// CHECK3-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]])
+// CHECK3-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYCTOR_CUR]], i32 1
+// CHECK3-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK3-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK3: arrayctor.cont:
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]])
+// CHECK3-NEXT: store ptr [[VAR]], ptr [[_TMP2]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 1
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK3-NEXT: br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// CHECK3: omp.inner.for.cond.cleanup:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l80.omp_outlined.omp_outlined, i32 [[TMP7]], i32 [[TMP8]])
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP12]])
+// CHECK3-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR2]]
+// CHECK3-NEXT: [[ARRAY_BEGIN4:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN4]], i32 2
+// CHECK3-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK3: arraydestroy.body:
+// CHECK3-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP13]], [[OMP_LOOP_EXIT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK3-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1
+// CHECK3-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK3-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN4]]
+// CHECK3-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE5:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK3: arraydestroy.done5:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l80.omp_outlined.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP1:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK3-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 4
+// CHECK3-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4
+// CHECK3-NEXT: [[_TMP2:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store ptr undef, ptr [[_TMP1]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK3-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i32 2
+// CHECK3-NEXT: br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK3: arrayctor.loop:
+// CHECK3-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]])
+// CHECK3-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYCTOR_CUR]], i32 1
+// CHECK3-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK3-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK3: arrayctor.cont:
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]])
+// CHECK3-NEXT: store ptr [[VAR]], ptr [[_TMP2]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 1
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK3-NEXT: br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// CHECK3: omp.inner.for.cond.cleanup:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[T_VAR]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC]], i32 0, i32 [[TMP11]]
+// CHECK3-NEXT: store i32 [[TMP10]], ptr [[ARRAYIDX]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = load ptr, ptr [[_TMP2]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4
+// CHECK3-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 [[TMP13]]
+// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX4]], ptr align 4 [[TMP12]], i32 4, i1 false)
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP14]], 1
+// CHECK3-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP16]])
+// CHECK3-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR2]]
+// CHECK3-NEXT: [[ARRAY_BEGIN6:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN6]], i32 2
+// CHECK3-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK3: arraydestroy.body:
+// CHECK3-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP17]], [[OMP_LOOP_EXIT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK3-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1
+// CHECK3-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK3-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN6]]
+// CHECK3-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE7:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK3: arraydestroy.done7:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN1SIiED1Ev
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR2]]
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN1SIiEC2Ev
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @g, align 4
+// CHECK3-NEXT: store i32 [[TMP0]], ptr [[F]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN1SIiEC2Ei
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load volatile i32, ptr @g, align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[F]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN1SIiED2Ev
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_target_teams_generic_loop_private_codegen.cpp
+// CHECK3-SAME: () #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: call void @__cxx_global_var_init()
+// CHECK3-NEXT: call void @__cxx_global_var_init.1()
+// CHECK3-NEXT: call void @__cxx_global_var_init.2()
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK3-SAME: () #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__cxx_global_var_init
+// CHECK5-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) @test)
+// CHECK5-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN1SIfED1Ev, ptr @test, ptr @__dso_handle) #[[ATTR2:[0-9]+]]
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@_ZN1SIfEC1Ev
+// CHECK5-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK5-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK5-NEXT: call void @_ZN1SIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]])
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@_ZN1SIfED1Ev
+// CHECK5-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK5-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK5-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR2]]
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@_ZN1SIfEC2Ev
+// CHECK5-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK5-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK5-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @g, align 4
+// CHECK5-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
+// CHECK5-NEXT: store float [[CONV]], ptr [[F]], align 4
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@_ZN1SIfED2Ev
+// CHECK5-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK5-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__cxx_global_var_init.1
+// CHECK5-SAME: () #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) @s_arr, float noundef 1.000000e+00)
+// CHECK5-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) getelementptr inbounds ([[STRUCT_S:%.*]], ptr @s_arr, i64 1), float noundef 2.000000e+00)
+// CHECK5-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @__cxx_global_array_dtor, ptr null, ptr @__dso_handle) #[[ATTR2]]
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@_ZN1SIfEC1Ef
+// CHECK5-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], float noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca float, align 4
+// CHECK5-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK5-NEXT: store float [[A]], ptr [[A_ADDR]], align 4
+// CHECK5-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK5-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4
+// CHECK5-NEXT: call void @_ZN1SIfEC2Ef(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]], float noundef [[TMP0]])
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__cxx_global_array_dtor
+// CHECK5-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK5-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK5: arraydestroy.body:
+// CHECK5-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ getelementptr inbounds ([[STRUCT_S:%.*]], ptr @s_arr, i64 2), [[ENTRY:%.*]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK5-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1
+// CHECK5-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK5-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], @s_arr
+// CHECK5-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE1:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK5: arraydestroy.done1:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@_ZN1SIfEC2Ef
+// CHECK5-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], float noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: [[A_ADDR:%.*]] = alloca float, align 4
+// CHECK5-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK5-NEXT: store float [[A]], ptr [[A_ADDR]], align 4
+// CHECK5-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK5-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK5-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4
+// CHECK5-NEXT: [[TMP1:%.*]] = load volatile i32, ptr @g, align 4
+// CHECK5-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to float
+// CHECK5-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[CONV]]
+// CHECK5-NEXT: store float [[ADD]], ptr [[F]], align 4
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@__cxx_global_var_init.2
+// CHECK5-SAME: () #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) @var, float noundef 3.000000e+00)
+// CHECK5-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN1SIfED1Ev, ptr @var, ptr @__dso_handle) #[[ATTR2]]
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@main
+// CHECK5-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_ANON:%.*]], align 1
+// CHECK5-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK5-NEXT: call void @"_ZZ4mainENK3$_0clEv"(ptr noundef nonnull align 1 dereferenceable(1) [[REF_TMP]])
+// CHECK5-NEXT: ret i32 0
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l104
+// CHECK5-SAME: () #[[ATTR5:[0-9]+]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3:[0-9]+]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l104.omp_outlined)
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l104.omp_outlined
+// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR5]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[G:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[G1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[_TMP2:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: [[SIVAR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK5-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK5-NEXT: store ptr undef, ptr [[_TMP1]], align 8
+// CHECK5-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: store ptr [[G1]], ptr [[_TMP2]], align 8
+// CHECK5-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK5-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 1
+// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK5: cond.true:
+// CHECK5-NEXT: br label [[COND_END:%.*]]
+// CHECK5: cond.false:
+// CHECK5-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: br label [[COND_END]]
+// CHECK5: cond.end:
+// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK5-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK5: omp.inner.for.cond:
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK5-NEXT: br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK5: omp.inner.for.body:
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK5-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK5-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l104.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]])
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK5: omp.inner.for.inc:
+// CHECK5-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK5-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK5: omp.inner.for.end:
+// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK5: omp.loop.exit:
+// CHECK5-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP1]])
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l104.omp_outlined.omp_outlined
+// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR5]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[G:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[G1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[_TMP3:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: [[SIVAR:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_ANON_0:%.*]], align 8
+// CHECK5-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK5-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK5-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK5-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK5-NEXT: store ptr undef, ptr [[_TMP1]], align 8
+// CHECK5-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: [[TMP0:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK5-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK5-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK5-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK5-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: store ptr [[G1]], ptr [[_TMP3]], align 8
+// CHECK5-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK5-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK5-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 1
+// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK5: cond.true:
+// CHECK5-NEXT: br label [[COND_END:%.*]]
+// CHECK5: cond.false:
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: br label [[COND_END]]
+// CHECK5: cond.end:
+// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK5-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK5: omp.inner.for.cond:
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: [[CMP4:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK5-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK5: omp.inner.for.body:
+// CHECK5-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK5-NEXT: store i32 1, ptr [[G]], align 4
+// CHECK5-NEXT: [[TMP10:%.*]] = load ptr, ptr [[_TMP3]], align 8
+// CHECK5-NEXT: store volatile i32 1, ptr [[TMP10]], align 4
+// CHECK5-NEXT: store i32 2, ptr [[SIVAR]], align 4
+// CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 0
+// CHECK5-NEXT: store ptr [[G]], ptr [[TMP11]], align 8
+// CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 1
+// CHECK5-NEXT: [[TMP13:%.*]] = load ptr, ptr [[_TMP3]], align 8
+// CHECK5-NEXT: store ptr [[TMP13]], ptr [[TMP12]], align 8
+// CHECK5-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 2
+// CHECK5-NEXT: store ptr [[SIVAR]], ptr [[TMP14]], align 8
+// CHECK5-NEXT: call void @"_ZZZ4mainENK3$_0clEvENKUlvE_clEv"(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]])
+// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK5: omp.body.continue:
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK5: omp.inner.for.inc:
+// CHECK5-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK5-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK5: omp.inner.for.end:
+// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK5: omp.loop.exit:
+// CHECK5-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP3]])
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_target_teams_generic_loop_private_codegen.cpp
+// CHECK5-SAME: () #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: call void @__cxx_global_var_init()
+// CHECK5-NEXT: call void @__cxx_global_var_init.1()
+// CHECK5-NEXT: call void @__cxx_global_var_init.2()
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK5-SAME: () #[[ATTR0]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124
+// CHECK13-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK13-NEXT: entry:
+// CHECK13-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3:[0-9]+]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124.omp_outlined)
+// CHECK13-NEXT: ret void
+//
+//
+// CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124.omp_outlined
+// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
+// CHECK13-NEXT: entry:
+// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK13-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S], align 4
+// CHECK13-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S:%.*]], align 4
+// CHECK13-NEXT: [[SIVAR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK13-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK13-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK13-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
+// CHECK13-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i64 2
+// CHECK13-NEXT: br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK13: arrayctor.loop:
+// CHECK13-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK13-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]]) #[[ATTR4:[0-9]+]]
+// CHECK13-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYCTOR_CUR]], i64 1
+// CHECK13-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK13-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK13: arrayctor.cont:
+// CHECK13-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]]
+// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK13-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK13-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK13-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 1
+// CHECK13-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK13: cond.true:
+// CHECK13-NEXT: br label [[COND_END:%.*]]
+// CHECK13: cond.false:
+// CHECK13-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: br label [[COND_END]]
+// CHECK13: cond.end:
+// CHECK13-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK13-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK13: omp.inner.for.cond:
+// CHECK13-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK13-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// CHECK13: omp.inner.for.cond.cleanup:
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// CHECK13: omp.inner.for.body:
+// CHECK13-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK13-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK13-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]])
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK13: omp.inner.for.inc:
+// CHECK13-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK13-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK13: omp.inner.for.end:
+// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK13: omp.loop.exit:
+// CHECK13-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK13-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4
+// CHECK13-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP14]])
+// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR5:[0-9]+]]
+// CHECK13-NEXT: [[ARRAY_BEGIN2:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
+// CHECK13-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN2]], i64 2
+// CHECK13-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK13: arraydestroy.body:
+// CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP15]], [[OMP_LOOP_EXIT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1
+// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]]
+// CHECK13-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN2]]
+// CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE3:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK13: arraydestroy.done3:
+// CHECK13-NEXT: ret void
+//
+//
+// CHECK13-LABEL: define {{[^@]+}}@_ZN1SIfEC1Ev
+// CHECK13-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 {
+// CHECK13-NEXT: entry:
+// CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK13-NEXT: call void @_ZN1SIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]]
+// CHECK13-NEXT: ret void
+//
+//
+// CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124.omp_outlined.omp_outlined
+// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR0]] {
+// CHECK13-NEXT: entry:
+// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK13-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK13-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK13-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S], align 4
+// CHECK13-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S:%.*]], align 4
+// CHECK13-NEXT: [[SIVAR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK13-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK13-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK13-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK13-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK13-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4
+// CHECK13-NEXT: [[TMP0:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK13-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK13-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK13-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK13-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK13-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
+// CHECK13-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK13-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
+// CHECK13-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i64 2
+// CHECK13-NEXT: br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK13: arrayctor.loop:
+// CHECK13-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK13-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]]) #[[ATTR4]]
+// CHECK13-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYCTOR_CUR]], i64 1
+// CHECK13-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK13-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK13: arrayctor.cont:
+// CHECK13-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]]
+// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK13-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK13-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK13-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK13-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 1
+// CHECK13-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK13: cond.true:
+// CHECK13-NEXT: br label [[COND_END:%.*]]
+// CHECK13: cond.false:
+// CHECK13-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK13-NEXT: br label [[COND_END]]
+// CHECK13: cond.end:
+// CHECK13-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK13-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK13-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK13-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK13: omp.inner.for.cond:
+// CHECK13-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK13-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK13-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// CHECK13: omp.inner.for.cond.cleanup:
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// CHECK13: omp.inner.for.body:
+// CHECK13-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK13-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK13-NEXT: [[TMP10:%.*]] = load i32, ptr [[T_VAR]], align 4
+// CHECK13-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK13-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP11]] to i64
+// CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC]], i64 0, i64 [[IDXPROM]]
+// CHECK13-NEXT: store i32 [[TMP10]], ptr [[ARRAYIDX]], align 4
+// CHECK13-NEXT: [[TMP12:%.*]] = load i32, ptr [[I]], align 4
+// CHECK13-NEXT: [[IDXPROM3:%.*]] = sext i32 [[TMP12]] to i64
+// CHECK13-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 [[IDXPROM3]]
+// CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ARRAYIDX4]], ptr align 4 [[VAR]], i64 4, i1 false)
+// CHECK13-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4
+// CHECK13-NEXT: [[TMP14:%.*]] = load i32, ptr [[SIVAR]], align 4
+// CHECK13-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP14]], [[TMP13]]
+// CHECK13-NEXT: store i32 [[ADD5]], ptr [[SIVAR]], align 4
+// CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK13: omp.body.continue:
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK13: omp.inner.for.inc:
+// CHECK13-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK13-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK13: omp.inner.for.end:
+// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK13: omp.loop.exit:
+// CHECK13-NEXT: [[TMP16:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK13-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4
+// CHECK13-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP17]])
+// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR5]]
+// CHECK13-NEXT: [[ARRAY_BEGIN7:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
+// CHECK13-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN7]], i64 2
+// CHECK13-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK13: arraydestroy.body:
+// CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP18]], [[OMP_LOOP_EXIT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1
+// CHECK13-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]]
+// CHECK13-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN7]]
+// CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE8:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK13: arraydestroy.done8:
+// CHECK13-NEXT: ret void
+//
+//
+// CHECK13-LABEL: define {{[^@]+}}@_ZN1SIfED1Ev
+// CHECK13-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK13-NEXT: entry:
+// CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK13-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]]
+// CHECK13-NEXT: ret void
+//
+//
+// CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l80
+// CHECK13-SAME: () #[[ATTR0]] {
+// CHECK13-NEXT: entry:
+// CHECK13-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l80.omp_outlined)
+// CHECK13-NEXT: ret void
+//
+//
+// CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l80.omp_outlined
+// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
+// CHECK13-NEXT: entry:
+// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8
+// CHECK13-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK13-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 4
+// CHECK13-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4
+// CHECK13-NEXT: [[_TMP2:%.*]] = alloca ptr, align 8
+// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK13-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK13-NEXT: store ptr undef, ptr [[_TMP1]], align 8
+// CHECK13-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK13-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK13-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i64 2
+// CHECK13-NEXT: br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK13: arrayctor.loop:
+// CHECK13-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK13-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]]) #[[ATTR4]]
+// CHECK13-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYCTOR_CUR]], i64 1
+// CHECK13-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK13-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK13: arrayctor.cont:
+// CHECK13-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]]
+// CHECK13-NEXT: store ptr [[VAR]], ptr [[_TMP2]], align 8
+// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK13-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK13-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK13-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 1
+// CHECK13-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK13: cond.true:
+// CHECK13-NEXT: br label [[COND_END:%.*]]
+// CHECK13: cond.false:
+// CHECK13-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: br label [[COND_END]]
+// CHECK13: cond.end:
+// CHECK13-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK13-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK13: omp.inner.for.cond:
+// CHECK13-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK13-NEXT: br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// CHECK13: omp.inner.for.cond.cleanup:
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// CHECK13: omp.inner.for.body:
+// CHECK13-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK13-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK13-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK13-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK13-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l80.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]])
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK13: omp.inner.for.inc:
+// CHECK13-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK13-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK13: omp.inner.for.end:
+// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK13: omp.loop.exit:
+// CHECK13-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK13-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4
+// CHECK13-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP14]])
+// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR5]]
+// CHECK13-NEXT: [[ARRAY_BEGIN4:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK13-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN4]], i64 2
+// CHECK13-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK13: arraydestroy.body:
+// CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP15]], [[OMP_LOOP_EXIT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1
+// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]]
+// CHECK13-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN4]]
+// CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE5:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK13: arraydestroy.done5:
+// CHECK13-NEXT: ret void
+//
+//
+// CHECK13-LABEL: define {{[^@]+}}@_ZN1SIiEC1Ev
+// CHECK13-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK13-NEXT: entry:
+// CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK13-NEXT: call void @_ZN1SIiEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]]
+// CHECK13-NEXT: ret void
+//
+//
+// CHECK13-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l80.omp_outlined.omp_outlined
+// CHECK13-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR0]] {
+// CHECK13-NEXT: entry:
+// CHECK13-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK13-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK13-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK13-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK13-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8
+// CHECK13-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK13-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 4
+// CHECK13-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4
+// CHECK13-NEXT: [[_TMP3:%.*]] = alloca ptr, align 8
+// CHECK13-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK13-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK13-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK13-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK13-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK13-NEXT: store ptr undef, ptr [[_TMP1]], align 8
+// CHECK13-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK13-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4
+// CHECK13-NEXT: [[TMP0:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK13-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK13-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK13-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK13-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK13-NEXT: store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
+// CHECK13-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK13-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK13-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK13-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i64 2
+// CHECK13-NEXT: br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK13: arrayctor.loop:
+// CHECK13-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK13-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]]) #[[ATTR4]]
+// CHECK13-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYCTOR_CUR]], i64 1
+// CHECK13-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK13-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK13: arrayctor.cont:
+// CHECK13-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]]
+// CHECK13-NEXT: store ptr [[VAR]], ptr [[_TMP3]], align 8
+// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK13-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK13-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK13-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK13-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 1
+// CHECK13-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK13: cond.true:
+// CHECK13-NEXT: br label [[COND_END:%.*]]
+// CHECK13: cond.false:
+// CHECK13-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK13-NEXT: br label [[COND_END]]
+// CHECK13: cond.end:
+// CHECK13-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK13-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK13-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK13-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK13: omp.inner.for.cond:
+// CHECK13-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK13-NEXT: [[CMP4:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK13-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// CHECK13: omp.inner.for.cond.cleanup:
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// CHECK13: omp.inner.for.body:
+// CHECK13-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK13-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK13-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK13-NEXT: [[TMP10:%.*]] = load i32, ptr [[T_VAR]], align 4
+// CHECK13-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK13-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP11]] to i64
+// CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC]], i64 0, i64 [[IDXPROM]]
+// CHECK13-NEXT: store i32 [[TMP10]], ptr [[ARRAYIDX]], align 4
+// CHECK13-NEXT: [[TMP12:%.*]] = load ptr, ptr [[_TMP3]], align 8
+// CHECK13-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4
+// CHECK13-NEXT: [[IDXPROM5:%.*]] = sext i32 [[TMP13]] to i64
+// CHECK13-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 [[IDXPROM5]]
+// CHECK13-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ARRAYIDX6]], ptr align 4 [[TMP12]], i64 4, i1 false)
+// CHECK13-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK13: omp.body.continue:
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK13: omp.inner.for.inc:
+// CHECK13-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP14]], 1
+// CHECK13-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4
+// CHECK13-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK13: omp.inner.for.end:
+// CHECK13-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK13: omp.loop.exit:
+// CHECK13-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK13-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4
+// CHECK13-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP16]])
+// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR5]]
+// CHECK13-NEXT: [[ARRAY_BEGIN8:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK13-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN8]], i64 2
+// CHECK13-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK13: arraydestroy.body:
+// CHECK13-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP17]], [[OMP_LOOP_EXIT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK13-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1
+// CHECK13-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]]
+// CHECK13-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN8]]
+// CHECK13-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE9:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK13: arraydestroy.done9:
+// CHECK13-NEXT: ret void
+//
+//
+// CHECK13-LABEL: define {{[^@]+}}@_ZN1SIiED1Ev
+// CHECK13-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK13-NEXT: entry:
+// CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK13-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]]
+// CHECK13-NEXT: ret void
+//
+//
+// CHECK13-LABEL: define {{[^@]+}}@_ZN1SIfEC2Ev
+// CHECK13-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK13-NEXT: entry:
+// CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK13-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK13-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @g, align 4
+// CHECK13-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
+// CHECK13-NEXT: store float [[CONV]], ptr [[F]], align 4
+// CHECK13-NEXT: ret void
+//
+//
+// CHECK13-LABEL: define {{[^@]+}}@_ZN1SIfED2Ev
+// CHECK13-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK13-NEXT: entry:
+// CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK13-NEXT: ret void
+//
+//
+// CHECK13-LABEL: define {{[^@]+}}@_ZN1SIiEC2Ev
+// CHECK13-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK13-NEXT: entry:
+// CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK13-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK13-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @g, align 4
+// CHECK13-NEXT: store i32 [[TMP0]], ptr [[F]], align 4
+// CHECK13-NEXT: ret void
+//
+//
+// CHECK13-LABEL: define {{[^@]+}}@_ZN1SIiED2Ev
+// CHECK13-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK13-NEXT: entry:
+// CHECK13-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK13-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK13-NEXT: ret void
+//
+//
+// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124
+// CHECK15-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3:[0-9]+]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124.omp_outlined)
+// CHECK15-NEXT: ret void
+//
+//
+// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124.omp_outlined
+// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK15-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S], align 4
+// CHECK15-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S:%.*]], align 4
+// CHECK15-NEXT: [[SIVAR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK15-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
+// CHECK15-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i32 2
+// CHECK15-NEXT: br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK15: arrayctor.loop:
+// CHECK15-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK15-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]]) #[[ATTR4:[0-9]+]]
+// CHECK15-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYCTOR_CUR]], i32 1
+// CHECK15-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK15-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK15: arrayctor.cont:
+// CHECK15-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]]
+// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 1
+// CHECK15-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK15: cond.true:
+// CHECK15-NEXT: br label [[COND_END:%.*]]
+// CHECK15: cond.false:
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: br label [[COND_END]]
+// CHECK15: cond.end:
+// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK15-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK15: omp.inner.for.cond:
+// CHECK15-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK15-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// CHECK15: omp.inner.for.cond.cleanup:
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// CHECK15: omp.inner.for.body:
+// CHECK15-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124.omp_outlined.omp_outlined, i32 [[TMP7]], i32 [[TMP8]])
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK15: omp.inner.for.inc:
+// CHECK15-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// CHECK15-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK15: omp.inner.for.end:
+// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK15: omp.loop.exit:
+// CHECK15-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP12]])
+// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR5:[0-9]+]]
+// CHECK15-NEXT: [[ARRAY_BEGIN2:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
+// CHECK15-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN2]], i32 2
+// CHECK15-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK15: arraydestroy.body:
+// CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP13]], [[OMP_LOOP_EXIT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1
+// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]]
+// CHECK15-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN2]]
+// CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE3:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK15: arraydestroy.done3:
+// CHECK15-NEXT: ret void
+//
+//
+// CHECK15-LABEL: define {{[^@]+}}@_ZN1SIfEC1Ev
+// CHECK15-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK15-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK15-NEXT: call void @_ZN1SIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]]
+// CHECK15-NEXT: ret void
+//
+//
+// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l124.omp_outlined.omp_outlined
+// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK15-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S], align 4
+// CHECK15-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S:%.*]], align 4
+// CHECK15-NEXT: [[SIVAR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK15-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP1]], ptr [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK15-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
+// CHECK15-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i32 2
+// CHECK15-NEXT: br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK15: arrayctor.loop:
+// CHECK15-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK15-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]]) #[[ATTR4]]
+// CHECK15-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYCTOR_CUR]], i32 1
+// CHECK15-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK15-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK15: arrayctor.cont:
+// CHECK15-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]]
+// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK15-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 1
+// CHECK15-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK15: cond.true:
+// CHECK15-NEXT: br label [[COND_END:%.*]]
+// CHECK15: cond.false:
+// CHECK15-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: br label [[COND_END]]
+// CHECK15: cond.end:
+// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK15-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK15: omp.inner.for.cond:
+// CHECK15-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK15-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// CHECK15: omp.inner.for.cond.cleanup:
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// CHECK15: omp.inner.for.body:
+// CHECK15-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK15-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK15-NEXT: [[TMP10:%.*]] = load i32, ptr [[T_VAR]], align 4
+// CHECK15-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC]], i32 0, i32 [[TMP11]]
+// CHECK15-NEXT: store i32 [[TMP10]], ptr [[ARRAYIDX]], align 4
+// CHECK15-NEXT: [[TMP12:%.*]] = load i32, ptr [[I]], align 4
+// CHECK15-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 [[TMP12]]
+// CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX2]], ptr align 4 [[VAR]], i32 4, i1 false)
+// CHECK15-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4
+// CHECK15-NEXT: [[TMP14:%.*]] = load i32, ptr [[SIVAR]], align 4
+// CHECK15-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP14]], [[TMP13]]
+// CHECK15-NEXT: store i32 [[ADD3]], ptr [[SIVAR]], align 4
+// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK15: omp.body.continue:
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK15: omp.inner.for.inc:
+// CHECK15-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK15-NEXT: store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK15: omp.inner.for.end:
+// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK15: omp.loop.exit:
+// CHECK15-NEXT: [[TMP16:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP17]])
+// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR5]]
+// CHECK15-NEXT: [[ARRAY_BEGIN5:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
+// CHECK15-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN5]], i32 2
+// CHECK15-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK15: arraydestroy.body:
+// CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP18]], [[OMP_LOOP_EXIT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1
+// CHECK15-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]]
+// CHECK15-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN5]]
+// CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE6:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK15: arraydestroy.done6:
+// CHECK15-NEXT: ret void
+//
+//
+// CHECK15-LABEL: define {{[^@]+}}@_ZN1SIfED1Ev
+// CHECK15-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK15-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK15-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]]
+// CHECK15-NEXT: ret void
+//
+//
+// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l80
+// CHECK15-SAME: () #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l80.omp_outlined)
+// CHECK15-NEXT: ret void
+//
+//
+// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l80.omp_outlined
+// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[_TMP1:%.*]] = alloca ptr, align 4
+// CHECK15-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK15-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 4
+// CHECK15-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4
+// CHECK15-NEXT: [[_TMP2:%.*]] = alloca ptr, align 4
+// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK15-NEXT: store ptr undef, ptr [[_TMP1]], align 4
+// CHECK15-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK15-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK15-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i32 2
+// CHECK15-NEXT: br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK15: arrayctor.loop:
+// CHECK15-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK15-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]]) #[[ATTR4]]
+// CHECK15-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYCTOR_CUR]], i32 1
+// CHECK15-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK15-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK15: arrayctor.cont:
+// CHECK15-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]]
+// CHECK15-NEXT: store ptr [[VAR]], ptr [[_TMP2]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK15-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 1
+// CHECK15-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK15: cond.true:
+// CHECK15-NEXT: br label [[COND_END:%.*]]
+// CHECK15: cond.false:
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: br label [[COND_END]]
+// CHECK15: cond.end:
+// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK15-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK15: omp.inner.for.cond:
+// CHECK15-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK15-NEXT: br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// CHECK15: omp.inner.for.cond.cleanup:
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// CHECK15: omp.inner.for.body:
+// CHECK15-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK15-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK15-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l80.omp_outlined.omp_outlined, i32 [[TMP7]], i32 [[TMP8]])
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK15: omp.inner.for.inc:
+// CHECK15-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// CHECK15-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK15: omp.inner.for.end:
+// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK15: omp.loop.exit:
+// CHECK15-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP12]])
+// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR5]]
+// CHECK15-NEXT: [[ARRAY_BEGIN4:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK15-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN4]], i32 2
+// CHECK15-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK15: arraydestroy.body:
+// CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP13]], [[OMP_LOOP_EXIT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1
+// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]]
+// CHECK15-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN4]]
+// CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE5:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK15: arraydestroy.done5:
+// CHECK15-NEXT: ret void
+//
+//
+// CHECK15-LABEL: define {{[^@]+}}@_ZN1SIiEC1Ev
+// CHECK15-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK15-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK15-NEXT: call void @_ZN1SIiEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]]
+// CHECK15-NEXT: ret void
+//
+//
+// CHECK15-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l80.omp_outlined.omp_outlined
+// CHECK15-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR0]] {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK15-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK15-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[_TMP1:%.*]] = alloca ptr, align 4
+// CHECK15-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK15-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 4
+// CHECK15-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4
+// CHECK15-NEXT: [[_TMP2:%.*]] = alloca ptr, align 4
+// CHECK15-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK15-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: store ptr undef, ptr [[_TMP1]], align 4
+// CHECK15-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK15-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK15-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP1]], ptr [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK15-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK15-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK15-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i32 2
+// CHECK15-NEXT: br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK15: arrayctor.loop:
+// CHECK15-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK15-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]]) #[[ATTR4]]
+// CHECK15-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYCTOR_CUR]], i32 1
+// CHECK15-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK15-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK15: arrayctor.cont:
+// CHECK15-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR4]]
+// CHECK15-NEXT: store ptr [[VAR]], ptr [[_TMP2]], align 4
+// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK15-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 1
+// CHECK15-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK15: cond.true:
+// CHECK15-NEXT: br label [[COND_END:%.*]]
+// CHECK15: cond.false:
+// CHECK15-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: br label [[COND_END]]
+// CHECK15: cond.end:
+// CHECK15-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK15-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK15-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK15: omp.inner.for.cond:
+// CHECK15-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK15-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK15-NEXT: br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// CHECK15: omp.inner.for.cond.cleanup:
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// CHECK15: omp.inner.for.body:
+// CHECK15-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK15-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK15-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK15-NEXT: [[TMP10:%.*]] = load i32, ptr [[T_VAR]], align 4
+// CHECK15-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC]], i32 0, i32 [[TMP11]]
+// CHECK15-NEXT: store i32 [[TMP10]], ptr [[ARRAYIDX]], align 4
+// CHECK15-NEXT: [[TMP12:%.*]] = load ptr, ptr [[_TMP2]], align 4
+// CHECK15-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4
+// CHECK15-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 [[TMP13]]
+// CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX4]], ptr align 4 [[TMP12]], i32 4, i1 false)
+// CHECK15-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK15: omp.body.continue:
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK15: omp.inner.for.inc:
+// CHECK15-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP14]], 1
+// CHECK15-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_IV]], align 4
+// CHECK15-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK15: omp.inner.for.end:
+// CHECK15-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK15: omp.loop.exit:
+// CHECK15-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK15-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4
+// CHECK15-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP16]])
+// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR5]]
+// CHECK15-NEXT: [[ARRAY_BEGIN6:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK15-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN6]], i32 2
+// CHECK15-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK15: arraydestroy.body:
+// CHECK15-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP17]], [[OMP_LOOP_EXIT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK15-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1
+// CHECK15-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR5]]
+// CHECK15-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN6]]
+// CHECK15-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE7:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK15: arraydestroy.done7:
+// CHECK15-NEXT: ret void
+//
+//
+// CHECK15-LABEL: define {{[^@]+}}@_ZN1SIiED1Ev
+// CHECK15-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK15-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK15-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR5]]
+// CHECK15-NEXT: ret void
+//
+//
+// CHECK15-LABEL: define {{[^@]+}}@_ZN1SIfEC2Ev
+// CHECK15-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK15-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK15-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK15-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @g, align 4
+// CHECK15-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
+// CHECK15-NEXT: store float [[CONV]], ptr [[F]], align 4
+// CHECK15-NEXT: ret void
+//
+//
+// CHECK15-LABEL: define {{[^@]+}}@_ZN1SIfED2Ev
+// CHECK15-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK15-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK15-NEXT: ret void
+//
+//
+// CHECK15-LABEL: define {{[^@]+}}@_ZN1SIiEC2Ev
+// CHECK15-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK15-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK15-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK15-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @g, align 4
+// CHECK15-NEXT: store i32 [[TMP0]], ptr [[F]], align 4
+// CHECK15-NEXT: ret void
+//
+//
+// CHECK15-LABEL: define {{[^@]+}}@_ZN1SIiED2Ev
+// CHECK15-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK15-NEXT: entry:
+// CHECK15-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK15-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK15-NEXT: ret void
+//
+//
+// CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l104
+// CHECK17-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK17-NEXT: entry:
+// CHECK17-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3:[0-9]+]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l104.omp_outlined)
+// CHECK17-NEXT: ret void
+//
+//
+// CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l104.omp_outlined
+// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
+// CHECK17-NEXT: entry:
+// CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK17-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8
+// CHECK17-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[G:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[G1:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[_TMP2:%.*]] = alloca ptr, align 8
+// CHECK17-NEXT: [[SIVAR:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK17-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK17-NEXT: store ptr undef, ptr [[_TMP1]], align 8
+// CHECK17-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK17-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK17-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK17-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK17-NEXT: store ptr [[G1]], ptr [[_TMP2]], align 8
+// CHECK17-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK17-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK17-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK17-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK17-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 1
+// CHECK17-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK17: cond.true:
+// CHECK17-NEXT: br label [[COND_END:%.*]]
+// CHECK17: cond.false:
+// CHECK17-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK17-NEXT: br label [[COND_END]]
+// CHECK17: cond.end:
+// CHECK17-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK17-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK17-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK17-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK17-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK17: omp.inner.for.cond:
+// CHECK17-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK17-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK17-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK17-NEXT: br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK17: omp.inner.for.body:
+// CHECK17-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK17-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK17-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK17-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK17-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l104.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]])
+// CHECK17-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK17: omp.inner.for.inc:
+// CHECK17-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK17-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK17-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK17-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK17-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK17: omp.inner.for.end:
+// CHECK17-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK17: omp.loop.exit:
+// CHECK17-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP1]])
+// CHECK17-NEXT: ret void
+//
+//
+// CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l104.omp_outlined.omp_outlined
+// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR0]] {
+// CHECK17-NEXT: entry:
+// CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK17-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK17-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK17-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8
+// CHECK17-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[G:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[G1:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[_TMP3:%.*]] = alloca ptr, align 8
+// CHECK17-NEXT: [[SIVAR:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_ANON:%.*]], align 8
+// CHECK17-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK17-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK17-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK17-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK17-NEXT: store ptr undef, ptr [[_TMP1]], align 8
+// CHECK17-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK17-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4
+// CHECK17-NEXT: [[TMP0:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK17-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK17-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK17-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK17-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK17-NEXT: store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
+// CHECK17-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK17-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK17-NEXT: store ptr [[G1]], ptr [[_TMP3]], align 8
+// CHECK17-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK17-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK17-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK17-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK17-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 1
+// CHECK17-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK17: cond.true:
+// CHECK17-NEXT: br label [[COND_END:%.*]]
+// CHECK17: cond.false:
+// CHECK17-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK17-NEXT: br label [[COND_END]]
+// CHECK17: cond.end:
+// CHECK17-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK17-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK17-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK17-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK17-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK17: omp.inner.for.cond:
+// CHECK17-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK17-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK17-NEXT: [[CMP4:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK17-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK17: omp.inner.for.body:
+// CHECK17-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK17-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK17-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK17-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK17-NEXT: store i32 1, ptr [[G]], align 4
+// CHECK17-NEXT: [[TMP10:%.*]] = load ptr, ptr [[_TMP3]], align 8
+// CHECK17-NEXT: store volatile i32 1, ptr [[TMP10]], align 4
+// CHECK17-NEXT: store i32 2, ptr [[SIVAR]], align 4
+// CHECK17-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[REF_TMP]], i32 0, i32 0
+// CHECK17-NEXT: store ptr [[G]], ptr [[TMP11]], align 8
+// CHECK17-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[REF_TMP]], i32 0, i32 1
+// CHECK17-NEXT: [[TMP13:%.*]] = load ptr, ptr [[_TMP3]], align 8
+// CHECK17-NEXT: store ptr [[TMP13]], ptr [[TMP12]], align 8
+// CHECK17-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[CLASS_ANON]], ptr [[REF_TMP]], i32 0, i32 2
+// CHECK17-NEXT: store ptr [[SIVAR]], ptr [[TMP14]], align 8
+// CHECK17-NEXT: call void @"_ZZZ4mainENK3$_0clEvENKUlvE_clEv"(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]]) #[[ATTR3:[0-9]+]]
+// CHECK17-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK17: omp.body.continue:
+// CHECK17-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK17: omp.inner.for.inc:
+// CHECK17-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK17-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK17-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_IV]], align 4
+// CHECK17-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK17: omp.inner.for.end:
+// CHECK17-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK17: omp.loop.exit:
+// CHECK17-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP3]])
+// CHECK17-NEXT: ret void
+//
diff --git a/clang/test/OpenMP/target_teams_generic_loop_reduction_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_reduction_codegen.cpp
new file mode 100644
index 00000000000000..fa48c02a656475
--- /dev/null
+++ b/clang/test/OpenMP/target_teams_generic_loop_reduction_codegen.cpp
@@ -0,0 +1,1510 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK3
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK3
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK5
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK5
+
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+template <typename T>
+T tmain() {
+ T t_var = T();
+ T vec[] = {1, 2};
+#pragma omp target teams distribute parallel for reduction(+: t_var)
+ for (int i = 0; i < 2; ++i) {
+ t_var += (T) i;
+ }
+ return T();
+}
+
+int main() {
+ static int sivar;
+#ifdef LAMBDA
+
+ [&]() {
+#pragma omp target teams distribute parallel for reduction(+: sivar)
+ for (int i = 0; i < 2; ++i) {
+
+ // Skip global and bound tid vars
+
+
+
+ // Skip global and bound tid vars, and prev lb and ub vars
+ // skip loop vars
+
+
+ sivar += i;
+
+ [&]() {
+
+ sivar += 4;
+
+ }();
+ }
+ }();
+ return 0;
+#else
+#pragma omp target teams distribute parallel for reduction(+: sivar)
+ for (int i = 0; i < 2; ++i) {
+ sivar += i;
+ }
+ return tmain<int>();
+#endif
+}
+
+
+
+
+// Skip global and bound tid vars
+
+
+// Skip global and bound tid vars, and prev lb and ub
+// skip loop vars
+
+
+
+
+// Skip global and bound tid vars
+
+
+// Skip global and bound tid vars, and prev lb and ub vars
+// skip loop vars
+
+#endif
+// CHECK1-LABEL: define {{[^@]+}}@main
+// CHECK1-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT: store ptr @_ZZ4mainE5sivar, ptr [[TMP0]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT: store ptr @_ZZ4mainE5sivar, ptr [[TMP1]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK1-NEXT: store ptr null, ptr [[TMP2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK1-NEXT: store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK1-NEXT: store i32 1, ptr [[TMP6]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK1-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK1-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK1-NEXT: store ptr @.offload_sizes, ptr [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK1-NEXT: store ptr @.offload_maptypes, ptr [[TMP10]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP11]], align 8
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP12]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK1-NEXT: store i64 2, ptr [[TMP13]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK1-NEXT: store i64 0, ptr [[TMP14]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP15]], align 4
+// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK1-NEXT: store i32 0, ptr [[TMP17]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB4:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.region_id, ptr [[KERNEL_ARGS]])
+// CHECK1-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK1-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK1: omp_offload.failed:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66(ptr @_ZZ4mainE5sivar) #[[ATTR2:[0-9]+]]
+// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK1: omp_offload.cont:
+// CHECK1-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z5tmainIiET_v()
+// CHECK1-NEXT: ret i32 [[CALL]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[SIVAR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[SIVAR]], ptr [[SIVAR_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 8
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB4]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined, ptr [[TMP0]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[SIVAR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[SIVAR1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[SIVAR]], ptr [[SIVAR_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[SIVAR1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP2]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 1
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB4]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined.omp_outlined, i64 [[TMP9]], i64 [[TMP11]], ptr [[SIVAR1]])
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP13]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]])
+// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK1-NEXT: ]
+// CHECK1: .omp.reduction.case1:
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[SIVAR1]], align 4
+// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK1-NEXT: store i32 [[ADD3]], ptr [[TMP0]], align 4
+// CHECK1-NEXT: call void @__kmpc_end_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK1: .omp.reduction.case2:
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[SIVAR1]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP18]] monotonic, align 4
+// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK1: .omp.reduction.default:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[SIVAR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[SIVAR2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[SIVAR]], ptr [[SIVAR_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[SIVAR2]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP4]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP5]], 1
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP6]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK1-NEXT: br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[SIVAR2]], align 4
+// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP12]], [[TMP11]]
+// CHECK1-NEXT: store i32 [[ADD4]], ptr [[SIVAR2]], align 4
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK1-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP4]])
+// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[SIVAR2]], ptr [[TMP14]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK1-NEXT: ]
+// CHECK1: .omp.reduction.case1:
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[SIVAR2]], align 4
+// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK1-NEXT: store i32 [[ADD6]], ptr [[TMP0]], align 4
+// CHECK1-NEXT: call void @__kmpc_end_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK1: .omp.reduction.case2:
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[SIVAR2]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP18]] monotonic, align 4
+// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK1: .omp.reduction.default:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined.omp_outlined.omp.reduction.reduction_func
+// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined.omp.reduction.reduction_func
+// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v
+// CHECK1-SAME: () #[[ATTR5:[0-9]+]] comdat {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
+// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
+// CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT: store ptr [[T_VAR]], ptr [[TMP0]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT: store ptr [[T_VAR]], ptr [[TMP1]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK1-NEXT: store ptr null, ptr [[TMP2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK1-NEXT: store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK1-NEXT: store i32 1, ptr [[TMP6]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK1-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK1-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK1-NEXT: store ptr @.offload_sizes.1, ptr [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK1-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP10]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP11]], align 8
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP12]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK1-NEXT: store i64 2, ptr [[TMP13]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK1-NEXT: store i64 0, ptr [[TMP14]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP15]], align 4
+// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK1-NEXT: store i32 0, ptr [[TMP17]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB4]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.region_id, ptr [[KERNEL_ARGS]])
+// CHECK1-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK1-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK1: omp_offload.failed:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32(ptr [[T_VAR]]) #[[ATTR2]]
+// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK1: omp_offload.cont:
+// CHECK1-NEXT: ret i32 0
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[T_VAR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[T_VAR]], ptr [[T_VAR_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 8
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB4]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined, ptr [[TMP0]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[T_VAR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[T_VAR1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[T_VAR]], ptr [[T_VAR_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[T_VAR1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP2]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 1
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB4]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined, i64 [[TMP9]], i64 [[TMP11]], ptr [[T_VAR1]])
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP13]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]])
+// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[T_VAR1]], ptr [[TMP14]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK1-NEXT: ]
+// CHECK1: .omp.reduction.case1:
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[T_VAR1]], align 4
+// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK1-NEXT: store i32 [[ADD3]], ptr [[TMP0]], align 4
+// CHECK1-NEXT: call void @__kmpc_end_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK1: .omp.reduction.case2:
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[T_VAR1]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP18]] monotonic, align 4
+// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK1: .omp.reduction.default:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[T_VAR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[T_VAR2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[T_VAR]], ptr [[T_VAR_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[T_VAR2]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP4]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP5]], 1
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP6]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK1-NEXT: br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[T_VAR2]], align 4
+// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP12]], [[TMP11]]
+// CHECK1-NEXT: store i32 [[ADD4]], ptr [[T_VAR2]], align 4
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK1-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP4]])
+// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[T_VAR2]], ptr [[TMP14]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK1-NEXT: ]
+// CHECK1: .omp.reduction.case1:
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[T_VAR2]], align 4
+// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK1-NEXT: store i32 [[ADD6]], ptr [[TMP0]], align 4
+// CHECK1-NEXT: call void @__kmpc_end_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK1: .omp.reduction.case2:
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[T_VAR2]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP18]] monotonic, align 4
+// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK1: .omp.reduction.default:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func
+// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func
+// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK1-SAME: () #[[ATTR7:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@main
+// CHECK3-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK3-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: store ptr @_ZZ4mainE5sivar, ptr [[TMP0]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: store ptr @_ZZ4mainE5sivar, ptr [[TMP1]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK3-NEXT: store ptr null, ptr [[TMP2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 1, ptr [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK3-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK3-NEXT: store ptr @.offload_sizes, ptr [[TMP9]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK3-NEXT: store ptr @.offload_maptypes, ptr [[TMP10]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK3-NEXT: store ptr null, ptr [[TMP11]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK3-NEXT: store ptr null, ptr [[TMP12]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK3-NEXT: store i64 2, ptr [[TMP13]], align 8
+// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK3-NEXT: store i64 0, ptr [[TMP14]], align 8
+// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP15]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK3-NEXT: store i32 0, ptr [[TMP17]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB4:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.region_id, ptr [[KERNEL_ARGS]])
+// CHECK3-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK3-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK3: omp_offload.failed:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66(ptr @_ZZ4mainE5sivar) #[[ATTR2:[0-9]+]]
+// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK3: omp_offload.cont:
+// CHECK3-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiET_v()
+// CHECK3-NEXT: ret i32 [[CALL]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[SIVAR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[SIVAR]], ptr [[SIVAR_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 4
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB4]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined, ptr [[TMP0]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[SIVAR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[SIVAR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[SIVAR]], ptr [[SIVAR_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[SIVAR1]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP2]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 1
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK3-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB4]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined.omp_outlined, i32 [[TMP8]], i32 [[TMP9]], ptr [[SIVAR1]])
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], [[TMP11]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]])
+// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK3-NEXT: store ptr [[SIVAR1]], ptr [[TMP12]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK3-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK3-NEXT: ]
+// CHECK3: .omp.reduction.case1:
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[SIVAR1]], align 4
+// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP14]], [[TMP15]]
+// CHECK3-NEXT: store i32 [[ADD3]], ptr [[TMP0]], align 4
+// CHECK3-NEXT: call void @__kmpc_end_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK3: .omp.reduction.case2:
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[SIVAR1]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP16]] monotonic, align 4
+// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK3: .omp.reduction.default:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[SIVAR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[SIVAR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[SIVAR]], ptr [[SIVAR_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[SIVAR1]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP4]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP5]], 1
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP6]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK3-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[SIVAR1]], align 4
+// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP12]], [[TMP11]]
+// CHECK3-NEXT: store i32 [[ADD3]], ptr [[SIVAR1]], align 4
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK3-NEXT: store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP4]])
+// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK3-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK3-NEXT: ]
+// CHECK3: .omp.reduction.case1:
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[SIVAR1]], align 4
+// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK3-NEXT: store i32 [[ADD5]], ptr [[TMP0]], align 4
+// CHECK3-NEXT: call void @__kmpc_end_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK3: .omp.reduction.case2:
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[SIVAR1]], align 4
+// CHECK3-NEXT: [[TMP19:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP18]] monotonic, align 4
+// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK3: .omp.reduction.default:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined.omp_outlined.omp.reduction.reduction_func
+// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
+// CHECK3-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l66.omp_outlined.omp.reduction.reduction_func
+// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
+// CHECK3-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIiET_v
+// CHECK3-SAME: () #[[ATTR5:[0-9]+]] comdat {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK3-NEXT: store i32 0, ptr [[T_VAR]], align 4
+// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
+// CHECK3-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: store ptr [[T_VAR]], ptr [[TMP0]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: store ptr [[T_VAR]], ptr [[TMP1]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK3-NEXT: store ptr null, ptr [[TMP2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 1, ptr [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK3-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK3-NEXT: store ptr @.offload_sizes.1, ptr [[TMP9]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK3-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP10]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK3-NEXT: store ptr null, ptr [[TMP11]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK3-NEXT: store ptr null, ptr [[TMP12]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK3-NEXT: store i64 2, ptr [[TMP13]], align 8
+// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK3-NEXT: store i64 0, ptr [[TMP14]], align 8
+// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP15]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK3-NEXT: store i32 0, ptr [[TMP17]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB4]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.region_id, ptr [[KERNEL_ARGS]])
+// CHECK3-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK3-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK3: omp_offload.failed:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32(ptr [[T_VAR]]) #[[ATTR2]]
+// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK3: omp_offload.cont:
+// CHECK3-NEXT: ret i32 0
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[T_VAR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[T_VAR]], ptr [[T_VAR_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 4
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB4]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined, ptr [[TMP0]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[T_VAR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[T_VAR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[T_VAR]], ptr [[T_VAR_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[T_VAR1]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP2]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 1
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK3-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB4]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined, i32 [[TMP8]], i32 [[TMP9]], ptr [[T_VAR1]])
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], [[TMP11]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]])
+// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK3-NEXT: store ptr [[T_VAR1]], ptr [[TMP12]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK3-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK3-NEXT: ]
+// CHECK3: .omp.reduction.case1:
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[T_VAR1]], align 4
+// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP14]], [[TMP15]]
+// CHECK3-NEXT: store i32 [[ADD3]], ptr [[TMP0]], align 4
+// CHECK3-NEXT: call void @__kmpc_end_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK3: .omp.reduction.case2:
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[T_VAR1]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP16]] monotonic, align 4
+// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK3: .omp.reduction.default:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[T_VAR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[T_VAR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[T_VAR]], ptr [[T_VAR_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[T_VAR1]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP4]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP5]], 1
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP6]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK3-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[T_VAR1]], align 4
+// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP12]], [[TMP11]]
+// CHECK3-NEXT: store i32 [[ADD3]], ptr [[T_VAR1]], align 4
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK3-NEXT: store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP4]])
+// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK3-NEXT: store ptr [[T_VAR1]], ptr [[TMP14]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK3-NEXT: ]
+// CHECK3: .omp.reduction.case1:
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[T_VAR1]], align 4
+// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK3-NEXT: store i32 [[ADD5]], ptr [[TMP0]], align 4
+// CHECK3-NEXT: call void @__kmpc_end_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK3: .omp.reduction.case2:
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[T_VAR1]], align 4
+// CHECK3-NEXT: [[TMP19:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP18]] monotonic, align 4
+// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK3: .omp.reduction.default:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func
+// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
+// CHECK3-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func
+// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
+// CHECK3-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK3-SAME: () #[[ATTR7:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@main
+// CHECK5-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_ANON:%.*]], align 1
+// CHECK5-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK5-NEXT: call void @"_ZZ4mainENK3$_0clEv"(ptr noundef nonnull align 1 dereferenceable(1) [[REF_TMP]])
+// CHECK5-NEXT: ret i32 0
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l44
+// CHECK5-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[SIVAR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: store ptr [[SIVAR]], ptr [[SIVAR_ADDR]], align 8
+// CHECK5-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 8
+// CHECK5-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB4:[0-9]+]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l44.omp_outlined, ptr [[TMP0]])
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l44.omp_outlined
+// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: [[SIVAR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: [[SIVAR1:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// CHECK5-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK5-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK5-NEXT: store ptr [[SIVAR]], ptr [[SIVAR_ADDR]], align 8
+// CHECK5-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 8
+// CHECK5-NEXT: store i32 0, ptr [[SIVAR1]], align 4
+// CHECK5-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK5-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP2]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK5-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 1
+// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK5: cond.true:
+// CHECK5-NEXT: br label [[COND_END:%.*]]
+// CHECK5: cond.false:
+// CHECK5-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: br label [[COND_END]]
+// CHECK5: cond.end:
+// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK5-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK5: omp.inner.for.cond:
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK5-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK5: omp.inner.for.body:
+// CHECK5-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK5-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK5-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK5-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+// CHECK5-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB4]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l44.omp_outlined.omp_outlined, i64 [[TMP9]], i64 [[TMP11]], ptr [[SIVAR1]])
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK5: omp.inner.for.inc:
+// CHECK5-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP13]]
+// CHECK5-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK5: omp.inner.for.end:
+// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK5: omp.loop.exit:
+// CHECK5-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]])
+// CHECK5-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK5-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 8
+// CHECK5-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l44.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK5-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK5-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK5-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK5-NEXT: ]
+// CHECK5: .omp.reduction.case1:
+// CHECK5-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK5-NEXT: [[TMP17:%.*]] = load i32, ptr [[SIVAR1]], align 4
+// CHECK5-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK5-NEXT: store i32 [[ADD3]], ptr [[TMP0]], align 4
+// CHECK5-NEXT: call void @__kmpc_end_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK5: .omp.reduction.case2:
+// CHECK5-NEXT: [[TMP18:%.*]] = load i32, ptr [[SIVAR1]], align 4
+// CHECK5-NEXT: [[TMP19:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP18]] monotonic, align 4
+// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK5: .omp.reduction.default:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l44.omp_outlined.omp_outlined
+// CHECK5-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK5-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK5-NEXT: [[SIVAR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[SIVAR2:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK5-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_ANON_0:%.*]], align 8
+// CHECK5-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// CHECK5-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK5-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK5-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK5-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK5-NEXT: store ptr [[SIVAR]], ptr [[SIVAR_ADDR]], align 8
+// CHECK5-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 8
+// CHECK5-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK5-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK5-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK5-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK5-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK5-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK5-NEXT: store i32 0, ptr [[SIVAR2]], align 4
+// CHECK5-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK5-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK5-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP4]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK5-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP5]], 1
+// CHECK5-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK5: cond.true:
+// CHECK5-NEXT: br label [[COND_END:%.*]]
+// CHECK5: cond.false:
+// CHECK5-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: br label [[COND_END]]
+// CHECK5: cond.end:
+// CHECK5-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP6]], [[COND_FALSE]] ]
+// CHECK5-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK5-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK5: omp.inner.for.cond:
+// CHECK5-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK5-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK5-NEXT: br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK5: omp.inner.for.body:
+// CHECK5-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK5-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK5-NEXT: [[TMP12:%.*]] = load i32, ptr [[SIVAR2]], align 4
+// CHECK5-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP12]], [[TMP11]]
+// CHECK5-NEXT: store i32 [[ADD4]], ptr [[SIVAR2]], align 4
+// CHECK5-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 0
+// CHECK5-NEXT: store ptr [[SIVAR2]], ptr [[TMP13]], align 8
+// CHECK5-NEXT: call void @"_ZZZ4mainENK3$_0clEvENKUlvE_clEv"(ptr noundef nonnull align 8 dereferenceable(8) [[REF_TMP]])
+// CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK5: omp.body.continue:
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK5: omp.inner.for.inc:
+// CHECK5-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP14]], 1
+// CHECK5-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_IV]], align 4
+// CHECK5-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK5: omp.inner.for.end:
+// CHECK5-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK5: omp.loop.exit:
+// CHECK5-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP4]])
+// CHECK5-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK5-NEXT: store ptr [[SIVAR2]], ptr [[TMP15]], align 8
+// CHECK5-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l44.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK5-NEXT: switch i32 [[TMP16]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK5-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK5-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK5-NEXT: ]
+// CHECK5: .omp.reduction.case1:
+// CHECK5-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK5-NEXT: [[TMP18:%.*]] = load i32, ptr [[SIVAR2]], align 4
+// CHECK5-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK5-NEXT: store i32 [[ADD6]], ptr [[TMP0]], align 4
+// CHECK5-NEXT: call void @__kmpc_end_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK5: .omp.reduction.case2:
+// CHECK5-NEXT: [[TMP19:%.*]] = load i32, ptr [[SIVAR2]], align 4
+// CHECK5-NEXT: [[TMP20:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP19]] monotonic, align 4
+// CHECK5-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK5: .omp.reduction.default:
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l44.omp_outlined.omp_outlined.omp.reduction.reduction_func
+// CHECK5-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK5-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK5-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK5-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK5-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// CHECK5-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// CHECK5-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK5-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK5-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK5-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l44.omp_outlined.omp.reduction.reduction_func
+// CHECK5-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK5-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK5-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK5-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK5-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK5-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// CHECK5-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// CHECK5-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// CHECK5-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK5-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK5-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK5-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4
+// CHECK5-NEXT: ret void
+//
+//
+// CHECK5-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK5-SAME: () #[[ATTR6:[0-9]+]] {
+// CHECK5-NEXT: entry:
+// CHECK5-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK5-NEXT: ret void
+//
diff --git a/clang/test/OpenMP/target_teams_generic_loop_uses_allocators_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_uses_allocators_codegen.cpp
new file mode 100644
index 00000000000000..64f0dced135f8b
--- /dev/null
+++ b/clang/test/OpenMP/target_teams_generic_loop_uses_allocators_codegen.cpp
@@ -0,0 +1,482 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// Test host codegen.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+enum omp_allocator_handle_t {
+ omp_null_allocator = 0,
+ omp_default_mem_alloc = 1,
+ omp_large_cap_mem_alloc = 2,
+ omp_const_mem_alloc = 3,
+ omp_high_bw_mem_alloc = 4,
+ omp_low_lat_mem_alloc = 5,
+ omp_cgroup_mem_alloc = 6,
+ omp_pteam_mem_alloc = 7,
+ omp_thread_mem_alloc = 8,
+ KMP_ALLOCATOR_MAX_HANDLE = __UINTPTR_MAX__
+};
+
+typedef enum omp_alloctrait_key_t { omp_atk_sync_hint = 1,
+ omp_atk_alignment = 2,
+ omp_atk_access = 3,
+ omp_atk_pool_size = 4,
+ omp_atk_fallback = 5,
+ omp_atk_fb_data = 6,
+ omp_atk_pinned = 7,
+ omp_atk_partition = 8
+} omp_alloctrait_key_t;
+typedef enum omp_alloctrait_value_t {
+ omp_atv_false = 0,
+ omp_atv_true = 1,
+ omp_atv_default = 2,
+ omp_atv_contended = 3,
+ omp_atv_uncontended = 4,
+ omp_atv_sequential = 5,
+ omp_atv_private = 6,
+ omp_atv_all = 7,
+ omp_atv_thread = 8,
+ omp_atv_pteam = 9,
+ omp_atv_cgroup = 10,
+ omp_atv_default_mem_fb = 11,
+ omp_atv_null_fb = 12,
+ omp_atv_abort_fb = 13,
+ omp_atv_allocator_fb = 14,
+ omp_atv_environment = 15,
+ omp_atv_nearest = 16,
+ omp_atv_blocked = 17,
+ omp_atv_interleaved = 18
+} omp_alloctrait_value_t;
+
+typedef struct omp_alloctrait_t {
+ omp_alloctrait_key_t key;
+ __UINTPTR_TYPE__ value;
+} omp_alloctrait_t;
+
+// Just map the traits variable as a firstprivate variable.
+
+void foo() {
+ omp_alloctrait_t traits[10];
+ omp_allocator_handle_t my_allocator;
+
+#pragma omp target teams loop uses_allocators(omp_null_allocator, omp_thread_mem_alloc, my_allocator(traits))
+ for (int i = 0; i < 10; ++i)
+ ;
+}
+
+
+// Destroy allocator upon exit from the region.
+
+#endif
+// CHECK-64-LABEL: define {{[^@]+}}@_Z3foov
+// CHECK-64-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-64-NEXT: entry:
+// CHECK-64-NEXT: [[TRAITS:%.*]] = alloca [10 x %struct.omp_alloctrait_t], align 8
+// CHECK-64-NEXT: [[MY_ALLOCATOR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK-64-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK-64-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
+// CHECK-64-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK-64-NEXT: store ptr [[TRAITS]], ptr [[TMP0]], align 8
+// CHECK-64-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK-64-NEXT: store ptr [[TRAITS]], ptr [[TMP1]], align 8
+// CHECK-64-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK-64-NEXT: store ptr null, ptr [[TMP2]], align 8
+// CHECK-64-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK-64-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK-64-NEXT: store i32 2, ptr [[TMP5]], align 4
+// CHECK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK-64-NEXT: store i32 1, ptr [[TMP6]], align 4
+// CHECK-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK-64-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 8
+// CHECK-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK-64-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8
+// CHECK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK-64-NEXT: store ptr @.offload_sizes, ptr [[TMP9]], align 8
+// CHECK-64-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK-64-NEXT: store ptr @.offload_maptypes, ptr [[TMP10]], align 8
+// CHECK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK-64-NEXT: store ptr null, ptr [[TMP11]], align 8
+// CHECK-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK-64-NEXT: store ptr null, ptr [[TMP12]], align 8
+// CHECK-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK-64-NEXT: store i64 10, ptr [[TMP13]], align 8
+// CHECK-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK-64-NEXT: store i64 0, ptr [[TMP14]], align 8
+// CHECK-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP15]], align 4
+// CHECK-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK-64-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4
+// CHECK-64-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK-64-NEXT: store i32 0, ptr [[TMP17]], align 4
+// CHECK-64-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l73.region_id, ptr [[KERNEL_ARGS]])
+// CHECK-64-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK-64-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK-64: omp_offload.failed:
+// CHECK-64-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l73(ptr [[TRAITS]]) #[[ATTR2:[0-9]+]]
+// CHECK-64-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK-64: omp_offload.cont:
+// CHECK-64-NEXT: ret void
+// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l73
+// CHECK-64-SAME: (ptr noundef nonnull align 8 dereferenceable(160) [[TRAITS:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-64-NEXT: entry:
+// CHECK-64-NEXT: [[TRAITS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT: [[MY_ALLOCATOR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-64-NEXT: store ptr [[TRAITS]], ptr [[TRAITS_ADDR]], align 8
+// CHECK-64-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TRAITS_ADDR]], align 8
+// CHECK-64-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8
+// CHECK-64-NEXT: [[TMP3:%.*]] = call ptr @__kmpc_init_allocator(i32 [[TMP0]], ptr null, i32 10, ptr [[TMP2]])
+// CHECK-64-NEXT: [[CONV:%.*]] = ptrtoint ptr [[TMP3]] to i64
+// CHECK-64-NEXT: store i64 [[CONV]], ptr [[MY_ALLOCATOR]], align 8
+// CHECK-64-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @.omp_outlined.)
+// CHECK-64-NEXT: [[TMP4:%.*]] = load i64, ptr [[MY_ALLOCATOR]], align 8
+// CHECK-64-NEXT: [[CONV1:%.*]] = inttoptr i64 [[TMP4]] to ptr
+// CHECK-64-NEXT: call void @__kmpc_destroy_allocator(i32 [[TMP0]], ptr [[CONV1]])
+// CHECK-64-NEXT: ret void
+// CHECK-64-LABEL: define {{[^@]+}}@.omp_outlined.
+// CHECK-64-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT: entry:
+// CHECK-64-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT: store i32 9, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-64-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-64-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64: cond.true:
+// CHECK-64-NEXT: br label [[COND_END:%.*]]
+// CHECK-64: cond.false:
+// CHECK-64-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT: br label [[COND_END]]
+// CHECK-64: cond.end:
+// CHECK-64-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-64-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64: omp.inner.for.cond:
+// CHECK-64-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-64-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64: omp.inner.for.body:
+// CHECK-64-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK-64-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-64-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK-64-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-64-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 2, ptr @.omp_outlined..1, i64 [[TMP8]], i64 [[TMP10]])
+// CHECK-64-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64: omp.inner.for.inc:
+// CHECK-64-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK-64-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-64: omp.inner.for.end:
+// CHECK-64-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64: omp.loop.exit:
+// CHECK-64-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3:[0-9]+]], i32 [[TMP1]])
+// CHECK-64-NEXT: ret void
+// CHECK-64-LABEL: define {{[^@]+}}@.omp_outlined..1
+// CHECK-64-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-64-NEXT: entry:
+// CHECK-64-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-64-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-64-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-64-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-64-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT: [[TMP0:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-64-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-64-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-64-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-64-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-64-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-64-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-64-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK-64-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-64-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
+// CHECK-64-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK-64: cond.true:
+// CHECK-64-NEXT: br label [[COND_END:%.*]]
+// CHECK-64: cond.false:
+// CHECK-64-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT: br label [[COND_END]]
+// CHECK-64: cond.end:
+// CHECK-64-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK-64-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-64-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK-64: omp.inner.for.cond:
+// CHECK-64-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-64-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-64-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK-64: omp.inner.for.body:
+// CHECK-64-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-64-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-64-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-64-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK-64: omp.body.continue:
+// CHECK-64-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK-64: omp.inner.for.inc:
+// CHECK-64-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-64-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-64-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK-64: omp.inner.for.end:
+// CHECK-64-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK-64: omp.loop.exit:
+// CHECK-64-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP3]])
+// CHECK-64-NEXT: ret void
+// CHECK-64-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK-64-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK-64-NEXT: entry:
+// CHECK-64-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK-64-NEXT: ret void
+// CHECK-LABEL: define {{[^@]+}}@_Z3foov
+// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TRAITS:%.*]] = alloca [10 x %struct.omp_alloctrait_t], align 8
+// CHECK-NEXT: [[MY_ALLOCATOR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
+// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK-NEXT: store ptr [[TRAITS]], ptr [[TMP0]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK-NEXT: store ptr [[TRAITS]], ptr [[TMP1]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK-NEXT: store ptr null, ptr [[TMP2]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK-NEXT: store i32 2, ptr [[TMP5]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK-NEXT: store i32 1, ptr [[TMP6]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 8
+// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8
+// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK-NEXT: store ptr @.offload_sizes, ptr [[TMP9]], align 8
+// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK-NEXT: store ptr @.offload_maptypes, ptr [[TMP10]], align 8
+// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK-NEXT: store ptr null, ptr [[TMP11]], align 8
+// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK-NEXT: store ptr null, ptr [[TMP12]], align 8
+// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK-NEXT: store i64 10, ptr [[TMP13]], align 8
+// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK-NEXT: store i64 0, ptr [[TMP14]], align 8
+// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP15]], align 4
+// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4
+// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK-NEXT: store i32 0, ptr [[TMP17]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l66.region_id, ptr [[KERNEL_ARGS]])
+// CHECK-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK: omp_offload.failed:
+// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l66(ptr [[TRAITS]]) #[[ATTR2:[0-9]+]]
+// CHECK-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK: omp_offload.cont:
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l66
+// CHECK-SAME: (ptr noundef nonnull align 8 dereferenceable(160) [[TRAITS:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TRAITS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[MY_ALLOCATOR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK-NEXT: store ptr [[TRAITS]], ptr [[TRAITS_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TRAITS_ADDR]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_init_allocator(i32 [[TMP0]], ptr null, i32 10, ptr [[TMP1]])
+// CHECK-NEXT: [[CONV:%.*]] = ptrtoint ptr [[TMP2]] to i64
+// CHECK-NEXT: store i64 [[CONV]], ptr [[MY_ALLOCATOR]], align 8
+// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l66.omp_outlined)
+// CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[MY_ALLOCATOR]], align 8
+// CHECK-NEXT: [[CONV1:%.*]] = inttoptr i64 [[TMP3]] to ptr
+// CHECK-NEXT: call void @__kmpc_destroy_allocator(i32 [[TMP0]], ptr [[CONV1]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l66.omp_outlined
+// CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: store i32 9, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 9
+// CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK: cond.true:
+// CHECK-NEXT: br label [[COND_END:%.*]]
+// CHECK: cond.false:
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: br label [[COND_END]]
+// CHECK: cond.end:
+// CHECK-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK: omp.inner.for.cond:
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK: omp.inner.for.body:
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l66.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]])
+// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK: omp.inner.for.inc:
+// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK: omp.inner.for.end:
+// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK: omp.loop.exit:
+// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3:[0-9]+]], i32 [[TMP1]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l66.omp_outlined.omp_outlined
+// CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
+// CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK: cond.true:
+// CHECK-NEXT: br label [[COND_END:%.*]]
+// CHECK: cond.false:
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: br label [[COND_END]]
+// CHECK: cond.end:
+// CHECK-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK: omp.inner.for.cond:
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK: omp.inner.for.body:
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK: omp.body.continue:
+// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK: omp.inner.for.inc:
+// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK: omp.inner.for.end:
+// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK: omp.loop.exit:
+// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP3]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK-NEXT: ret void
+//
diff --git a/clang/test/OpenMP/teams_generic_loop_codegen-1.cpp b/clang/test/OpenMP/teams_generic_loop_codegen-1.cpp
new file mode 100644
index 00000000000000..9ff4da4f827b79
--- /dev/null
+++ b/clang/test/OpenMP/teams_generic_loop_codegen-1.cpp
@@ -0,0 +1,3545 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+// Test host codegen.
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK3
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK3
+
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+#ifdef CK1
+
+int a[100];
+
+int teams_argument_global(int n){
+ int te = n / 128;
+ int th = 128;
+ // discard n_addr
+
+ #pragma omp target
+ #pragma omp teams loop num_teams(te), thread_limit(th)
+ for(int i = 0; i < n; i++) {
+ a[i] = 0;
+ }
+
+ #pragma omp target
+ {{{
+ #pragma omp teams loop
+ for(int i = 0; i < n; i++) {
+ a[i] = 0;
+ }
+ }}}
+
+ // outlined target regions
+
+
+
+
+ return a[0];
+}
+
+#endif // CK1
+
+// Test host codegen.
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK9
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK9
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK11
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK11
+
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+#ifdef CK2
+
+int teams_local_arg(void) {
+ int n = 100;
+ int a[n];
+
+ #pragma omp target
+ #pragma omp teams loop
+ for(int i = 0; i < n; i++) {
+ a[i] = 0;
+ }
+
+ // outlined target region
+
+
+ return a[0];
+}
+#endif // CK2
+
+// Test host codegen.
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK17
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK17
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK19
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK19
+
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK3 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK3 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK3 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK3 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+#ifdef CK3
+
+
+template <typename T, int X, long long Y>
+struct SS{
+ T a[X];
+ float b;
+ int foo(void) {
+
+ #pragma omp target
+ #pragma omp teams loop
+ for(int i = 0; i < X; i++) {
+ a[i] = (T)0;
+ }
+
+ // outlined target region
+
+
+ return a[0];
+ }
+};
+
+int teams_template_struct(void) {
+ SS<int, 123, 456> V;
+ return V.foo();
+
+}
+#endif // CK3
+
+// Test host codegen.
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK25
+// RUN: %clang_cc1 -DCK4 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK4 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK25
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK27
+// RUN: %clang_cc1 -DCK4 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK4 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK27
+
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK4 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK4 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK4 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK4 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+
+#ifdef CK4
+
+template <typename T, int n>
+int tmain(T argc) {
+ T a[n];
+ int te = n/128;
+ int th = 128;
+#pragma omp target
+#pragma omp teams loop num_teams(te) thread_limit(th)
+ for(int i = 0; i < n; i++) {
+ a[i] = (T)0;
+ }
+ return 0;
+}
+
+int main (int argc, char **argv) {
+ int n = 100;
+ int a[n];
+#pragma omp target
+#pragma omp teams loop
+ for(int i = 0; i < n; i++) {
+ a[i] = 0;
+ }
+ return tmain<int, 10>(argc);
+}
+
+
+
+
+
+
+
+#endif // CK4
+#endif
+// CHECK1-LABEL: define {{[^@]+}}@_Z21teams_argument_globali
+// CHECK1-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TH:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TE_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[TH_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [4 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [4 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [4 x ptr], align 8
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK1-NEXT: [[N_CASTED4:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS5:%.*]] = alloca [2 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_PTRS6:%.*]] = alloca [2 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS7:%.*]] = alloca [2 x ptr], align 8
+// CHECK1-NEXT: [[_TMP8:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_9:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_10:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[KERNEL_ARGS15:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
+// CHECK1-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP0]], 128
+// CHECK1-NEXT: store i32 [[DIV]], ptr [[TE]], align 4
+// CHECK1-NEXT: store i32 128, ptr [[TH]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TE]], align 4
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[TE_CASTED]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i64, ptr [[TE_CASTED]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TH]], align 4
+// CHECK1-NEXT: store i32 [[TMP3]], ptr [[TH_CASTED]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[TH_CASTED]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], ptr [[N_CASTED]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT: store i64 [[TMP2]], ptr [[TMP7]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT: store i64 [[TMP2]], ptr [[TMP8]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK1-NEXT: store ptr null, ptr [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK1-NEXT: store i64 [[TMP4]], ptr [[TMP10]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK1-NEXT: store i64 [[TMP4]], ptr [[TMP11]], align 8
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
+// CHECK1-NEXT: store ptr null, ptr [[TMP12]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK1-NEXT: store i64 [[TMP6]], ptr [[TMP13]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK1-NEXT: store i64 [[TMP6]], ptr [[TMP14]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
+// CHECK1-NEXT: store ptr null, ptr [[TMP15]], align 8
+// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 3
+// CHECK1-NEXT: store ptr @a, ptr [[TMP16]], align 8
+// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 3
+// CHECK1-NEXT: store ptr @a, ptr [[TMP17]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 3
+// CHECK1-NEXT: store ptr null, ptr [[TMP18]], align 8
+// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[TE]], align 4
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP22]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP23]], 0
+// CHECK1-NEXT: [[DIV2:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV2]], 1
+// CHECK1-NEXT: store i32 [[SUB3]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP24]], 1
+// CHECK1-NEXT: [[TMP25:%.*]] = zext i32 [[ADD]] to i64
+// CHECK1-NEXT: [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP21]], 0
+// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK1-NEXT: store i32 2, ptr [[TMP27]], align 4
+// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK1-NEXT: store i32 4, ptr [[TMP28]], align 4
+// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK1-NEXT: store ptr [[TMP19]], ptr [[TMP29]], align 8
+// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK1-NEXT: store ptr [[TMP20]], ptr [[TMP30]], align 8
+// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK1-NEXT: store ptr @.offload_sizes, ptr [[TMP31]], align 8
+// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK1-NEXT: store ptr @.offload_maptypes, ptr [[TMP32]], align 8
+// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP33]], align 8
+// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP34]], align 8
+// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK1-NEXT: store i64 [[TMP25]], ptr [[TMP35]], align 8
+// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK1-NEXT: store i64 0, ptr [[TMP36]], align 8
+// CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] [[TMP26]], ptr [[TMP37]], align 4
+// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP38]], align 4
+// CHECK1-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK1-NEXT: store i32 0, ptr [[TMP39]], align 4
+// CHECK1-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3:[0-9]+]], i64 -1, i32 [[TMP21]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l28.region_id, ptr [[KERNEL_ARGS]])
+// CHECK1-NEXT: [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0
+// CHECK1-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK1: omp_offload.failed:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l28(i64 [[TMP2]], i64 [[TMP4]], i64 [[TMP6]], ptr @a) #[[ATTR2:[0-9]+]]
+// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK1: omp_offload.cont:
+// CHECK1-NEXT: [[TMP42:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP42]], ptr [[N_CASTED4]], align 4
+// CHECK1-NEXT: [[TMP43:%.*]] = load i64, ptr [[N_CASTED4]], align 8
+// CHECK1-NEXT: [[TMP44:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
+// CHECK1-NEXT: store i64 [[TMP43]], ptr [[TMP44]], align 8
+// CHECK1-NEXT: [[TMP45:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
+// CHECK1-NEXT: store i64 [[TMP43]], ptr [[TMP45]], align 8
+// CHECK1-NEXT: [[TMP46:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS7]], i64 0, i64 0
+// CHECK1-NEXT: store ptr null, ptr [[TMP46]], align 8
+// CHECK1-NEXT: [[TMP47:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 1
+// CHECK1-NEXT: store ptr @a, ptr [[TMP47]], align 8
+// CHECK1-NEXT: [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 1
+// CHECK1-NEXT: store ptr @a, ptr [[TMP48]], align 8
+// CHECK1-NEXT: [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS7]], i64 0, i64 1
+// CHECK1-NEXT: store ptr null, ptr [[TMP49]], align 8
+// CHECK1-NEXT: [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP52:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP52]], ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK1-NEXT: [[TMP53:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK1-NEXT: [[SUB11:%.*]] = sub nsw i32 [[TMP53]], 0
+// CHECK1-NEXT: [[DIV12:%.*]] = sdiv i32 [[SUB11]], 1
+// CHECK1-NEXT: [[SUB13:%.*]] = sub nsw i32 [[DIV12]], 1
+// CHECK1-NEXT: store i32 [[SUB13]], ptr [[DOTCAPTURE_EXPR_10]], align 4
+// CHECK1-NEXT: [[TMP54:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_10]], align 4
+// CHECK1-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP54]], 1
+// CHECK1-NEXT: [[TMP55:%.*]] = zext i32 [[ADD14]] to i64
+// CHECK1-NEXT: [[TMP56:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
+// CHECK1-NEXT: store i32 2, ptr [[TMP56]], align 4
+// CHECK1-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
+// CHECK1-NEXT: store i32 2, ptr [[TMP57]], align 4
+// CHECK1-NEXT: [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
+// CHECK1-NEXT: store ptr [[TMP50]], ptr [[TMP58]], align 8
+// CHECK1-NEXT: [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 3
+// CHECK1-NEXT: store ptr [[TMP51]], ptr [[TMP59]], align 8
+// CHECK1-NEXT: [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 4
+// CHECK1-NEXT: store ptr @.offload_sizes.1, ptr [[TMP60]], align 8
+// CHECK1-NEXT: [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 5
+// CHECK1-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP61]], align 8
+// CHECK1-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP62]], align 8
+// CHECK1-NEXT: [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP63]], align 8
+// CHECK1-NEXT: [[TMP64:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 8
+// CHECK1-NEXT: store i64 [[TMP55]], ptr [[TMP64]], align 8
+// CHECK1-NEXT: [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 9
+// CHECK1-NEXT: store i64 0, ptr [[TMP65]], align 8
+// CHECK1-NEXT: [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP66]], align 4
+// CHECK1-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP67]], align 4
+// CHECK1-NEXT: [[TMP68:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 12
+// CHECK1-NEXT: store i32 0, ptr [[TMP68]], align 4
+// CHECK1-NEXT: [[TMP69:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l34.region_id, ptr [[KERNEL_ARGS15]])
+// CHECK1-NEXT: [[TMP70:%.*]] = icmp ne i32 [[TMP69]], 0
+// CHECK1-NEXT: br i1 [[TMP70]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]]
+// CHECK1: omp_offload.failed16:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l34(i64 [[TMP43]], ptr @a) #[[ATTR2]]
+// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT17]]
+// CHECK1: omp_offload.cont17:
+// CHECK1-NEXT: [[TMP71:%.*]] = load i32, ptr @a, align 4
+// CHECK1-NEXT: ret i32 [[TMP71]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l28
+// CHECK1-SAME: (i64 noundef [[TE:%.*]], i64 noundef [[TH:%.*]], i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[TE_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[TH_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
+// CHECK1-NEXT: store i64 [[TE]], ptr [[TE_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[TH]], ptr [[TH_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[TE_ADDR]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TH_ADDR]], align 4
+// CHECK1-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP2]], i32 [[TMP3]])
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l28.omp_outlined, ptr [[N_ADDR]], ptr [[TMP1]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l28.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP4]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP7]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
+// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = zext i32 [[TMP17]] to i64
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l28.omp_outlined.omp_outlined, i64 [[TMP16]], i64 [[TMP18]], ptr [[TMP0]], ptr [[TMP1]])
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP22]])
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l28.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP4]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK1-NEXT: [[TMP7:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP7]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP9]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP10]], [[TMP11]]
+// CHECK1-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP14]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]]
+// CHECK1-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP17]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I4]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[I4]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP18]] to i64
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x i32], ptr [[TMP1]], i64 0, i64 [[IDXPROM]]
+// CHECK1-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK1-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP21]])
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l34
+// CHECK1-SAME: (i64 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l34.omp_outlined, ptr [[N_ADDR]], ptr [[TMP0]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l34.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP4]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP7]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
+// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = zext i32 [[TMP17]] to i64
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l34.omp_outlined.omp_outlined, i64 [[TMP16]], i64 [[TMP18]], ptr [[TMP0]], ptr [[TMP1]])
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP22]])
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l34.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP4]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP6]] to i32
+// CHECK1-NEXT: [[TMP7:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP7]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP9]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP10]], [[TMP11]]
+// CHECK1-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP14]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]]
+// CHECK1-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP17]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I4]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[I4]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP18]] to i64
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x i32], ptr [[TMP1]], i64 0, i64 [[IDXPROM]]
+// CHECK1-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK1-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP21]])
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK1-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_Z21teams_argument_globali
+// CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TH:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TE_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TH_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [4 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [4 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [4 x ptr], align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK3-NEXT: [[N_CASTED4:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS5:%.*]] = alloca [2 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_PTRS6:%.*]] = alloca [2 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS7:%.*]] = alloca [2 x ptr], align 4
+// CHECK3-NEXT: [[_TMP8:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_9:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_10:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[KERNEL_ARGS15:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP0]], 128
+// CHECK3-NEXT: store i32 [[DIV]], ptr [[TE]], align 4
+// CHECK3-NEXT: store i32 128, ptr [[TH]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[TE]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[TE_CASTED]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[TE_CASTED]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[TH]], align 4
+// CHECK3-NEXT: store i32 [[TMP3]], ptr [[TH_CASTED]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[TH_CASTED]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], ptr [[N_CASTED]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[TMP7]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[TMP8]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK3-NEXT: store ptr null, ptr [[TMP9]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 [[TMP4]], ptr [[TMP10]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 [[TMP4]], ptr [[TMP11]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
+// CHECK3-NEXT: store ptr null, ptr [[TMP12]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK3-NEXT: store i32 [[TMP6]], ptr [[TMP13]], align 4
+// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK3-NEXT: store i32 [[TMP6]], ptr [[TMP14]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr null, ptr [[TMP15]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 3
+// CHECK3-NEXT: store ptr @a, ptr [[TMP16]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 3
+// CHECK3-NEXT: store ptr @a, ptr [[TMP17]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 3
+// CHECK3-NEXT: store ptr null, ptr [[TMP18]], align 4
+// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[TE]], align 4
+// CHECK3-NEXT: [[TMP22:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP22]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP23]], 0
+// CHECK3-NEXT: [[DIV2:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV2]], 1
+// CHECK3-NEXT: store i32 [[SUB3]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP24]], 1
+// CHECK3-NEXT: [[TMP25:%.*]] = zext i32 [[ADD]] to i64
+// CHECK3-NEXT: [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP21]], 0
+// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 2, ptr [[TMP27]], align 4
+// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 4, ptr [[TMP28]], align 4
+// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr [[TMP19]], ptr [[TMP29]], align 4
+// CHECK3-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK3-NEXT: store ptr [[TMP20]], ptr [[TMP30]], align 4
+// CHECK3-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK3-NEXT: store ptr @.offload_sizes, ptr [[TMP31]], align 4
+// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK3-NEXT: store ptr @.offload_maptypes, ptr [[TMP32]], align 4
+// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK3-NEXT: store ptr null, ptr [[TMP33]], align 4
+// CHECK3-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK3-NEXT: store ptr null, ptr [[TMP34]], align 4
+// CHECK3-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK3-NEXT: store i64 [[TMP25]], ptr [[TMP35]], align 8
+// CHECK3-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK3-NEXT: store i64 0, ptr [[TMP36]], align 8
+// CHECK3-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK3-NEXT: store [3 x i32] [[TMP26]], ptr [[TMP37]], align 4
+// CHECK3-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP38]], align 4
+// CHECK3-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK3-NEXT: store i32 0, ptr [[TMP39]], align 4
+// CHECK3-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3:[0-9]+]], i64 -1, i32 [[TMP21]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l28.region_id, ptr [[KERNEL_ARGS]])
+// CHECK3-NEXT: [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0
+// CHECK3-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK3: omp_offload.failed:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l28(i32 [[TMP2]], i32 [[TMP4]], i32 [[TMP6]], ptr @a) #[[ATTR2:[0-9]+]]
+// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK3: omp_offload.cont:
+// CHECK3-NEXT: [[TMP42:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP42]], ptr [[N_CASTED4]], align 4
+// CHECK3-NEXT: [[TMP43:%.*]] = load i32, ptr [[N_CASTED4]], align 4
+// CHECK3-NEXT: [[TMP44:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
+// CHECK3-NEXT: store i32 [[TMP43]], ptr [[TMP44]], align 4
+// CHECK3-NEXT: [[TMP45:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
+// CHECK3-NEXT: store i32 [[TMP43]], ptr [[TMP45]], align 4
+// CHECK3-NEXT: [[TMP46:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS7]], i32 0, i32 0
+// CHECK3-NEXT: store ptr null, ptr [[TMP46]], align 4
+// CHECK3-NEXT: [[TMP47:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 1
+// CHECK3-NEXT: store ptr @a, ptr [[TMP47]], align 4
+// CHECK3-NEXT: [[TMP48:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 1
+// CHECK3-NEXT: store ptr @a, ptr [[TMP48]], align 4
+// CHECK3-NEXT: [[TMP49:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_MAPPERS7]], i32 0, i32 1
+// CHECK3-NEXT: store ptr null, ptr [[TMP49]], align 4
+// CHECK3-NEXT: [[TMP50:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS5]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS6]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP52:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP52]], ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK3-NEXT: [[TMP53:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK3-NEXT: [[SUB11:%.*]] = sub nsw i32 [[TMP53]], 0
+// CHECK3-NEXT: [[DIV12:%.*]] = sdiv i32 [[SUB11]], 1
+// CHECK3-NEXT: [[SUB13:%.*]] = sub nsw i32 [[DIV12]], 1
+// CHECK3-NEXT: store i32 [[SUB13]], ptr [[DOTCAPTURE_EXPR_10]], align 4
+// CHECK3-NEXT: [[TMP54:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_10]], align 4
+// CHECK3-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP54]], 1
+// CHECK3-NEXT: [[TMP55:%.*]] = zext i32 [[ADD14]] to i64
+// CHECK3-NEXT: [[TMP56:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 0
+// CHECK3-NEXT: store i32 2, ptr [[TMP56]], align 4
+// CHECK3-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 1
+// CHECK3-NEXT: store i32 2, ptr [[TMP57]], align 4
+// CHECK3-NEXT: [[TMP58:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 2
+// CHECK3-NEXT: store ptr [[TMP50]], ptr [[TMP58]], align 4
+// CHECK3-NEXT: [[TMP59:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 3
+// CHECK3-NEXT: store ptr [[TMP51]], ptr [[TMP59]], align 4
+// CHECK3-NEXT: [[TMP60:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 4
+// CHECK3-NEXT: store ptr @.offload_sizes.1, ptr [[TMP60]], align 4
+// CHECK3-NEXT: [[TMP61:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 5
+// CHECK3-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP61]], align 4
+// CHECK3-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 6
+// CHECK3-NEXT: store ptr null, ptr [[TMP62]], align 4
+// CHECK3-NEXT: [[TMP63:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 7
+// CHECK3-NEXT: store ptr null, ptr [[TMP63]], align 4
+// CHECK3-NEXT: [[TMP64:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 8
+// CHECK3-NEXT: store i64 [[TMP55]], ptr [[TMP64]], align 8
+// CHECK3-NEXT: [[TMP65:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 9
+// CHECK3-NEXT: store i64 0, ptr [[TMP65]], align 8
+// CHECK3-NEXT: [[TMP66:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 10
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP66]], align 4
+// CHECK3-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 11
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP67]], align 4
+// CHECK3-NEXT: [[TMP68:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS15]], i32 0, i32 12
+// CHECK3-NEXT: store i32 0, ptr [[TMP68]], align 4
+// CHECK3-NEXT: [[TMP69:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l34.region_id, ptr [[KERNEL_ARGS15]])
+// CHECK3-NEXT: [[TMP70:%.*]] = icmp ne i32 [[TMP69]], 0
+// CHECK3-NEXT: br i1 [[TMP70]], label [[OMP_OFFLOAD_FAILED16:%.*]], label [[OMP_OFFLOAD_CONT17:%.*]]
+// CHECK3: omp_offload.failed16:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l34(i32 [[TMP43]], ptr @a) #[[ATTR2]]
+// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT17]]
+// CHECK3: omp_offload.cont17:
+// CHECK3-NEXT: [[TMP71:%.*]] = load i32, ptr @a, align 4
+// CHECK3-NEXT: ret i32 [[TMP71]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l28
+// CHECK3-SAME: (i32 noundef [[TE:%.*]], i32 noundef [[TH:%.*]], i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[TE_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TH_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
+// CHECK3-NEXT: store i32 [[TE]], ptr [[TE_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TH]], ptr [[TH_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[TE_ADDR]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[TH_ADDR]], align 4
+// CHECK3-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP2]], i32 [[TMP3]])
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l28.omp_outlined, ptr [[N_ADDR]], ptr [[TMP1]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l28.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK3-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP4]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK3: omp.precond.then:
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP7]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
+// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l28.omp_outlined.omp_outlined, i32 [[TMP15]], i32 [[TMP16]], ptr [[TMP0]], ptr [[TMP1]])
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP20]])
+// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.end:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l28.omp_outlined.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK3-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP4]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK3: omp.precond.then:
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP9]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP10]], [[TMP11]]
+// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP14]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]]
+// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP17]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[I3]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[I3]], align 4
+// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x i32], ptr [[TMP1]], i32 0, i32 [[TMP18]]
+// CHECK3-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK3-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP21]])
+// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.end:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l34
+// CHECK3-SAME: (i32 noundef [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l34.omp_outlined, ptr [[N_ADDR]], ptr [[TMP0]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l34.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK3-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP4]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK3: omp.precond.then:
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP7]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
+// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l34.omp_outlined.omp_outlined, i32 [[TMP15]], i32 [[TMP16]], ptr [[TMP0]], ptr [[TMP1]])
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP20]])
+// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.end:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z21teams_argument_globali_l34.omp_outlined.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[A:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP3]], 0
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK3-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP4]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK3: omp.precond.then:
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP9]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP10]], [[TMP11]]
+// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP12]], [[COND_TRUE]] ], [ [[TMP13]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP14]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP15]], [[TMP16]]
+// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP17]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[I3]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[I3]], align 4
+// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x i32], ptr [[TMP1]], i32 0, i32 [[TMP18]]
+// CHECK3-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK3-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP21]])
+// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.end:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK3-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@_Z15teams_local_argv
+// CHECK9-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[N:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[SAVED_STACK:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8
+// CHECK9-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8
+// CHECK9-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8
+// CHECK9-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [3 x i64], align 8
+// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK9-NEXT: store i32 100, ptr [[N]], align 4
+// CHECK9-NEXT: [[TMP0:%.*]] = load i32, ptr [[N]], align 4
+// CHECK9-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+// CHECK9-NEXT: [[TMP2:%.*]] = call ptr @llvm.stacksave()
+// CHECK9-NEXT: store ptr [[TMP2]], ptr [[SAVED_STACK]], align 8
+// CHECK9-NEXT: [[VLA:%.*]] = alloca i32, i64 [[TMP1]], align 4
+// CHECK9-NEXT: store i64 [[TMP1]], ptr [[__VLA_EXPR0]], align 8
+// CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[N]], align 4
+// CHECK9-NEXT: store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK9-NEXT: [[TMP4:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK9-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP1]], 4
+// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DOTOFFLOAD_SIZES]], ptr align 8 @.offload_sizes, i64 24, i1 false)
+// CHECK9-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK9-NEXT: store i64 [[TMP4]], ptr [[TMP6]], align 8
+// CHECK9-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK9-NEXT: store i64 [[TMP4]], ptr [[TMP7]], align 8
+// CHECK9-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK9-NEXT: store ptr null, ptr [[TMP8]], align 8
+// CHECK9-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK9-NEXT: store i64 [[TMP1]], ptr [[TMP9]], align 8
+// CHECK9-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK9-NEXT: store i64 [[TMP1]], ptr [[TMP10]], align 8
+// CHECK9-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
+// CHECK9-NEXT: store ptr null, ptr [[TMP11]], align 8
+// CHECK9-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK9-NEXT: store ptr [[VLA]], ptr [[TMP12]], align 8
+// CHECK9-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK9-NEXT: store ptr [[VLA]], ptr [[TMP13]], align 8
+// CHECK9-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 2
+// CHECK9-NEXT: store i64 [[TMP5]], ptr [[TMP14]], align 8
+// CHECK9-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
+// CHECK9-NEXT: store ptr null, ptr [[TMP15]], align 8
+// CHECK9-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK9-NEXT: [[TMP17:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK9-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
+// CHECK9-NEXT: [[TMP19:%.*]] = load i32, ptr [[N]], align 4
+// CHECK9-NEXT: store i32 [[TMP19]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP20]], 0
+// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK9-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK9-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
+// CHECK9-NEXT: [[TMP22:%.*]] = zext i32 [[ADD]] to i64
+// CHECK9-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK9-NEXT: store i32 2, ptr [[TMP23]], align 4
+// CHECK9-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK9-NEXT: store i32 3, ptr [[TMP24]], align 4
+// CHECK9-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK9-NEXT: store ptr [[TMP16]], ptr [[TMP25]], align 8
+// CHECK9-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK9-NEXT: store ptr [[TMP17]], ptr [[TMP26]], align 8
+// CHECK9-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK9-NEXT: store ptr [[TMP18]], ptr [[TMP27]], align 8
+// CHECK9-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK9-NEXT: store ptr @.offload_maptypes, ptr [[TMP28]], align 8
+// CHECK9-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK9-NEXT: store ptr null, ptr [[TMP29]], align 8
+// CHECK9-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK9-NEXT: store ptr null, ptr [[TMP30]], align 8
+// CHECK9-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK9-NEXT: store i64 [[TMP22]], ptr [[TMP31]], align 8
+// CHECK9-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK9-NEXT: store i64 0, ptr [[TMP32]], align 8
+// CHECK9-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK9-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP33]], align 4
+// CHECK9-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK9-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP34]], align 4
+// CHECK9-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK9-NEXT: store i32 0, ptr [[TMP35]], align 4
+// CHECK9-NEXT: [[TMP36:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l72.region_id, ptr [[KERNEL_ARGS]])
+// CHECK9-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0
+// CHECK9-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK9: omp_offload.failed:
+// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l72(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3:[0-9]+]]
+// CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK9: omp_offload.cont:
+// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i64 0
+// CHECK9-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK9-NEXT: [[TMP39:%.*]] = load ptr, ptr [[SAVED_STACK]], align 8
+// CHECK9-NEXT: call void @llvm.stackrestore(ptr [[TMP39]])
+// CHECK9-NEXT: ret i32 [[TMP38]]
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l72
+// CHECK9-SAME: (i64 noundef [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
+// CHECK9-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8
+// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l72.omp_outlined, ptr [[N_ADDR]], i64 [[TMP0]], ptr [[TMP1]])
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l72.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[N]], ptr [[N_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
+// CHECK9-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 8
+// CHECK9-NEXT: [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8
+// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK9-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK9-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK9-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK9: omp.precond.then:
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK9-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+// CHECK9-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK9: cond.true:
+// CHECK9-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT: br label [[COND_END:%.*]]
+// CHECK9: cond.false:
+// CHECK9-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: br label [[COND_END]]
+// CHECK9: cond.end:
+// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK9-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9: omp.inner.for.cond:
+// CHECK9-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK9-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9: omp.inner.for.body:
+// CHECK9-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK9-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64
+// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 5, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l72.omp_outlined.omp_outlined, i64 [[TMP17]], i64 [[TMP19]], ptr [[TMP0]], i64 [[TMP1]], ptr [[TMP2]])
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9: omp.inner.for.inc:
+// CHECK9-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK9-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK9: omp.inner.for.end:
+// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9: omp.loop.exit:
+// CHECK9-NEXT: [[TMP22:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP23]])
+// CHECK9-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK9: omp.precond.end:
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l72.omp_outlined.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[N]], ptr [[N_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
+// CHECK9-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 8
+// CHECK9-NEXT: [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8
+// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK9-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK9-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK9-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK9: omp.precond.then:
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK9-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: [[TMP7:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT: [[CONV:%.*]] = trunc i64 [[TMP7]] to i32
+// CHECK9-NEXT: [[TMP8:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP8]] to i32
+// CHECK9-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK9-NEXT: store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP10]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK9-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+// CHECK9-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK9: cond.true:
+// CHECK9-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT: br label [[COND_END:%.*]]
+// CHECK9: cond.false:
+// CHECK9-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: br label [[COND_END]]
+// CHECK9: cond.end:
+// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ [[TMP13]], [[COND_TRUE]] ], [ [[TMP14]], [[COND_FALSE]] ]
+// CHECK9-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK9-NEXT: store i32 [[TMP15]], ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9: omp.inner.for.cond:
+// CHECK9-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
+// CHECK9-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9: omp.inner.for.body:
+// CHECK9-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK9-NEXT: store i32 [[ADD]], ptr [[I4]], align 4
+// CHECK9-NEXT: [[TMP19:%.*]] = load i32, ptr [[I4]], align 4
+// CHECK9-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64
+// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[IDXPROM]]
+// CHECK9-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK9: omp.body.continue:
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9: omp.inner.for.inc:
+// CHECK9-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP20]], 1
+// CHECK9-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK9: omp.inner.for.end:
+// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9: omp.loop.exit:
+// CHECK9-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP22]])
+// CHECK9-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK9: omp.precond.end:
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK9-SAME: () #[[ATTR5:[0-9]+]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@_Z15teams_local_argv
+// CHECK11-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[N:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[SAVED_STACK:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK11-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK11-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4
+// CHECK11-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [3 x i64], align 4
+// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK11-NEXT: store i32 100, ptr [[N]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load i32, ptr [[N]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = call ptr @llvm.stacksave()
+// CHECK11-NEXT: store ptr [[TMP1]], ptr [[SAVED_STACK]], align 4
+// CHECK11-NEXT: [[VLA:%.*]] = alloca i32, i32 [[TMP0]], align 4
+// CHECK11-NEXT: store i32 [[TMP0]], ptr [[__VLA_EXPR0]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = load i32, ptr [[N]], align 4
+// CHECK11-NEXT: store i32 [[TMP2]], ptr [[N_CASTED]], align 4
+// CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK11-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP0]], 4
+// CHECK11-NEXT: [[TMP5:%.*]] = sext i32 [[TMP4]] to i64
+// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[DOTOFFLOAD_SIZES]], ptr align 4 @.offload_sizes, i32 24, i1 false)
+// CHECK11-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK11-NEXT: store i32 [[TMP3]], ptr [[TMP6]], align 4
+// CHECK11-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK11-NEXT: store i32 [[TMP3]], ptr [[TMP7]], align 4
+// CHECK11-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK11-NEXT: store ptr null, ptr [[TMP8]], align 4
+// CHECK11-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK11-NEXT: store i32 [[TMP0]], ptr [[TMP9]], align 4
+// CHECK11-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK11-NEXT: store i32 [[TMP0]], ptr [[TMP10]], align 4
+// CHECK11-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
+// CHECK11-NEXT: store ptr null, ptr [[TMP11]], align 4
+// CHECK11-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK11-NEXT: store ptr [[VLA]], ptr [[TMP12]], align 4
+// CHECK11-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK11-NEXT: store ptr [[VLA]], ptr [[TMP13]], align 4
+// CHECK11-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 2
+// CHECK11-NEXT: store i64 [[TMP5]], ptr [[TMP14]], align 4
+// CHECK11-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
+// CHECK11-NEXT: store ptr null, ptr [[TMP15]], align 4
+// CHECK11-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK11-NEXT: [[TMP17:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK11-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
+// CHECK11-NEXT: [[TMP19:%.*]] = load i32, ptr [[N]], align 4
+// CHECK11-NEXT: store i32 [[TMP19]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP20]], 0
+// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK11-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
+// CHECK11-NEXT: [[TMP22:%.*]] = zext i32 [[ADD]] to i64
+// CHECK11-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK11-NEXT: store i32 2, ptr [[TMP23]], align 4
+// CHECK11-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK11-NEXT: store i32 3, ptr [[TMP24]], align 4
+// CHECK11-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK11-NEXT: store ptr [[TMP16]], ptr [[TMP25]], align 4
+// CHECK11-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK11-NEXT: store ptr [[TMP17]], ptr [[TMP26]], align 4
+// CHECK11-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK11-NEXT: store ptr [[TMP18]], ptr [[TMP27]], align 4
+// CHECK11-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK11-NEXT: store ptr @.offload_maptypes, ptr [[TMP28]], align 4
+// CHECK11-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK11-NEXT: store ptr null, ptr [[TMP29]], align 4
+// CHECK11-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK11-NEXT: store ptr null, ptr [[TMP30]], align 4
+// CHECK11-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK11-NEXT: store i64 [[TMP22]], ptr [[TMP31]], align 8
+// CHECK11-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK11-NEXT: store i64 0, ptr [[TMP32]], align 8
+// CHECK11-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP33]], align 4
+// CHECK11-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP34]], align 4
+// CHECK11-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK11-NEXT: store i32 0, ptr [[TMP35]], align 4
+// CHECK11-NEXT: [[TMP36:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l72.region_id, ptr [[KERNEL_ARGS]])
+// CHECK11-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0
+// CHECK11-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK11: omp_offload.failed:
+// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l72(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3:[0-9]+]]
+// CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK11: omp_offload.cont:
+// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[VLA]], i32 0
+// CHECK11-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK11-NEXT: [[TMP39:%.*]] = load ptr, ptr [[SAVED_STACK]], align 4
+// CHECK11-NEXT: call void @llvm.stackrestore(ptr [[TMP39]])
+// CHECK11-NEXT: ret i32 [[TMP38]]
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l72
+// CHECK11-SAME: (i32 noundef [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[VLA]], ptr [[VLA_ADDR]], align 4
+// CHECK11-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load i32, ptr [[VLA_ADDR]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l72.omp_outlined, ptr [[N_ADDR]], i32 [[TMP0]], ptr [[TMP1]])
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l72.omp_outlined
+// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT: store ptr [[N]], ptr [[N_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[VLA]], ptr [[VLA_ADDR]], align 4
+// CHECK11-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[VLA_ADDR]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK11-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK11-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK11: omp.precond.then:
+// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK11-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK11: cond.true:
+// CHECK11-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT: br label [[COND_END:%.*]]
+// CHECK11: cond.false:
+// CHECK11-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: br label [[COND_END]]
+// CHECK11: cond.end:
+// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK11-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK11: omp.inner.for.cond:
+// CHECK11-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK11-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK11: omp.inner.for.body:
+// CHECK11-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 5, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l72.omp_outlined.omp_outlined, i32 [[TMP16]], i32 [[TMP17]], ptr [[TMP0]], i32 [[TMP1]], ptr [[TMP2]])
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK11: omp.inner.for.inc:
+// CHECK11-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK11-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK11: omp.inner.for.end:
+// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK11: omp.loop.exit:
+// CHECK11-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP21]])
+// CHECK11-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK11: omp.precond.end:
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z15teams_local_argv_l72.omp_outlined.omp_outlined
+// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK11-NEXT: store ptr [[N]], ptr [[N_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[VLA]], ptr [[VLA_ADDR]], align 4
+// CHECK11-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[VLA_ADDR]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK11-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK11-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK11: omp.precond.then:
+// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK11-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK11-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_LB]], align 4
+// CHECK11-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP10]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK11-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK11: cond.true:
+// CHECK11-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT: br label [[COND_END:%.*]]
+// CHECK11: cond.false:
+// CHECK11-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: br label [[COND_END]]
+// CHECK11: cond.end:
+// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP13]], [[COND_TRUE]] ], [ [[TMP14]], [[COND_FALSE]] ]
+// CHECK11-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK11-NEXT: store i32 [[TMP15]], ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK11: omp.inner.for.cond:
+// CHECK11-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
+// CHECK11-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK11: omp.inner.for.body:
+// CHECK11-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK11-NEXT: store i32 [[ADD]], ptr [[I3]], align 4
+// CHECK11-NEXT: [[TMP19:%.*]] = load i32, ptr [[I3]], align 4
+// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 [[TMP19]]
+// CHECK11-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK11: omp.body.continue:
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK11: omp.inner.for.inc:
+// CHECK11-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP20]], 1
+// CHECK11-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK11: omp.inner.for.end:
+// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK11: omp.loop.exit:
+// CHECK11-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP22]])
+// CHECK11-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK11: omp.precond.end:
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK11-SAME: () #[[ATTR5:[0-9]+]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK17-LABEL: define {{[^@]+}}@_Z21teams_template_structv
+// CHECK17-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK17-NEXT: entry:
+// CHECK17-NEXT: [[V:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
+// CHECK17-NEXT: [[CALL:%.*]] = call noundef signext i32 @_ZN2SSIiLi123ELx456EE3fooEv(ptr noundef nonnull align 4 dereferenceable(496) [[V]])
+// CHECK17-NEXT: ret i32 [[CALL]]
+//
+//
+// CHECK17-LABEL: define {{[^@]+}}@_ZN2SSIiLi123ELx456EE3fooEv
+// CHECK17-SAME: (ptr noundef nonnull align 4 dereferenceable(496) [[THIS:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK17-NEXT: entry:
+// CHECK17-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK17-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK17-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK17-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
+// CHECK17-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK17-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK17-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK17-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_SS:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK17-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK17-NEXT: store ptr [[THIS1]], ptr [[TMP0]], align 8
+// CHECK17-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK17-NEXT: store ptr [[A]], ptr [[TMP1]], align 8
+// CHECK17-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK17-NEXT: store ptr null, ptr [[TMP2]], align 8
+// CHECK17-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK17-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK17-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK17-NEXT: store i32 2, ptr [[TMP5]], align 4
+// CHECK17-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK17-NEXT: store i32 1, ptr [[TMP6]], align 4
+// CHECK17-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK17-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 8
+// CHECK17-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK17-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8
+// CHECK17-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK17-NEXT: store ptr @.offload_sizes, ptr [[TMP9]], align 8
+// CHECK17-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK17-NEXT: store ptr @.offload_maptypes, ptr [[TMP10]], align 8
+// CHECK17-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK17-NEXT: store ptr null, ptr [[TMP11]], align 8
+// CHECK17-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK17-NEXT: store ptr null, ptr [[TMP12]], align 8
+// CHECK17-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK17-NEXT: store i64 123, ptr [[TMP13]], align 8
+// CHECK17-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK17-NEXT: store i64 0, ptr [[TMP14]], align 8
+// CHECK17-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK17-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP15]], align 4
+// CHECK17-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK17-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4
+// CHECK17-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK17-NEXT: store i32 0, ptr [[TMP17]], align 4
+// CHECK17-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l108.region_id, ptr [[KERNEL_ARGS]])
+// CHECK17-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK17-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK17: omp_offload.failed:
+// CHECK17-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l108(ptr [[THIS1]]) #[[ATTR2:[0-9]+]]
+// CHECK17-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK17: omp_offload.cont:
+// CHECK17-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0
+// CHECK17-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [123 x i32], ptr [[A2]], i64 0, i64 0
+// CHECK17-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK17-NEXT: ret i32 [[TMP20]]
+//
+//
+// CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l108
+// CHECK17-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK17-NEXT: entry:
+// CHECK17-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK17-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK17-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK17-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l108.omp_outlined, ptr [[TMP0]])
+// CHECK17-NEXT: ret void
+//
+//
+// CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l108.omp_outlined
+// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] {
+// CHECK17-NEXT: entry:
+// CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK17-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK17-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK17-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK17-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK17-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK17-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK17-NEXT: store i32 122, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK17-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK17-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK17-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK17-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK17-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP2]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK17-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK17-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 122
+// CHECK17-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK17: cond.true:
+// CHECK17-NEXT: br label [[COND_END:%.*]]
+// CHECK17: cond.false:
+// CHECK17-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK17-NEXT: br label [[COND_END]]
+// CHECK17: cond.end:
+// CHECK17-NEXT: [[COND:%.*]] = phi i32 [ 122, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK17-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK17-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK17-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK17-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK17: omp.inner.for.cond:
+// CHECK17-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK17-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK17-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK17-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK17: omp.inner.for.body:
+// CHECK17-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK17-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK17-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK17-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+// CHECK17-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l108.omp_outlined.omp_outlined, i64 [[TMP9]], i64 [[TMP11]], ptr [[TMP0]])
+// CHECK17-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK17: omp.inner.for.inc:
+// CHECK17-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK17-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK17-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP13]]
+// CHECK17-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK17-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK17: omp.inner.for.end:
+// CHECK17-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK17: omp.loop.exit:
+// CHECK17-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]])
+// CHECK17-NEXT: ret void
+//
+//
+// CHECK17-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l108.omp_outlined.omp_outlined
+// CHECK17-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] {
+// CHECK17-NEXT: entry:
+// CHECK17-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK17-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK17-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK17-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK17-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK17-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK17-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK17-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK17-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK17-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK17-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK17-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK17-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK17-NEXT: store i32 122, ptr [[DOTOMP_UB]], align 4
+// CHECK17-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK17-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK17-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK17-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK17-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK17-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
+// CHECK17-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK17-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK17-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK17-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK17-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP4]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK17-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK17-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP5]], 122
+// CHECK17-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK17: cond.true:
+// CHECK17-NEXT: br label [[COND_END:%.*]]
+// CHECK17: cond.false:
+// CHECK17-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK17-NEXT: br label [[COND_END]]
+// CHECK17: cond.end:
+// CHECK17-NEXT: [[COND:%.*]] = phi i32 [ 122, [[COND_TRUE]] ], [ [[TMP6]], [[COND_FALSE]] ]
+// CHECK17-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK17-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK17-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV]], align 4
+// CHECK17-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK17: omp.inner.for.cond:
+// CHECK17-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK17-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK17-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK17-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK17: omp.inner.for.body:
+// CHECK17-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK17-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK17-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK17-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK17-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_SS:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK17-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK17-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP11]] to i64
+// CHECK17-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [123 x i32], ptr [[A]], i64 0, i64 [[IDXPROM]]
+// CHECK17-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+// CHECK17-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK17: omp.body.continue:
+// CHECK17-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK17: omp.inner.for.inc:
+// CHECK17-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK17-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP12]], 1
+// CHECK17-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK17-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK17: omp.inner.for.end:
+// CHECK17-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK17: omp.loop.exit:
+// CHECK17-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP4]])
+// CHECK17-NEXT: ret void
+//
+//
+// CHECK17-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK17-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK17-NEXT: entry:
+// CHECK17-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK17-NEXT: ret void
+//
+//
+// CHECK19-LABEL: define {{[^@]+}}@_Z21teams_template_structv
+// CHECK19-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK19-NEXT: entry:
+// CHECK19-NEXT: [[V:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
+// CHECK19-NEXT: [[CALL:%.*]] = call noundef i32 @_ZN2SSIiLi123ELx456EE3fooEv(ptr noundef nonnull align 4 dereferenceable(496) [[V]])
+// CHECK19-NEXT: ret i32 [[CALL]]
+//
+//
+// CHECK19-LABEL: define {{[^@]+}}@_ZN2SSIiLi123ELx456EE3fooEv
+// CHECK19-SAME: (ptr noundef nonnull align 4 dereferenceable(496) [[THIS:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK19-NEXT: entry:
+// CHECK19-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK19-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
+// CHECK19-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
+// CHECK19-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
+// CHECK19-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK19-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK19-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK19-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK19-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_SS:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK19-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK19-NEXT: store ptr [[THIS1]], ptr [[TMP0]], align 4
+// CHECK19-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK19-NEXT: store ptr [[A]], ptr [[TMP1]], align 4
+// CHECK19-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK19-NEXT: store ptr null, ptr [[TMP2]], align 4
+// CHECK19-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK19-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK19-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK19-NEXT: store i32 2, ptr [[TMP5]], align 4
+// CHECK19-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK19-NEXT: store i32 1, ptr [[TMP6]], align 4
+// CHECK19-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK19-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 4
+// CHECK19-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK19-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 4
+// CHECK19-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK19-NEXT: store ptr @.offload_sizes, ptr [[TMP9]], align 4
+// CHECK19-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK19-NEXT: store ptr @.offload_maptypes, ptr [[TMP10]], align 4
+// CHECK19-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK19-NEXT: store ptr null, ptr [[TMP11]], align 4
+// CHECK19-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK19-NEXT: store ptr null, ptr [[TMP12]], align 4
+// CHECK19-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK19-NEXT: store i64 123, ptr [[TMP13]], align 8
+// CHECK19-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK19-NEXT: store i64 0, ptr [[TMP14]], align 8
+// CHECK19-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK19-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP15]], align 4
+// CHECK19-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK19-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4
+// CHECK19-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK19-NEXT: store i32 0, ptr [[TMP17]], align 4
+// CHECK19-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l108.region_id, ptr [[KERNEL_ARGS]])
+// CHECK19-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK19-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK19: omp_offload.failed:
+// CHECK19-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l108(ptr [[THIS1]]) #[[ATTR2:[0-9]+]]
+// CHECK19-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK19: omp_offload.cont:
+// CHECK19-NEXT: [[A2:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0
+// CHECK19-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [123 x i32], ptr [[A2]], i32 0, i32 0
+// CHECK19-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK19-NEXT: ret i32 [[TMP20]]
+//
+//
+// CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l108
+// CHECK19-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK19-NEXT: entry:
+// CHECK19-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK19-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK19-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK19-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l108.omp_outlined, ptr [[TMP0]])
+// CHECK19-NEXT: ret void
+//
+//
+// CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l108.omp_outlined
+// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] {
+// CHECK19-NEXT: entry:
+// CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK19-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK19-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK19-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK19-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK19-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK19-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK19-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK19-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK19-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK19-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK19-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK19-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK19-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK19-NEXT: store i32 122, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK19-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK19-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK19-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK19-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK19-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP2]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK19-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK19-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 122
+// CHECK19-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK19: cond.true:
+// CHECK19-NEXT: br label [[COND_END:%.*]]
+// CHECK19: cond.false:
+// CHECK19-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK19-NEXT: br label [[COND_END]]
+// CHECK19: cond.end:
+// CHECK19-NEXT: [[COND:%.*]] = phi i32 [ 122, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK19-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK19-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK19-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK19-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK19: omp.inner.for.cond:
+// CHECK19-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK19-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK19-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK19-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK19: omp.inner.for.body:
+// CHECK19-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK19-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK19-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l108.omp_outlined.omp_outlined, i32 [[TMP8]], i32 [[TMP9]], ptr [[TMP0]])
+// CHECK19-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK19: omp.inner.for.inc:
+// CHECK19-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK19-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK19-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], [[TMP11]]
+// CHECK19-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK19-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK19: omp.inner.for.end:
+// CHECK19-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK19: omp.loop.exit:
+// CHECK19-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]])
+// CHECK19-NEXT: ret void
+//
+//
+// CHECK19-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l108.omp_outlined.omp_outlined
+// CHECK19-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] {
+// CHECK19-NEXT: entry:
+// CHECK19-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK19-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK19-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK19-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK19-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK19-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK19-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK19-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK19-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK19-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK19-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK19-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK19-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK19-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK19-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK19-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK19-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK19-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK19-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK19-NEXT: store i32 122, ptr [[DOTOMP_UB]], align 4
+// CHECK19-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK19-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK19-NEXT: store i32 [[TMP1]], ptr [[DOTOMP_LB]], align 4
+// CHECK19-NEXT: store i32 [[TMP2]], ptr [[DOTOMP_UB]], align 4
+// CHECK19-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK19-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK19-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK19-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK19-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP4]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK19-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK19-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP5]], 122
+// CHECK19-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK19: cond.true:
+// CHECK19-NEXT: br label [[COND_END:%.*]]
+// CHECK19: cond.false:
+// CHECK19-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK19-NEXT: br label [[COND_END]]
+// CHECK19: cond.end:
+// CHECK19-NEXT: [[COND:%.*]] = phi i32 [ 122, [[COND_TRUE]] ], [ [[TMP6]], [[COND_FALSE]] ]
+// CHECK19-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK19-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK19-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV]], align 4
+// CHECK19-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK19: omp.inner.for.cond:
+// CHECK19-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK19-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK19-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK19-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK19: omp.inner.for.body:
+// CHECK19-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK19-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK19-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK19-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK19-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_SS:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK19-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK19-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [123 x i32], ptr [[A]], i32 0, i32 [[TMP11]]
+// CHECK19-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+// CHECK19-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK19: omp.body.continue:
+// CHECK19-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK19: omp.inner.for.inc:
+// CHECK19-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK19-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP12]], 1
+// CHECK19-NEXT: store i32 [[ADD2]], ptr [[DOTOMP_IV]], align 4
+// CHECK19-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK19: omp.inner.for.end:
+// CHECK19-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK19: omp.loop.exit:
+// CHECK19-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP4]])
+// CHECK19-NEXT: ret void
+//
+//
+// CHECK19-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK19-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK19-NEXT: entry:
+// CHECK19-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK19-NEXT: ret void
+//
+//
+// CHECK25-LABEL: define {{[^@]+}}@main
+// CHECK25-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK25-NEXT: entry:
+// CHECK25-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[ARGV_ADDR:%.*]] = alloca ptr, align 8
+// CHECK25-NEXT: [[N:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[SAVED_STACK:%.*]] = alloca ptr, align 8
+// CHECK25-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8
+// CHECK25-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK25-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8
+// CHECK25-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8
+// CHECK25-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8
+// CHECK25-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [3 x i64], align 8
+// CHECK25-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK25-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK25-NEXT: store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4
+// CHECK25-NEXT: store ptr [[ARGV]], ptr [[ARGV_ADDR]], align 8
+// CHECK25-NEXT: store i32 100, ptr [[N]], align 4
+// CHECK25-NEXT: [[TMP0:%.*]] = load i32, ptr [[N]], align 4
+// CHECK25-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+// CHECK25-NEXT: [[TMP2:%.*]] = call ptr @llvm.stacksave()
+// CHECK25-NEXT: store ptr [[TMP2]], ptr [[SAVED_STACK]], align 8
+// CHECK25-NEXT: [[VLA:%.*]] = alloca i32, i64 [[TMP1]], align 4
+// CHECK25-NEXT: store i64 [[TMP1]], ptr [[__VLA_EXPR0]], align 8
+// CHECK25-NEXT: [[TMP3:%.*]] = load i32, ptr [[N]], align 4
+// CHECK25-NEXT: store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK25-NEXT: [[TMP4:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK25-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP1]], 4
+// CHECK25-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DOTOFFLOAD_SIZES]], ptr align 8 @.offload_sizes, i64 24, i1 false)
+// CHECK25-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK25-NEXT: store i64 [[TMP4]], ptr [[TMP6]], align 8
+// CHECK25-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK25-NEXT: store i64 [[TMP4]], ptr [[TMP7]], align 8
+// CHECK25-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK25-NEXT: store ptr null, ptr [[TMP8]], align 8
+// CHECK25-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK25-NEXT: store i64 [[TMP1]], ptr [[TMP9]], align 8
+// CHECK25-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK25-NEXT: store i64 [[TMP1]], ptr [[TMP10]], align 8
+// CHECK25-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
+// CHECK25-NEXT: store ptr null, ptr [[TMP11]], align 8
+// CHECK25-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK25-NEXT: store ptr [[VLA]], ptr [[TMP12]], align 8
+// CHECK25-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK25-NEXT: store ptr [[VLA]], ptr [[TMP13]], align 8
+// CHECK25-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 2
+// CHECK25-NEXT: store i64 [[TMP5]], ptr [[TMP14]], align 8
+// CHECK25-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
+// CHECK25-NEXT: store ptr null, ptr [[TMP15]], align 8
+// CHECK25-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK25-NEXT: [[TMP17:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK25-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
+// CHECK25-NEXT: [[TMP19:%.*]] = load i32, ptr [[N]], align 4
+// CHECK25-NEXT: store i32 [[TMP19]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK25-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK25-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP20]], 0
+// CHECK25-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK25-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK25-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK25-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK25-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
+// CHECK25-NEXT: [[TMP22:%.*]] = zext i32 [[ADD]] to i64
+// CHECK25-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK25-NEXT: store i32 2, ptr [[TMP23]], align 4
+// CHECK25-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK25-NEXT: store i32 3, ptr [[TMP24]], align 4
+// CHECK25-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK25-NEXT: store ptr [[TMP16]], ptr [[TMP25]], align 8
+// CHECK25-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK25-NEXT: store ptr [[TMP17]], ptr [[TMP26]], align 8
+// CHECK25-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK25-NEXT: store ptr [[TMP18]], ptr [[TMP27]], align 8
+// CHECK25-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK25-NEXT: store ptr @.offload_maptypes, ptr [[TMP28]], align 8
+// CHECK25-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK25-NEXT: store ptr null, ptr [[TMP29]], align 8
+// CHECK25-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK25-NEXT: store ptr null, ptr [[TMP30]], align 8
+// CHECK25-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK25-NEXT: store i64 [[TMP22]], ptr [[TMP31]], align 8
+// CHECK25-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK25-NEXT: store i64 0, ptr [[TMP32]], align 8
+// CHECK25-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK25-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP33]], align 4
+// CHECK25-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK25-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP34]], align 4
+// CHECK25-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK25-NEXT: store i32 0, ptr [[TMP35]], align 4
+// CHECK25-NEXT: [[TMP36:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l161.region_id, ptr [[KERNEL_ARGS]])
+// CHECK25-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0
+// CHECK25-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK25: omp_offload.failed:
+// CHECK25-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l161(i64 [[TMP4]], i64 [[TMP1]], ptr [[VLA]]) #[[ATTR3:[0-9]+]]
+// CHECK25-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK25: omp_offload.cont:
+// CHECK25-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4
+// CHECK25-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z5tmainIiLi10EEiT_(i32 noundef signext [[TMP38]])
+// CHECK25-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4
+// CHECK25-NEXT: [[TMP39:%.*]] = load ptr, ptr [[SAVED_STACK]], align 8
+// CHECK25-NEXT: call void @llvm.stackrestore(ptr [[TMP39]])
+// CHECK25-NEXT: [[TMP40:%.*]] = load i32, ptr [[RETVAL]], align 4
+// CHECK25-NEXT: ret i32 [[TMP40]]
+//
+//
+// CHECK25-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l161
+// CHECK25-SAME: (i64 noundef [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK25-NEXT: entry:
+// CHECK25-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK25-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8
+// CHECK25-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK25-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK25-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
+// CHECK25-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK25-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8
+// CHECK25-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK25-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l161.omp_outlined, ptr [[N_ADDR]], i64 [[TMP0]], ptr [[TMP1]])
+// CHECK25-NEXT: ret void
+//
+//
+// CHECK25-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l161.omp_outlined
+// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] {
+// CHECK25-NEXT: entry:
+// CHECK25-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK25-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK25-NEXT: [[N_ADDR:%.*]] = alloca ptr, align 8
+// CHECK25-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8
+// CHECK25-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK25-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK25-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK25-NEXT: store ptr [[N]], ptr [[N_ADDR]], align 8
+// CHECK25-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
+// CHECK25-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK25-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 8
+// CHECK25-NEXT: [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8
+// CHECK25-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK25-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK25-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK25-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK25-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK25-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK25-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK25-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK25-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK25-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK25-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK25-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK25: omp.precond.then:
+// CHECK25-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK25-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK25-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK25-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK25-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK25-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK25-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK25-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK25-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK25-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK25-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+// CHECK25-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK25: cond.true:
+// CHECK25-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK25-NEXT: br label [[COND_END:%.*]]
+// CHECK25: cond.false:
+// CHECK25-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK25-NEXT: br label [[COND_END]]
+// CHECK25: cond.end:
+// CHECK25-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK25-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK25-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK25-NEXT: store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4
+// CHECK25-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK25: omp.inner.for.cond:
+// CHECK25-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK25-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK25-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK25-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK25: omp.inner.for.body:
+// CHECK25-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK25-NEXT: [[TMP17:%.*]] = zext i32 [[TMP16]] to i64
+// CHECK25-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK25-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64
+// CHECK25-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 5, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l161.omp_outlined.omp_outlined, i64 [[TMP17]], i64 [[TMP19]], ptr [[TMP0]], i64 [[TMP1]], ptr [[TMP2]])
+// CHECK25-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK25: omp.inner.for.inc:
+// CHECK25-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK25-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK25-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
+// CHECK25-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK25-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK25: omp.inner.for.end:
+// CHECK25-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK25: omp.loop.exit:
+// CHECK25-NEXT: [[TMP22:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK25-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
+// CHECK25-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP23]])
+// CHECK25-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK25: omp.precond.end:
+// CHECK25-NEXT: ret void
+//
+//
+// CHECK25-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l161.omp_outlined.omp_outlined
+// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i64 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] {
+// CHECK25-NEXT: entry:
+// CHECK25-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK25-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK25-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK25-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK25-NEXT: [[N_ADDR:%.*]] = alloca ptr, align 8
+// CHECK25-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8
+// CHECK25-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK25-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK25-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK25-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK25-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK25-NEXT: store ptr [[N]], ptr [[N_ADDR]], align 8
+// CHECK25-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
+// CHECK25-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK25-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 8
+// CHECK25-NEXT: [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8
+// CHECK25-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK25-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK25-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK25-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK25-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK25-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK25-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK25-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK25-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK25-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK25-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK25-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK25: omp.precond.then:
+// CHECK25-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK25-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK25-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_UB]], align 4
+// CHECK25-NEXT: [[TMP7:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK25-NEXT: [[CONV:%.*]] = trunc i64 [[TMP7]] to i32
+// CHECK25-NEXT: [[TMP8:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK25-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP8]] to i32
+// CHECK25-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK25-NEXT: store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4
+// CHECK25-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK25-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK25-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK25-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4
+// CHECK25-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP10]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK25-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK25-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK25-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+// CHECK25-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK25: cond.true:
+// CHECK25-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK25-NEXT: br label [[COND_END:%.*]]
+// CHECK25: cond.false:
+// CHECK25-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK25-NEXT: br label [[COND_END]]
+// CHECK25: cond.end:
+// CHECK25-NEXT: [[COND:%.*]] = phi i32 [ [[TMP13]], [[COND_TRUE]] ], [ [[TMP14]], [[COND_FALSE]] ]
+// CHECK25-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK25-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK25-NEXT: store i32 [[TMP15]], ptr [[DOTOMP_IV]], align 4
+// CHECK25-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK25: omp.inner.for.cond:
+// CHECK25-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK25-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK25-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
+// CHECK25-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK25: omp.inner.for.body:
+// CHECK25-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK25-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK25-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK25-NEXT: store i32 [[ADD]], ptr [[I4]], align 4
+// CHECK25-NEXT: [[TMP19:%.*]] = load i32, ptr [[I4]], align 4
+// CHECK25-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP19]] to i64
+// CHECK25-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[IDXPROM]]
+// CHECK25-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+// CHECK25-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK25: omp.body.continue:
+// CHECK25-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK25: omp.inner.for.inc:
+// CHECK25-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK25-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP20]], 1
+// CHECK25-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4
+// CHECK25-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK25: omp.inner.for.end:
+// CHECK25-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK25: omp.loop.exit:
+// CHECK25-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK25-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
+// CHECK25-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP22]])
+// CHECK25-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK25: omp.precond.end:
+// CHECK25-NEXT: ret void
+//
+//
+// CHECK25-LABEL: define {{[^@]+}}@_Z5tmainIiLi10EEiT_
+// CHECK25-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat {
+// CHECK25-NEXT: entry:
+// CHECK25-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[A:%.*]] = alloca [10 x i32], align 4
+// CHECK25-NEXT: [[TE:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[TH:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[TE_CASTED:%.*]] = alloca i64, align 8
+// CHECK25-NEXT: [[TH_CASTED:%.*]] = alloca i64, align 8
+// CHECK25-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8
+// CHECK25-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8
+// CHECK25-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8
+// CHECK25-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK25-NEXT: store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4
+// CHECK25-NEXT: store i32 0, ptr [[TE]], align 4
+// CHECK25-NEXT: store i32 128, ptr [[TH]], align 4
+// CHECK25-NEXT: [[TMP0:%.*]] = load i32, ptr [[TE]], align 4
+// CHECK25-NEXT: store i32 [[TMP0]], ptr [[TE_CASTED]], align 4
+// CHECK25-NEXT: [[TMP1:%.*]] = load i64, ptr [[TE_CASTED]], align 8
+// CHECK25-NEXT: [[TMP2:%.*]] = load i32, ptr [[TH]], align 4
+// CHECK25-NEXT: store i32 [[TMP2]], ptr [[TH_CASTED]], align 4
+// CHECK25-NEXT: [[TMP3:%.*]] = load i64, ptr [[TH_CASTED]], align 8
+// CHECK25-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK25-NEXT: store i64 [[TMP1]], ptr [[TMP4]], align 8
+// CHECK25-NEXT: [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK25-NEXT: store i64 [[TMP1]], ptr [[TMP5]], align 8
+// CHECK25-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK25-NEXT: store ptr null, ptr [[TMP6]], align 8
+// CHECK25-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK25-NEXT: store i64 [[TMP3]], ptr [[TMP7]], align 8
+// CHECK25-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK25-NEXT: store i64 [[TMP3]], ptr [[TMP8]], align 8
+// CHECK25-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
+// CHECK25-NEXT: store ptr null, ptr [[TMP9]], align 8
+// CHECK25-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK25-NEXT: store ptr [[A]], ptr [[TMP10]], align 8
+// CHECK25-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK25-NEXT: store ptr [[A]], ptr [[TMP11]], align 8
+// CHECK25-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
+// CHECK25-NEXT: store ptr null, ptr [[TMP12]], align 8
+// CHECK25-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK25-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK25-NEXT: [[TMP15:%.*]] = load i32, ptr [[TE]], align 4
+// CHECK25-NEXT: [[TMP16:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
+// CHECK25-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK25-NEXT: store i32 2, ptr [[TMP17]], align 4
+// CHECK25-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK25-NEXT: store i32 3, ptr [[TMP18]], align 4
+// CHECK25-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK25-NEXT: store ptr [[TMP13]], ptr [[TMP19]], align 8
+// CHECK25-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK25-NEXT: store ptr [[TMP14]], ptr [[TMP20]], align 8
+// CHECK25-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK25-NEXT: store ptr @.offload_sizes.1, ptr [[TMP21]], align 8
+// CHECK25-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK25-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP22]], align 8
+// CHECK25-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK25-NEXT: store ptr null, ptr [[TMP23]], align 8
+// CHECK25-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK25-NEXT: store ptr null, ptr [[TMP24]], align 8
+// CHECK25-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK25-NEXT: store i64 10, ptr [[TMP25]], align 8
+// CHECK25-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK25-NEXT: store i64 0, ptr [[TMP26]], align 8
+// CHECK25-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK25-NEXT: store [3 x i32] [[TMP16]], ptr [[TMP27]], align 4
+// CHECK25-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK25-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP28]], align 4
+// CHECK25-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK25-NEXT: store i32 0, ptr [[TMP29]], align 4
+// CHECK25-NEXT: [[TMP30:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 [[TMP15]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l150.region_id, ptr [[KERNEL_ARGS]])
+// CHECK25-NEXT: [[TMP31:%.*]] = icmp ne i32 [[TMP30]], 0
+// CHECK25-NEXT: br i1 [[TMP31]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK25: omp_offload.failed:
+// CHECK25-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l150(i64 [[TMP1]], i64 [[TMP3]], ptr [[A]]) #[[ATTR3]]
+// CHECK25-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK25: omp_offload.cont:
+// CHECK25-NEXT: ret i32 0
+//
+//
+// CHECK25-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l150
+// CHECK25-SAME: (i64 noundef [[TE:%.*]], i64 noundef [[TH:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] {
+// CHECK25-NEXT: entry:
+// CHECK25-NEXT: [[TE_ADDR:%.*]] = alloca i64, align 8
+// CHECK25-NEXT: [[TH_ADDR:%.*]] = alloca i64, align 8
+// CHECK25-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK25-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
+// CHECK25-NEXT: store i64 [[TE]], ptr [[TE_ADDR]], align 8
+// CHECK25-NEXT: store i64 [[TH]], ptr [[TH_ADDR]], align 8
+// CHECK25-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK25-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK25-NEXT: [[TMP2:%.*]] = load i32, ptr [[TE_ADDR]], align 4
+// CHECK25-NEXT: [[TMP3:%.*]] = load i32, ptr [[TH_ADDR]], align 4
+// CHECK25-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP2]], i32 [[TMP3]])
+// CHECK25-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l150.omp_outlined, ptr [[TMP1]])
+// CHECK25-NEXT: ret void
+//
+//
+// CHECK25-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l150.omp_outlined
+// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] {
+// CHECK25-NEXT: entry:
+// CHECK25-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK25-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK25-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK25-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK25-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK25-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK25-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK25-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK25-NEXT: store i32 9, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK25-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK25-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK25-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK25-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK25-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP2]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK25-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK25-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK25-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK25: cond.true:
+// CHECK25-NEXT: br label [[COND_END:%.*]]
+// CHECK25: cond.false:
+// CHECK25-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK25-NEXT: br label [[COND_END]]
+// CHECK25: cond.end:
+// CHECK25-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK25-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK25-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK25-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK25-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK25: omp.inner.for.cond:
+// CHECK25-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK25-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK25-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK25-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK25: omp.inner.for.body:
+// CHECK25-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK25-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK25-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK25-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+// CHECK25-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l150.omp_outlined.omp_outlined, i64 [[TMP9]], i64 [[TMP11]], ptr [[TMP0]])
+// CHECK25-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK25: omp.inner.for.inc:
+// CHECK25-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK25-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK25-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP13]]
+// CHECK25-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK25-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK25: omp.inner.for.end:
+// CHECK25-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK25: omp.loop.exit:
+// CHECK25-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK25-NEXT: ret void
+//
+//
+// CHECK25-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l150.omp_outlined.omp_outlined
+// CHECK25-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] {
+// CHECK25-NEXT: entry:
+// CHECK25-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK25-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK25-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK25-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK25-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK25-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK25-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK25-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK25-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK25-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK25-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK25-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK25-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK25-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK25-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK25-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK25-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK25-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK25-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK25-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
+// CHECK25-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK25-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK25-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK25-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK25-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP4]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK25-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK25-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP5]], 9
+// CHECK25-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK25: cond.true:
+// CHECK25-NEXT: br label [[COND_END:%.*]]
+// CHECK25: cond.false:
+// CHECK25-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK25-NEXT: br label [[COND_END]]
+// CHECK25: cond.end:
+// CHECK25-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP6]], [[COND_FALSE]] ]
+// CHECK25-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK25-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK25-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV]], align 4
+// CHECK25-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK25: omp.inner.for.cond:
+// CHECK25-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK25-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK25-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK25-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK25: omp.inner.for.body:
+// CHECK25-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK25-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK25-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK25-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK25-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK25-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP11]] to i64
+// CHECK25-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK25-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+// CHECK25-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK25: omp.body.continue:
+// CHECK25-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK25: omp.inner.for.inc:
+// CHECK25-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK25-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP12]], 1
+// CHECK25-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK25-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK25: omp.inner.for.end:
+// CHECK25-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK25: omp.loop.exit:
+// CHECK25-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP4]])
+// CHECK25-NEXT: ret void
+//
+//
+// CHECK25-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK25-SAME: () #[[ATTR6:[0-9]+]] {
+// CHECK25-NEXT: entry:
+// CHECK25-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK25-NEXT: ret void
+//
+//
+// CHECK27-LABEL: define {{[^@]+}}@main
+// CHECK27-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK27-NEXT: entry:
+// CHECK27-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[ARGV_ADDR:%.*]] = alloca ptr, align 4
+// CHECK27-NEXT: [[N:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[SAVED_STACK:%.*]] = alloca ptr, align 4
+// CHECK27-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK27-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK27-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4
+// CHECK27-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [3 x i64], align 4
+// CHECK27-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK27-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK27-NEXT: store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4
+// CHECK27-NEXT: store ptr [[ARGV]], ptr [[ARGV_ADDR]], align 4
+// CHECK27-NEXT: store i32 100, ptr [[N]], align 4
+// CHECK27-NEXT: [[TMP0:%.*]] = load i32, ptr [[N]], align 4
+// CHECK27-NEXT: [[TMP1:%.*]] = call ptr @llvm.stacksave()
+// CHECK27-NEXT: store ptr [[TMP1]], ptr [[SAVED_STACK]], align 4
+// CHECK27-NEXT: [[VLA:%.*]] = alloca i32, i32 [[TMP0]], align 4
+// CHECK27-NEXT: store i32 [[TMP0]], ptr [[__VLA_EXPR0]], align 4
+// CHECK27-NEXT: [[TMP2:%.*]] = load i32, ptr [[N]], align 4
+// CHECK27-NEXT: store i32 [[TMP2]], ptr [[N_CASTED]], align 4
+// CHECK27-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK27-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP0]], 4
+// CHECK27-NEXT: [[TMP5:%.*]] = sext i32 [[TMP4]] to i64
+// CHECK27-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[DOTOFFLOAD_SIZES]], ptr align 4 @.offload_sizes, i32 24, i1 false)
+// CHECK27-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK27-NEXT: store i32 [[TMP3]], ptr [[TMP6]], align 4
+// CHECK27-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK27-NEXT: store i32 [[TMP3]], ptr [[TMP7]], align 4
+// CHECK27-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK27-NEXT: store ptr null, ptr [[TMP8]], align 4
+// CHECK27-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK27-NEXT: store i32 [[TMP0]], ptr [[TMP9]], align 4
+// CHECK27-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK27-NEXT: store i32 [[TMP0]], ptr [[TMP10]], align 4
+// CHECK27-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
+// CHECK27-NEXT: store ptr null, ptr [[TMP11]], align 4
+// CHECK27-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK27-NEXT: store ptr [[VLA]], ptr [[TMP12]], align 4
+// CHECK27-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK27-NEXT: store ptr [[VLA]], ptr [[TMP13]], align 4
+// CHECK27-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 2
+// CHECK27-NEXT: store i64 [[TMP5]], ptr [[TMP14]], align 4
+// CHECK27-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
+// CHECK27-NEXT: store ptr null, ptr [[TMP15]], align 4
+// CHECK27-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK27-NEXT: [[TMP17:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK27-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
+// CHECK27-NEXT: [[TMP19:%.*]] = load i32, ptr [[N]], align 4
+// CHECK27-NEXT: store i32 [[TMP19]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK27-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK27-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP20]], 0
+// CHECK27-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK27-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK27-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK27-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK27-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP21]], 1
+// CHECK27-NEXT: [[TMP22:%.*]] = zext i32 [[ADD]] to i64
+// CHECK27-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK27-NEXT: store i32 2, ptr [[TMP23]], align 4
+// CHECK27-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK27-NEXT: store i32 3, ptr [[TMP24]], align 4
+// CHECK27-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK27-NEXT: store ptr [[TMP16]], ptr [[TMP25]], align 4
+// CHECK27-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK27-NEXT: store ptr [[TMP17]], ptr [[TMP26]], align 4
+// CHECK27-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK27-NEXT: store ptr [[TMP18]], ptr [[TMP27]], align 4
+// CHECK27-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK27-NEXT: store ptr @.offload_maptypes, ptr [[TMP28]], align 4
+// CHECK27-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK27-NEXT: store ptr null, ptr [[TMP29]], align 4
+// CHECK27-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK27-NEXT: store ptr null, ptr [[TMP30]], align 4
+// CHECK27-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK27-NEXT: store i64 [[TMP22]], ptr [[TMP31]], align 8
+// CHECK27-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK27-NEXT: store i64 0, ptr [[TMP32]], align 8
+// CHECK27-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK27-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP33]], align 4
+// CHECK27-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK27-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP34]], align 4
+// CHECK27-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK27-NEXT: store i32 0, ptr [[TMP35]], align 4
+// CHECK27-NEXT: [[TMP36:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l161.region_id, ptr [[KERNEL_ARGS]])
+// CHECK27-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0
+// CHECK27-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK27: omp_offload.failed:
+// CHECK27-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l161(i32 [[TMP3]], i32 [[TMP0]], ptr [[VLA]]) #[[ATTR3:[0-9]+]]
+// CHECK27-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK27: omp_offload.cont:
+// CHECK27-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4
+// CHECK27-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiLi10EEiT_(i32 noundef [[TMP38]])
+// CHECK27-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4
+// CHECK27-NEXT: [[TMP39:%.*]] = load ptr, ptr [[SAVED_STACK]], align 4
+// CHECK27-NEXT: call void @llvm.stackrestore(ptr [[TMP39]])
+// CHECK27-NEXT: [[TMP40:%.*]] = load i32, ptr [[RETVAL]], align 4
+// CHECK27-NEXT: ret i32 [[TMP40]]
+//
+//
+// CHECK27-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l161
+// CHECK27-SAME: (i32 noundef [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK27-NEXT: entry:
+// CHECK27-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK27-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK27-NEXT: store i32 [[VLA]], ptr [[VLA_ADDR]], align 4
+// CHECK27-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK27-NEXT: [[TMP0:%.*]] = load i32, ptr [[VLA_ADDR]], align 4
+// CHECK27-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK27-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l161.omp_outlined, ptr [[N_ADDR]], i32 [[TMP0]], ptr [[TMP1]])
+// CHECK27-NEXT: ret void
+//
+//
+// CHECK27-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l161.omp_outlined
+// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] {
+// CHECK27-NEXT: entry:
+// CHECK27-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK27-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK27-NEXT: [[N_ADDR:%.*]] = alloca ptr, align 4
+// CHECK27-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK27-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK27-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK27-NEXT: store ptr [[N]], ptr [[N_ADDR]], align 4
+// CHECK27-NEXT: store i32 [[VLA]], ptr [[VLA_ADDR]], align 4
+// CHECK27-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK27-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 4
+// CHECK27-NEXT: [[TMP1:%.*]] = load i32, ptr [[VLA_ADDR]], align 4
+// CHECK27-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK27-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK27-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK27-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK27-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK27-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK27-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK27-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK27-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK27-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK27-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK27-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK27: omp.precond.then:
+// CHECK27-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK27-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK27-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK27-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK27-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK27-NEXT: [[TMP7:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK27-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK27-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP8]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK27-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK27-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK27-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
+// CHECK27-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK27: cond.true:
+// CHECK27-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK27-NEXT: br label [[COND_END:%.*]]
+// CHECK27: cond.false:
+// CHECK27-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK27-NEXT: br label [[COND_END]]
+// CHECK27: cond.end:
+// CHECK27-NEXT: [[COND:%.*]] = phi i32 [ [[TMP11]], [[COND_TRUE]] ], [ [[TMP12]], [[COND_FALSE]] ]
+// CHECK27-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK27-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK27-NEXT: store i32 [[TMP13]], ptr [[DOTOMP_IV]], align 4
+// CHECK27-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK27: omp.inner.for.cond:
+// CHECK27-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK27-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK27-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP14]], [[TMP15]]
+// CHECK27-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK27: omp.inner.for.body:
+// CHECK27-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK27-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK27-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 5, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l161.omp_outlined.omp_outlined, i32 [[TMP16]], i32 [[TMP17]], ptr [[TMP0]], i32 [[TMP1]], ptr [[TMP2]])
+// CHECK27-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK27: omp.inner.for.inc:
+// CHECK27-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK27-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK27-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
+// CHECK27-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK27-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK27: omp.inner.for.end:
+// CHECK27-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK27: omp.loop.exit:
+// CHECK27-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK27-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
+// CHECK27-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP21]])
+// CHECK27-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK27: omp.precond.end:
+// CHECK27-NEXT: ret void
+//
+//
+// CHECK27-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l161.omp_outlined.omp_outlined
+// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], i32 noundef [[VLA:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] {
+// CHECK27-NEXT: entry:
+// CHECK27-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK27-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK27-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[N_ADDR:%.*]] = alloca ptr, align 4
+// CHECK27-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK27-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK27-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK27-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK27-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK27-NEXT: store ptr [[N]], ptr [[N_ADDR]], align 4
+// CHECK27-NEXT: store i32 [[VLA]], ptr [[VLA_ADDR]], align 4
+// CHECK27-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK27-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 4
+// CHECK27-NEXT: [[TMP1:%.*]] = load i32, ptr [[VLA_ADDR]], align 4
+// CHECK27-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK27-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK27-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK27-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK27-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK27-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK27-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK27-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK27-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK27-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK27-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK27-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK27: omp.precond.then:
+// CHECK27-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK27-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK27-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_UB]], align 4
+// CHECK27-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK27-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK27-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_LB]], align 4
+// CHECK27-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_UB]], align 4
+// CHECK27-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK27-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK27-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK27-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4
+// CHECK27-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP10]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK27-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK27-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK27-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+// CHECK27-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK27: cond.true:
+// CHECK27-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK27-NEXT: br label [[COND_END:%.*]]
+// CHECK27: cond.false:
+// CHECK27-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK27-NEXT: br label [[COND_END]]
+// CHECK27: cond.end:
+// CHECK27-NEXT: [[COND:%.*]] = phi i32 [ [[TMP13]], [[COND_TRUE]] ], [ [[TMP14]], [[COND_FALSE]] ]
+// CHECK27-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK27-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK27-NEXT: store i32 [[TMP15]], ptr [[DOTOMP_IV]], align 4
+// CHECK27-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK27: omp.inner.for.cond:
+// CHECK27-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK27-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK27-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
+// CHECK27-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK27: omp.inner.for.body:
+// CHECK27-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK27-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK27-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK27-NEXT: store i32 [[ADD]], ptr [[I3]], align 4
+// CHECK27-NEXT: [[TMP19:%.*]] = load i32, ptr [[I3]], align 4
+// CHECK27-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 [[TMP19]]
+// CHECK27-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+// CHECK27-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK27: omp.body.continue:
+// CHECK27-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK27: omp.inner.for.inc:
+// CHECK27-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK27-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP20]], 1
+// CHECK27-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4
+// CHECK27-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK27: omp.inner.for.end:
+// CHECK27-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK27: omp.loop.exit:
+// CHECK27-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK27-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
+// CHECK27-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP22]])
+// CHECK27-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK27: omp.precond.end:
+// CHECK27-NEXT: ret void
+//
+//
+// CHECK27-LABEL: define {{[^@]+}}@_Z5tmainIiLi10EEiT_
+// CHECK27-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat {
+// CHECK27-NEXT: entry:
+// CHECK27-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[A:%.*]] = alloca [10 x i32], align 4
+// CHECK27-NEXT: [[TE:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[TH:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[TE_CASTED:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[TH_CASTED:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK27-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK27-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4
+// CHECK27-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK27-NEXT: store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4
+// CHECK27-NEXT: store i32 0, ptr [[TE]], align 4
+// CHECK27-NEXT: store i32 128, ptr [[TH]], align 4
+// CHECK27-NEXT: [[TMP0:%.*]] = load i32, ptr [[TE]], align 4
+// CHECK27-NEXT: store i32 [[TMP0]], ptr [[TE_CASTED]], align 4
+// CHECK27-NEXT: [[TMP1:%.*]] = load i32, ptr [[TE_CASTED]], align 4
+// CHECK27-NEXT: [[TMP2:%.*]] = load i32, ptr [[TH]], align 4
+// CHECK27-NEXT: store i32 [[TMP2]], ptr [[TH_CASTED]], align 4
+// CHECK27-NEXT: [[TMP3:%.*]] = load i32, ptr [[TH_CASTED]], align 4
+// CHECK27-NEXT: [[TMP4:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK27-NEXT: store i32 [[TMP1]], ptr [[TMP4]], align 4
+// CHECK27-NEXT: [[TMP5:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK27-NEXT: store i32 [[TMP1]], ptr [[TMP5]], align 4
+// CHECK27-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK27-NEXT: store ptr null, ptr [[TMP6]], align 4
+// CHECK27-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK27-NEXT: store i32 [[TMP3]], ptr [[TMP7]], align 4
+// CHECK27-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK27-NEXT: store i32 [[TMP3]], ptr [[TMP8]], align 4
+// CHECK27-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
+// CHECK27-NEXT: store ptr null, ptr [[TMP9]], align 4
+// CHECK27-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK27-NEXT: store ptr [[A]], ptr [[TMP10]], align 4
+// CHECK27-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK27-NEXT: store ptr [[A]], ptr [[TMP11]], align 4
+// CHECK27-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
+// CHECK27-NEXT: store ptr null, ptr [[TMP12]], align 4
+// CHECK27-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK27-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK27-NEXT: [[TMP15:%.*]] = load i32, ptr [[TE]], align 4
+// CHECK27-NEXT: [[TMP16:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP15]], 0
+// CHECK27-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK27-NEXT: store i32 2, ptr [[TMP17]], align 4
+// CHECK27-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK27-NEXT: store i32 3, ptr [[TMP18]], align 4
+// CHECK27-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK27-NEXT: store ptr [[TMP13]], ptr [[TMP19]], align 4
+// CHECK27-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK27-NEXT: store ptr [[TMP14]], ptr [[TMP20]], align 4
+// CHECK27-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK27-NEXT: store ptr @.offload_sizes.1, ptr [[TMP21]], align 4
+// CHECK27-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK27-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP22]], align 4
+// CHECK27-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK27-NEXT: store ptr null, ptr [[TMP23]], align 4
+// CHECK27-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK27-NEXT: store ptr null, ptr [[TMP24]], align 4
+// CHECK27-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK27-NEXT: store i64 10, ptr [[TMP25]], align 8
+// CHECK27-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK27-NEXT: store i64 0, ptr [[TMP26]], align 8
+// CHECK27-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK27-NEXT: store [3 x i32] [[TMP16]], ptr [[TMP27]], align 4
+// CHECK27-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK27-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP28]], align 4
+// CHECK27-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK27-NEXT: store i32 0, ptr [[TMP29]], align 4
+// CHECK27-NEXT: [[TMP30:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 [[TMP15]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l150.region_id, ptr [[KERNEL_ARGS]])
+// CHECK27-NEXT: [[TMP31:%.*]] = icmp ne i32 [[TMP30]], 0
+// CHECK27-NEXT: br i1 [[TMP31]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK27: omp_offload.failed:
+// CHECK27-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l150(i32 [[TMP1]], i32 [[TMP3]], ptr [[A]]) #[[ATTR3]]
+// CHECK27-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK27: omp_offload.cont:
+// CHECK27-NEXT: ret i32 0
+//
+//
+// CHECK27-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l150
+// CHECK27-SAME: (i32 noundef [[TE:%.*]], i32 noundef [[TH:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] {
+// CHECK27-NEXT: entry:
+// CHECK27-NEXT: [[TE_ADDR:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[TH_ADDR:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK27-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
+// CHECK27-NEXT: store i32 [[TE]], ptr [[TE_ADDR]], align 4
+// CHECK27-NEXT: store i32 [[TH]], ptr [[TH_ADDR]], align 4
+// CHECK27-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK27-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK27-NEXT: [[TMP2:%.*]] = load i32, ptr [[TE_ADDR]], align 4
+// CHECK27-NEXT: [[TMP3:%.*]] = load i32, ptr [[TH_ADDR]], align 4
+// CHECK27-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP2]], i32 [[TMP3]])
+// CHECK27-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l150.omp_outlined, ptr [[TMP1]])
+// CHECK27-NEXT: ret void
+//
+//
+// CHECK27-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l150.omp_outlined
+// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] {
+// CHECK27-NEXT: entry:
+// CHECK27-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK27-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK27-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK27-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK27-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK27-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK27-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK27-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK27-NEXT: store i32 9, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK27-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK27-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK27-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK27-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK27-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP2]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK27-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK27-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK27-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK27: cond.true:
+// CHECK27-NEXT: br label [[COND_END:%.*]]
+// CHECK27: cond.false:
+// CHECK27-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK27-NEXT: br label [[COND_END]]
+// CHECK27: cond.end:
+// CHECK27-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK27-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK27-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK27-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK27-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK27: omp.inner.for.cond:
+// CHECK27-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK27-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK27-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK27-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK27: omp.inner.for.body:
+// CHECK27-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK27-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK27-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l150.omp_outlined.omp_outlined, i32 [[TMP8]], i32 [[TMP9]], ptr [[TMP0]])
+// CHECK27-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK27: omp.inner.for.inc:
+// CHECK27-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK27-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK27-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], [[TMP11]]
+// CHECK27-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK27-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK27: omp.inner.for.end:
+// CHECK27-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK27: omp.loop.exit:
+// CHECK27-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK27-NEXT: ret void
+//
+//
+// CHECK27-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10EEiT__l150.omp_outlined.omp_outlined
+// CHECK27-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[A:%.*]]) #[[ATTR2]] {
+// CHECK27-NEXT: entry:
+// CHECK27-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK27-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK27-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK27-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK27-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK27-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK27-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK27-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK27-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK27-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK27-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK27-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK27-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK27-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK27-NEXT: store i32 [[TMP1]], ptr [[DOTOMP_LB]], align 4
+// CHECK27-NEXT: store i32 [[TMP2]], ptr [[DOTOMP_UB]], align 4
+// CHECK27-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK27-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK27-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK27-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK27-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP4]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK27-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK27-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP5]], 9
+// CHECK27-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK27: cond.true:
+// CHECK27-NEXT: br label [[COND_END:%.*]]
+// CHECK27: cond.false:
+// CHECK27-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK27-NEXT: br label [[COND_END]]
+// CHECK27: cond.end:
+// CHECK27-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP6]], [[COND_FALSE]] ]
+// CHECK27-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK27-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK27-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV]], align 4
+// CHECK27-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK27: omp.inner.for.cond:
+// CHECK27-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK27-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK27-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK27-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK27: omp.inner.for.body:
+// CHECK27-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK27-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK27-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK27-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK27-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK27-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i32 0, i32 [[TMP11]]
+// CHECK27-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4
+// CHECK27-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK27: omp.body.continue:
+// CHECK27-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK27: omp.inner.for.inc:
+// CHECK27-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK27-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP12]], 1
+// CHECK27-NEXT: store i32 [[ADD2]], ptr [[DOTOMP_IV]], align 4
+// CHECK27-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK27: omp.inner.for.end:
+// CHECK27-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK27: omp.loop.exit:
+// CHECK27-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP4]])
+// CHECK27-NEXT: ret void
+//
+//
+// CHECK27-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK27-SAME: () #[[ATTR6:[0-9]+]] {
+// CHECK27-NEXT: entry:
+// CHECK27-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK27-NEXT: ret void
+//
diff --git a/clang/test/OpenMP/teams_generic_loop_codegen.cpp b/clang/test/OpenMP/teams_generic_loop_codegen.cpp
new file mode 100644
index 00000000000000..2f3e70b1de5829
--- /dev/null
+++ b/clang/test/OpenMP/teams_generic_loop_codegen.cpp
@@ -0,0 +1,770 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -emit-llvm %s -o - | FileCheck %s --check-prefix=IR
+
+// Check same results after serialization round-trip
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -emit-pch -o %t %s
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR-PCH
+
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+int foo() {
+ int i;
+ int j;
+ int sum[10][10];
+
+ #pragma omp teams loop reduction(+:sum) collapse(2) bind(parallel) \
+ order(concurrent) lastprivate(j)
+ for(i=0; i<10; i++)
+ for(j=0; j<10; j++)
+ sum[i][j] += i;
+
+ return 0;
+}
+#endif
+// IR-LABEL: define {{[^@]+}}@_Z3foov
+// IR-SAME: () #[[ATTR0:[0-9]+]] {
+// IR-NEXT: entry:
+// IR-NEXT: [[I:%.*]] = alloca i32, align 4
+// IR-NEXT: [[J:%.*]] = alloca i32, align 4
+// IR-NEXT: [[SUM:%.*]] = alloca [10 x [10 x i32]], align 16
+// IR-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB4:[0-9]+]], i32 2, ptr @_Z3foov.omp_outlined, ptr [[J]], ptr [[SUM]])
+// IR-NEXT: ret i32 0
+//
+//
+// IR-LABEL: define {{[^@]+}}@_Z3foov.omp_outlined
+// IR-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
+// IR-NEXT: entry:
+// IR-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[J_ADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[SUM1:%.*]] = alloca [10 x [10 x i32]], align 16
+// IR-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// IR-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// IR-NEXT: [[_TMP2:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// IR-NEXT: [[J3:%.*]] = alloca i32, align 4
+// IR-NEXT: [[I:%.*]] = alloca i32, align 4
+// IR-NEXT: [[J4:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// IR-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// IR-NEXT: store ptr [[J]], ptr [[J_ADDR]], align 8
+// IR-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
+// IR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[J_ADDR]], align 8
+// IR-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
+// IR-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM1]], i32 0, i32 0, i32 0
+// IR-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
+// IR-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP2]]
+// IR-NEXT: br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
+// IR: omp.arrayinit.body:
+// IR-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
+// IR-NEXT: store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP2]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
+// IR: omp.arrayinit.done:
+// IR-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// IR-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
+// IR-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// IR-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// IR-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// IR-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP4]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// IR-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// IR-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP5]], 99
+// IR-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// IR: cond.true:
+// IR-NEXT: br label [[COND_END:%.*]]
+// IR: cond.false:
+// IR-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// IR-NEXT: br label [[COND_END]]
+// IR: cond.end:
+// IR-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP6]], [[COND_FALSE]] ]
+// IR-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// IR-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// IR-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV]], align 4
+// IR-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// IR: omp.inner.for.cond:
+// IR-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// IR-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// IR-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// IR: omp.inner.for.body:
+// IR-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// IR-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+// IR-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// IR-NEXT: [[TMP13:%.*]] = zext i32 [[TMP12]] to i64
+// IR-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB4]], i32 4, ptr @_Z3foov.omp_outlined.omp_outlined, i64 [[TMP11]], i64 [[TMP13]], ptr [[J3]], ptr [[SUM1]])
+// IR-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// IR: omp.inner.for.inc:
+// IR-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// IR-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP15]]
+// IR-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// IR-NEXT: br label [[OMP_INNER_FOR_COND]]
+// IR: omp.inner.for.end:
+// IR-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// IR: omp.loop.exit:
+// IR-NEXT: [[TMP16:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4
+// IR-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP17]])
+// IR-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// IR-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// IR-NEXT: br i1 [[TMP19]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// IR: .omp.lastprivate.then:
+// IR-NEXT: store i32 10, ptr [[J3]], align 4
+// IR-NEXT: [[TMP20:%.*]] = load i32, ptr [[J3]], align 4
+// IR-NEXT: store i32 [[TMP20]], ptr [[TMP0]], align 4
+// IR-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// IR: .omp.lastprivate.done:
+// IR-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// IR-NEXT: store ptr [[SUM1]], ptr [[TMP21]], align 8
+// IR-NEXT: [[TMP22:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
+// IR-NEXT: [[TMP24:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP23]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z3foov.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// IR-NEXT: switch i32 [[TMP24]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// IR-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// IR-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// IR-NEXT: ]
+// IR: .omp.reduction.case1:
+// IR-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP1]], i64 100
+// IR-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP1]], [[TMP25]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE10:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// IR: omp.arraycpy.body:
+// IR-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST6:%.*]] = phi ptr [ [[TMP1]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT8:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-NEXT: [[TMP26:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
+// IR-NEXT: [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// IR-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
+// IR-NEXT: store i32 [[ADD7]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
+// IR-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT8]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_DONE9:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT8]], [[TMP25]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_DONE9]], label [[OMP_ARRAYCPY_DONE10]], label [[OMP_ARRAYCPY_BODY]]
+// IR: omp.arraycpy.done10:
+// IR-NEXT: call void @__kmpc_end_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
+// IR-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// IR: .omp.reduction.case2:
+// IR-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[TMP1]], i64 100
+// IR-NEXT: [[OMP_ARRAYCPY_ISEMPTY11:%.*]] = icmp eq ptr [[TMP1]], [[TMP28]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY11]], label [[OMP_ARRAYCPY_DONE18:%.*]], label [[OMP_ARRAYCPY_BODY12:%.*]]
+// IR: omp.arraycpy.body12:
+// IR-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST13:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT16:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
+// IR-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST14:%.*]] = phi ptr [ [[TMP1]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
+// IR-NEXT: [[TMP29:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], align 4
+// IR-NEXT: [[TMP30:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 [[TMP29]] monotonic, align 4
+// IR-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT16]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_DONE17:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP28]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_BODY12]]
+// IR: omp.arraycpy.done18:
+// IR-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// IR: .omp.reduction.default:
+// IR-NEXT: ret void
+//
+//
+// IR-LABEL: define {{[^@]+}}@_Z3foov.omp_outlined.omp_outlined
+// IR-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
+// IR-NEXT: entry:
+// IR-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// IR-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// IR-NEXT: [[J_ADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// IR-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// IR-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// IR-NEXT: [[J3:%.*]] = alloca i32, align 4
+// IR-NEXT: [[SUM4:%.*]] = alloca [10 x [10 x i32]], align 16
+// IR-NEXT: [[I:%.*]] = alloca i32, align 4
+// IR-NEXT: [[J5:%.*]] = alloca i32, align 4
+// IR-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// IR-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// IR-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// IR-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// IR-NEXT: store ptr [[J]], ptr [[J_ADDR]], align 8
+// IR-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
+// IR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[J_ADDR]], align 8
+// IR-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
+// IR-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// IR-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4
+// IR-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// IR-NEXT: [[CONV:%.*]] = trunc i64 [[TMP2]] to i32
+// IR-NEXT: [[TMP3:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// IR-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP3]] to i32
+// IR-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// IR-NEXT: store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
+// IR-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// IR-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// IR-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i32 0, i32 0, i32 0
+// IR-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
+// IR-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP4]]
+// IR-NEXT: br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
+// IR: omp.arrayinit.body:
+// IR-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
+// IR-NEXT: store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP4]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
+// IR: omp.arrayinit.done:
+// IR-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// IR-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP6]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// IR-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// IR-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 99
+// IR-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// IR: cond.true:
+// IR-NEXT: br label [[COND_END:%.*]]
+// IR: cond.false:
+// IR-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// IR-NEXT: br label [[COND_END]]
+// IR: cond.end:
+// IR-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP8]], [[COND_FALSE]] ]
+// IR-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// IR-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// IR-NEXT: store i32 [[TMP9]], ptr [[DOTOMP_IV]], align 4
+// IR-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// IR: omp.inner.for.cond:
+// IR-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3:![0-9]+]]
+// IR-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP10]], [[TMP11]]
+// IR-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// IR: omp.inner.for.body:
+// IR-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP12]], 10
+// IR-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// IR-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// IR-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: [[DIV7:%.*]] = sdiv i32 [[TMP14]], 10
+// IR-NEXT: [[MUL8:%.*]] = mul nsw i32 [[DIV7]], 10
+// IR-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP13]], [[MUL8]]
+// IR-NEXT: [[MUL9:%.*]] = mul nsw i32 [[SUB]], 1
+// IR-NEXT: [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
+// IR-NEXT: store i32 [[ADD10]], ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: [[TMP16:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP16]] to i64
+// IR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i64 0, i64 [[IDXPROM]]
+// IR-NEXT: [[TMP17:%.*]] = load i32, ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: [[IDXPROM11:%.*]] = sext i32 [[TMP17]] to i64
+// IR-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
+// IR-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP18]], [[TMP15]]
+// IR-NEXT: store i32 [[ADD13]], ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// IR: omp.body.continue:
+// IR-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// IR: omp.inner.for.inc:
+// IR-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP19]], 1
+// IR-NEXT: store i32 [[ADD14]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]]
+// IR: omp.inner.for.end:
+// IR-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// IR: omp.loop.exit:
+// IR-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
+// IR-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP21]])
+// IR-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// IR-NEXT: store ptr [[SUM4]], ptr [[TMP22]], align 8
+// IR-NEXT: [[TMP23:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4
+// IR-NEXT: [[TMP25:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP24]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z3foov.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// IR-NEXT: switch i32 [[TMP25]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// IR-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// IR-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// IR-NEXT: ]
+// IR: .omp.reduction.case1:
+// IR-NEXT: [[TMP26:%.*]] = getelementptr i32, ptr [[TMP1]], i64 100
+// IR-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP1]], [[TMP26]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE19:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// IR: omp.arraycpy.body:
+// IR-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST15:%.*]] = phi ptr [ [[TMP1]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT17:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-NEXT: [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
+// IR-NEXT: [[TMP28:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// IR-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
+// IR-NEXT: store i32 [[ADD16]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
+// IR-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT17]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_DONE18:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT17]], [[TMP26]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_DONE19]], label [[OMP_ARRAYCPY_BODY]]
+// IR: omp.arraycpy.done19:
+// IR-NEXT: call void @__kmpc_end_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
+// IR-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// IR: .omp.reduction.case2:
+// IR-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[TMP1]], i64 100
+// IR-NEXT: [[OMP_ARRAYCPY_ISEMPTY20:%.*]] = icmp eq ptr [[TMP1]], [[TMP29]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY20]], label [[OMP_ARRAYCPY_DONE27:%.*]], label [[OMP_ARRAYCPY_BODY21:%.*]]
+// IR: omp.arraycpy.body21:
+// IR-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST22:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT25:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
+// IR-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST23:%.*]] = phi ptr [ [[TMP1]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT24:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
+// IR-NEXT: [[TMP30:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], align 4
+// IR-NEXT: [[TMP31:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 [[TMP30]] monotonic, align 4
+// IR-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT24]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT25]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_DONE26:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT24]], [[TMP29]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_DONE26]], label [[OMP_ARRAYCPY_DONE27]], label [[OMP_ARRAYCPY_BODY21]]
+// IR: omp.arraycpy.done27:
+// IR-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// IR: .omp.reduction.default:
+// IR-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// IR-NEXT: [[TMP33:%.*]] = icmp ne i32 [[TMP32]], 0
+// IR-NEXT: br i1 [[TMP33]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// IR: .omp.lastprivate.then:
+// IR-NEXT: store i32 10, ptr [[J3]], align 4
+// IR-NEXT: [[TMP34:%.*]] = load i32, ptr [[J3]], align 4
+// IR-NEXT: store i32 [[TMP34]], ptr [[TMP0]], align 4
+// IR-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// IR: .omp.lastprivate.done:
+// IR-NEXT: ret void
+//
+//
+// IR-LABEL: define {{[^@]+}}@_Z3foov.omp_outlined.omp_outlined.omp.reduction.reduction_func
+// IR-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+// IR-NEXT: entry:
+// IR-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// IR-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// IR-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// IR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// IR-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// IR-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// IR-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// IR-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// IR-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// IR-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
+// IR-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// IR: omp.arraycpy.body:
+// IR-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-NEXT: [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-NEXT: [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// IR-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// IR-NEXT: store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
+// IR: omp.arraycpy.done2:
+// IR-NEXT: ret void
+//
+//
+// IR-LABEL: define {{[^@]+}}@_Z3foov.omp_outlined.omp.reduction.reduction_func
+// IR-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
+// IR-NEXT: entry:
+// IR-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// IR-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// IR-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// IR-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// IR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// IR-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// IR-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// IR-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// IR-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// IR-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// IR-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
+// IR-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// IR: omp.arraycpy.body:
+// IR-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-NEXT: [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-NEXT: [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// IR-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// IR-NEXT: store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
+// IR-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
+// IR-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
+// IR: omp.arraycpy.done2:
+// IR-NEXT: ret void
+//
+//
+// IR-PCH-LABEL: define {{[^@]+}}@_Z3foov
+// IR-PCH-SAME: () #[[ATTR0:[0-9]+]] {
+// IR-PCH-NEXT: entry:
+// IR-PCH-NEXT: [[I:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[J:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[SUM:%.*]] = alloca [10 x [10 x i32]], align 16
+// IR-PCH-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB4:[0-9]+]], i32 2, ptr @_Z3foov.omp_outlined, ptr [[J]], ptr [[SUM]])
+// IR-PCH-NEXT: ret i32 0
+//
+//
+// IR-PCH-LABEL: define {{[^@]+}}@_Z3foov.omp_outlined
+// IR-PCH-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1:[0-9]+]] {
+// IR-PCH-NEXT: entry:
+// IR-PCH-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[J_ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[SUM1:%.*]] = alloca [10 x [10 x i32]], align 16
+// IR-PCH-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[_TMP2:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[J3:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[I:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[J4:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// IR-PCH-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// IR-PCH-NEXT: store ptr [[J]], ptr [[J_ADDR]], align 8
+// IR-PCH-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
+// IR-PCH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[J_ADDR]], align 8
+// IR-PCH-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
+// IR-PCH-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM1]], i32 0, i32 0, i32 0
+// IR-PCH-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
+// IR-PCH-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP2]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
+// IR-PCH: omp.arrayinit.body:
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
+// IR-PCH-NEXT: store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP2]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
+// IR-PCH: omp.arrayinit.done:
+// IR-PCH-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// IR-PCH-NEXT: store i32 99, ptr [[DOTOMP_COMB_UB]], align 4
+// IR-PCH-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// IR-PCH-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// IR-PCH-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// IR-PCH-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP4]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// IR-PCH-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// IR-PCH-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP5]], 99
+// IR-PCH-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// IR-PCH: cond.true:
+// IR-PCH-NEXT: br label [[COND_END:%.*]]
+// IR-PCH: cond.false:
+// IR-PCH-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// IR-PCH-NEXT: br label [[COND_END]]
+// IR-PCH: cond.end:
+// IR-PCH-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP6]], [[COND_FALSE]] ]
+// IR-PCH-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// IR-PCH-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// IR-PCH-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV]], align 4
+// IR-PCH-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// IR-PCH: omp.inner.for.cond:
+// IR-PCH-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-PCH-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// IR-PCH-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// IR-PCH-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// IR-PCH: omp.inner.for.body:
+// IR-PCH-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// IR-PCH-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+// IR-PCH-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// IR-PCH-NEXT: [[TMP13:%.*]] = zext i32 [[TMP12]] to i64
+// IR-PCH-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB4]], i32 4, ptr @_Z3foov.omp_outlined.omp_outlined, i64 [[TMP11]], i64 [[TMP13]], ptr [[J3]], ptr [[SUM1]])
+// IR-PCH-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// IR-PCH: omp.inner.for.inc:
+// IR-PCH-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// IR-PCH-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// IR-PCH-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP15]]
+// IR-PCH-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// IR-PCH-NEXT: br label [[OMP_INNER_FOR_COND]]
+// IR-PCH: omp.inner.for.end:
+// IR-PCH-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// IR-PCH: omp.loop.exit:
+// IR-PCH-NEXT: [[TMP16:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4
+// IR-PCH-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP17]])
+// IR-PCH-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// IR-PCH-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// IR-PCH-NEXT: br i1 [[TMP19]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// IR-PCH: .omp.lastprivate.then:
+// IR-PCH-NEXT: store i32 10, ptr [[J3]], align 4
+// IR-PCH-NEXT: [[TMP20:%.*]] = load i32, ptr [[J3]], align 4
+// IR-PCH-NEXT: store i32 [[TMP20]], ptr [[TMP0]], align 4
+// IR-PCH-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// IR-PCH: .omp.lastprivate.done:
+// IR-PCH-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// IR-PCH-NEXT: store ptr [[SUM1]], ptr [[TMP21]], align 8
+// IR-PCH-NEXT: [[TMP22:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
+// IR-PCH-NEXT: [[TMP24:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP23]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z3foov.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// IR-PCH-NEXT: switch i32 [[TMP24]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// IR-PCH-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// IR-PCH-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// IR-PCH-NEXT: ]
+// IR-PCH: .omp.reduction.case1:
+// IR-PCH-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP1]], i64 100
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP1]], [[TMP25]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE10:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// IR-PCH: omp.arraycpy.body:
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST6:%.*]] = phi ptr [ [[TMP1]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT8:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-PCH-NEXT: [[TMP26:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
+// IR-PCH-NEXT: [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// IR-PCH-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP26]], [[TMP27]]
+// IR-PCH-NEXT: store i32 [[ADD7]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], align 4
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT8]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST6]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DONE9:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT8]], [[TMP25]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_DONE9]], label [[OMP_ARRAYCPY_DONE10]], label [[OMP_ARRAYCPY_BODY]]
+// IR-PCH: omp.arraycpy.done10:
+// IR-PCH-NEXT: call void @__kmpc_end_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP23]], ptr @.gomp_critical_user_.reduction.var)
+// IR-PCH-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// IR-PCH: .omp.reduction.case2:
+// IR-PCH-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[TMP1]], i64 100
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_ISEMPTY11:%.*]] = icmp eq ptr [[TMP1]], [[TMP28]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY11]], label [[OMP_ARRAYCPY_DONE18:%.*]], label [[OMP_ARRAYCPY_BODY12:%.*]]
+// IR-PCH: omp.arraycpy.body12:
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST13:%.*]] = phi ptr [ [[SUM1]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT16:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST14:%.*]] = phi ptr [ [[TMP1]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY12]] ]
+// IR-PCH-NEXT: [[TMP29:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], align 4
+// IR-PCH-NEXT: [[TMP30:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 [[TMP29]] monotonic, align 4
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST14]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT16]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST13]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DONE17:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP28]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_BODY12]]
+// IR-PCH: omp.arraycpy.done18:
+// IR-PCH-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// IR-PCH: .omp.reduction.default:
+// IR-PCH-NEXT: ret void
+//
+//
+// IR-PCH-LABEL: define {{[^@]+}}@_Z3foov.omp_outlined.omp_outlined
+// IR-PCH-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[J:%.*]], ptr noundef nonnull align 4 dereferenceable(400) [[SUM:%.*]]) #[[ATTR1]] {
+// IR-PCH-NEXT: entry:
+// IR-PCH-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// IR-PCH-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// IR-PCH-NEXT: [[J_ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[J3:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[SUM4:%.*]] = alloca [10 x [10 x i32]], align 16
+// IR-PCH-NEXT: [[I:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[J5:%.*]] = alloca i32, align 4
+// IR-PCH-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// IR-PCH-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// IR-PCH-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// IR-PCH-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// IR-PCH-NEXT: store ptr [[J]], ptr [[J_ADDR]], align 8
+// IR-PCH-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR]], align 8
+// IR-PCH-NEXT: [[TMP0:%.*]] = load ptr, ptr [[J_ADDR]], align 8
+// IR-PCH-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SUM_ADDR]], align 8
+// IR-PCH-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// IR-PCH-NEXT: store i32 99, ptr [[DOTOMP_UB]], align 4
+// IR-PCH-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// IR-PCH-NEXT: [[CONV:%.*]] = trunc i64 [[TMP2]] to i32
+// IR-PCH-NEXT: [[TMP3:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// IR-PCH-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP3]] to i32
+// IR-PCH-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// IR-PCH-NEXT: store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
+// IR-PCH-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// IR-PCH-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// IR-PCH-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i32 0, i32 0, i32 0
+// IR-PCH-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[ARRAY_BEGIN]], i64 100
+// IR-PCH-NEXT: [[OMP_ARRAYINIT_ISEMPTY:%.*]] = icmp eq ptr [[ARRAY_BEGIN]], [[TMP4]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYINIT_ISEMPTY]], label [[OMP_ARRAYINIT_DONE:%.*]], label [[OMP_ARRAYINIT_BODY:%.*]]
+// IR-PCH: omp.arrayinit.body:
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYINIT_BODY]] ]
+// IR-PCH-NEXT: store i32 0, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP4]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYINIT_DONE]], label [[OMP_ARRAYINIT_BODY]]
+// IR-PCH: omp.arrayinit.done:
+// IR-PCH-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+// IR-PCH-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP6]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// IR-PCH-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// IR-PCH-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP7]], 99
+// IR-PCH-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// IR-PCH: cond.true:
+// IR-PCH-NEXT: br label [[COND_END:%.*]]
+// IR-PCH: cond.false:
+// IR-PCH-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// IR-PCH-NEXT: br label [[COND_END]]
+// IR-PCH: cond.end:
+// IR-PCH-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP8]], [[COND_FALSE]] ]
+// IR-PCH-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// IR-PCH-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// IR-PCH-NEXT: store i32 [[TMP9]], ptr [[DOTOMP_IV]], align 4
+// IR-PCH-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// IR-PCH: omp.inner.for.cond:
+// IR-PCH-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3:![0-9]+]]
+// IR-PCH-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP10]], [[TMP11]]
+// IR-PCH-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// IR-PCH: omp.inner.for.body:
+// IR-PCH-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP12]], 10
+// IR-PCH-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// IR-PCH-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// IR-PCH-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: [[DIV7:%.*]] = sdiv i32 [[TMP14]], 10
+// IR-PCH-NEXT: [[MUL8:%.*]] = mul nsw i32 [[DIV7]], 10
+// IR-PCH-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP13]], [[MUL8]]
+// IR-PCH-NEXT: [[MUL9:%.*]] = mul nsw i32 [[SUB]], 1
+// IR-PCH-NEXT: [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
+// IR-PCH-NEXT: store i32 [[ADD10]], ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: [[TMP16:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP16]] to i64
+// IR-PCH-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr [[SUM4]], i64 0, i64 [[IDXPROM]]
+// IR-PCH-NEXT: [[TMP17:%.*]] = load i32, ptr [[J3]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: [[IDXPROM11:%.*]] = sext i32 [[TMP17]] to i64
+// IR-PCH-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [10 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM11]]
+// IR-PCH-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: [[ADD13:%.*]] = add nsw i32 [[TMP18]], [[TMP15]]
+// IR-PCH-NEXT: store i32 [[ADD13]], ptr [[ARRAYIDX12]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// IR-PCH: omp.body.continue:
+// IR-PCH-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// IR-PCH: omp.inner.for.inc:
+// IR-PCH-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP19]], 1
+// IR-PCH-NEXT: store i32 [[ADD14]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP3]]
+// IR-PCH-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP4:![0-9]+]]
+// IR-PCH: omp.inner.for.end:
+// IR-PCH-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// IR-PCH: omp.loop.exit:
+// IR-PCH-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4
+// IR-PCH-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP21]])
+// IR-PCH-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// IR-PCH-NEXT: store ptr [[SUM4]], ptr [[TMP22]], align 8
+// IR-PCH-NEXT: [[TMP23:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// IR-PCH-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP23]], align 4
+// IR-PCH-NEXT: [[TMP25:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP24]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z3foov.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// IR-PCH-NEXT: switch i32 [[TMP25]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// IR-PCH-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// IR-PCH-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// IR-PCH-NEXT: ]
+// IR-PCH: .omp.reduction.case1:
+// IR-PCH-NEXT: [[TMP26:%.*]] = getelementptr i32, ptr [[TMP1]], i64 100
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP1]], [[TMP26]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE19:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// IR-PCH: omp.arraycpy.body:
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST15:%.*]] = phi ptr [ [[TMP1]], [[DOTOMP_REDUCTION_CASE1]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT17:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-PCH-NEXT: [[TMP27:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
+// IR-PCH-NEXT: [[TMP28:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// IR-PCH-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
+// IR-PCH-NEXT: store i32 [[ADD16]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], align 4
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT17]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST15]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DONE18:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT17]], [[TMP26]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_DONE18]], label [[OMP_ARRAYCPY_DONE19]], label [[OMP_ARRAYCPY_BODY]]
+// IR-PCH: omp.arraycpy.done19:
+// IR-PCH-NEXT: call void @__kmpc_end_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP24]], ptr @.gomp_critical_user_.reduction.var)
+// IR-PCH-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// IR-PCH: .omp.reduction.case2:
+// IR-PCH-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[TMP1]], i64 100
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_ISEMPTY20:%.*]] = icmp eq ptr [[TMP1]], [[TMP29]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY20]], label [[OMP_ARRAYCPY_DONE27:%.*]], label [[OMP_ARRAYCPY_BODY21:%.*]]
+// IR-PCH: omp.arraycpy.body21:
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST22:%.*]] = phi ptr [ [[SUM4]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT25:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST23:%.*]] = phi ptr [ [[TMP1]], [[DOTOMP_REDUCTION_CASE2]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT24:%.*]], [[OMP_ARRAYCPY_BODY21]] ]
+// IR-PCH-NEXT: [[TMP30:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], align 4
+// IR-PCH-NEXT: [[TMP31:%.*]] = atomicrmw add ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 [[TMP30]] monotonic, align 4
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT24]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST23]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT25]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST22]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DONE26:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT24]], [[TMP29]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_DONE26]], label [[OMP_ARRAYCPY_DONE27]], label [[OMP_ARRAYCPY_BODY21]]
+// IR-PCH: omp.arraycpy.done27:
+// IR-PCH-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// IR-PCH: .omp.reduction.default:
+// IR-PCH-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// IR-PCH-NEXT: [[TMP33:%.*]] = icmp ne i32 [[TMP32]], 0
+// IR-PCH-NEXT: br i1 [[TMP33]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]]
+// IR-PCH: .omp.lastprivate.then:
+// IR-PCH-NEXT: store i32 10, ptr [[J3]], align 4
+// IR-PCH-NEXT: [[TMP34:%.*]] = load i32, ptr [[J3]], align 4
+// IR-PCH-NEXT: store i32 [[TMP34]], ptr [[TMP0]], align 4
+// IR-PCH-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]]
+// IR-PCH: .omp.lastprivate.done:
+// IR-PCH-NEXT: ret void
+//
+//
+// IR-PCH-LABEL: define {{[^@]+}}@_Z3foov.omp_outlined.omp_outlined.omp.reduction.reduction_func
+// IR-PCH-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+// IR-PCH-NEXT: entry:
+// IR-PCH-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// IR-PCH-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// IR-PCH-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// IR-PCH-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// IR-PCH-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// IR-PCH-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// IR-PCH-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// IR-PCH-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// IR-PCH-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// IR-PCH: omp.arraycpy.body:
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-PCH-NEXT: [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-PCH-NEXT: [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// IR-PCH-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// IR-PCH-NEXT: store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
+// IR-PCH: omp.arraycpy.done2:
+// IR-PCH-NEXT: ret void
+//
+//
+// IR-PCH-LABEL: define {{[^@]+}}@_Z3foov.omp_outlined.omp.reduction.reduction_func
+// IR-PCH-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
+// IR-PCH-NEXT: entry:
+// IR-PCH-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// IR-PCH-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// IR-PCH-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// IR-PCH-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// IR-PCH-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// IR-PCH-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// IR-PCH-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// IR-PCH-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// IR-PCH-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// IR-PCH-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i64 100
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP7]], [[TMP8]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE2:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
+// IR-PCH: omp.arraycpy.body:
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[TMP5]], [[ENTRY:%.*]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST:%.*]] = phi ptr [ [[TMP7]], [[ENTRY]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
+// IR-PCH-NEXT: [[TMP9:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-PCH-NEXT: [[TMP10:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
+// IR-PCH-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// IR-PCH-NEXT: store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], align 4
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
+// IR-PCH-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP8]]
+// IR-PCH-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE2]], label [[OMP_ARRAYCPY_BODY]]
+// IR-PCH: omp.arraycpy.done2:
+// IR-PCH-NEXT: ret void
+//
diff --git a/clang/test/OpenMP/teams_generic_loop_collapse_codgen.cpp b/clang/test/OpenMP/teams_generic_loop_collapse_codgen.cpp
new file mode 100644
index 00000000000000..55eb6b28e5eea8
--- /dev/null
+++ b/clang/test/OpenMP/teams_generic_loop_collapse_codgen.cpp
@@ -0,0 +1,1894 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// Test host codegen.
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK3
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK3
+
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+#ifdef CK1
+
+template <typename T, int X, long long Y>
+struct SS{
+ T a[X][Y];
+
+ int foo(void) {
+
+ #pragma omp target
+ #pragma omp teams loop collapse(2)
+ for(int i = 0; i < X; i++) {
+ for(int j = 0; j < Y; j++) {
+ a[i][j] = (T)0;
+ }
+ }
+
+ // discard loop variables not needed here
+
+
+ return a[0][0];
+ }
+};
+
+int teams_template_struct(void) {
+ SS<int, 123, 456> V;
+ return V.foo();
+
+}
+#endif // CK1
+
+// Test host codegen.
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK9
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK9
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK11
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK11
+
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+#ifdef CK2
+
+template <typename T, int n, int m>
+int tmain(T argc) {
+ T a[n][m];
+ #pragma omp target
+ #pragma omp teams loop collapse(2)
+ for(int i = 0; i < n; i++) {
+ for(int j = 0; j < m; j++) {
+ a[i][j] = (T)0;
+ }
+ }
+ return 0;
+}
+
+int main (int argc, char **argv) {
+ int n = 100;
+ int m = 2;
+ int a[n][m];
+ #pragma omp target
+ #pragma omp teams loop collapse(2)
+ for(int i = 0; i < n; i++) {
+ for(int j = 0; j < m; j++) {
+ a[i][j] = 0;
+ }
+ }
+ return tmain<int, 10, 2>(argc);
+}
+
+
+
+
+
+
+
+
+// discard loop variables not needed here
+
+
+#endif // CK2
+#endif // #ifndef HEADER
+// CHECK1-LABEL: define {{[^@]+}}@_Z21teams_template_structv
+// CHECK1-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[V:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
+// CHECK1-NEXT: [[CALL:%.*]] = call noundef signext i32 @_ZN2SSIiLi123ELx456EE3fooEv(ptr noundef nonnull align 4 dereferenceable(224352) [[V]])
+// CHECK1-NEXT: ret i32 [[CALL]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN2SSIiLi123ELx456EE3fooEv
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(224352) [[THIS:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_SS:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT: store ptr [[THIS1]], ptr [[TMP0]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT: store ptr [[A]], ptr [[TMP1]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK1-NEXT: store ptr null, ptr [[TMP2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK1-NEXT: store i32 2, ptr [[TMP5]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK1-NEXT: store i32 1, ptr [[TMP6]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK1-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK1-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK1-NEXT: store ptr @.offload_sizes, ptr [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK1-NEXT: store ptr @.offload_maptypes, ptr [[TMP10]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP11]], align 8
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP12]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK1-NEXT: store i64 56088, ptr [[TMP13]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK1-NEXT: store i64 0, ptr [[TMP14]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP15]], align 4
+// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK1-NEXT: store i32 0, ptr [[TMP17]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.region_id, ptr [[KERNEL_ARGS]])
+// CHECK1-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK1-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK1: omp_offload.failed:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR2:[0-9]+]]
+// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK1: omp_offload.cont:
+// CHECK1-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [123 x [456 x i32]], ptr [[A3]], i64 0, i64 0
+// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [456 x i32], ptr [[ARRAYIDX]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
+// CHECK1-NEXT: ret i32 [[TMP20]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28
+// CHECK1-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined, ptr [[TMP0]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 56087, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP2]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 56087
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 56087, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined.omp_outlined, i64 [[TMP9]], i64 [[TMP11]], ptr [[TMP0]])
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP13]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 56087, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP4]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP5]], 56087
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 56087, [[COND_TRUE]] ], [ [[TMP6]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK1-NEXT: br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP10]], 456
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[DIV4:%.*]] = sdiv i32 [[TMP12]], 456
+// CHECK1-NEXT: [[MUL5:%.*]] = mul nsw i32 [[DIV4]], 456
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP11]], [[MUL5]]
+// CHECK1-NEXT: [[MUL6:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 0, [[MUL6]]
+// CHECK1-NEXT: store i32 [[ADD7]], ptr [[J]], align 4
+// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_SS:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [123 x [456 x i32]], ptr [[A]], i64 0, i64 [[IDXPROM]]
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP14]] to i64
+// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [456 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM8]]
+// CHECK1-NEXT: store i32 0, ptr [[ARRAYIDX9]], align 4
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK1-NEXT: store i32 [[ADD10]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP4]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK1-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_Z21teams_template_structv
+// CHECK3-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[V:%.*]] = alloca [[STRUCT_SS:%.*]], align 4
+// CHECK3-NEXT: [[CALL:%.*]] = call noundef i32 @_ZN2SSIiLi123ELx456EE3fooEv(ptr noundef nonnull align 4 dereferenceable(224352) [[V]])
+// CHECK3-NEXT: ret i32 [[CALL]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN2SSIiLi123ELx456EE3fooEv
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(224352) [[THIS:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP2:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_SS:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: store ptr [[THIS1]], ptr [[TMP0]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: store ptr [[A]], ptr [[TMP1]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK3-NEXT: store ptr null, ptr [[TMP2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 2, ptr [[TMP5]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 1, ptr [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK3-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK3-NEXT: store ptr @.offload_sizes, ptr [[TMP9]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK3-NEXT: store ptr @.offload_maptypes, ptr [[TMP10]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK3-NEXT: store ptr null, ptr [[TMP11]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK3-NEXT: store ptr null, ptr [[TMP12]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK3-NEXT: store i64 56088, ptr [[TMP13]], align 8
+// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK3-NEXT: store i64 0, ptr [[TMP14]], align 8
+// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP15]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK3-NEXT: store i32 0, ptr [[TMP17]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.region_id, ptr [[KERNEL_ARGS]])
+// CHECK3-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK3-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK3: omp_offload.failed:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28(ptr [[THIS1]]) #[[ATTR2:[0-9]+]]
+// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK3: omp_offload.cont:
+// CHECK3-NEXT: [[A3:%.*]] = getelementptr inbounds [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [123 x [456 x i32]], ptr [[A3]], i32 0, i32 0
+// CHECK3-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [456 x i32], ptr [[ARRAYIDX]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
+// CHECK3-NEXT: ret i32 [[TMP20]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28
+// CHECK3-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined, ptr [[TMP0]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 56087, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP2]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 56087
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 56087, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK3-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined.omp_outlined, i32 [[TMP8]], i32 [[TMP9]], ptr [[TMP0]])
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], [[TMP11]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2SSIiLi123ELx456EE3fooEv_l28.omp_outlined.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 56087, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP4]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP5]], 56087
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 56087, [[COND_TRUE]] ], [ [[TMP6]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK3-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP10]], 456
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[DIV3:%.*]] = sdiv i32 [[TMP12]], 456
+// CHECK3-NEXT: [[MUL4:%.*]] = mul nsw i32 [[DIV3]], 456
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP11]], [[MUL4]]
+// CHECK3-NEXT: [[MUL5:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 0, [[MUL5]]
+// CHECK3-NEXT: store i32 [[ADD6]], ptr [[J]], align 4
+// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_SS:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4
+// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [123 x [456 x i32]], ptr [[A]], i32 0, i32 [[TMP13]]
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[J]], align 4
+// CHECK3-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [456 x i32], ptr [[ARRAYIDX]], i32 0, i32 [[TMP14]]
+// CHECK3-NEXT: store i32 0, ptr [[ARRAYIDX7]], align 4
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK3-NEXT: store i32 [[ADD8]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP4]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK3-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@main
+// CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[ARGV_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[N:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[M:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[SAVED_STACK:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[__VLA_EXPR0:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[__VLA_EXPR1:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[M_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [5 x ptr], align 8
+// CHECK9-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [5 x ptr], align 8
+// CHECK9-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [5 x ptr], align 8
+// CHECK9-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [5 x i64], align 8
+// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK9-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK9-NEXT: store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4
+// CHECK9-NEXT: store ptr [[ARGV]], ptr [[ARGV_ADDR]], align 8
+// CHECK9-NEXT: store i32 100, ptr [[N]], align 4
+// CHECK9-NEXT: store i32 2, ptr [[M]], align 4
+// CHECK9-NEXT: [[TMP0:%.*]] = load i32, ptr [[N]], align 4
+// CHECK9-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+// CHECK9-NEXT: [[TMP2:%.*]] = load i32, ptr [[M]], align 4
+// CHECK9-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
+// CHECK9-NEXT: [[TMP4:%.*]] = call ptr @llvm.stacksave()
+// CHECK9-NEXT: store ptr [[TMP4]], ptr [[SAVED_STACK]], align 8
+// CHECK9-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP1]], [[TMP3]]
+// CHECK9-NEXT: [[VLA:%.*]] = alloca i32, i64 [[TMP5]], align 4
+// CHECK9-NEXT: store i64 [[TMP1]], ptr [[__VLA_EXPR0]], align 8
+// CHECK9-NEXT: store i64 [[TMP3]], ptr [[__VLA_EXPR1]], align 8
+// CHECK9-NEXT: [[TMP6:%.*]] = load i32, ptr [[N]], align 4
+// CHECK9-NEXT: store i32 [[TMP6]], ptr [[N_CASTED]], align 4
+// CHECK9-NEXT: [[TMP7:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[M]], align 4
+// CHECK9-NEXT: store i32 [[TMP8]], ptr [[M_CASTED]], align 4
+// CHECK9-NEXT: [[TMP9:%.*]] = load i64, ptr [[M_CASTED]], align 8
+// CHECK9-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP1]], [[TMP3]]
+// CHECK9-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4
+// CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DOTOFFLOAD_SIZES]], ptr align 8 @.offload_sizes, i64 40, i1 false)
+// CHECK9-NEXT: [[TMP12:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK9-NEXT: store i64 [[TMP7]], ptr [[TMP12]], align 8
+// CHECK9-NEXT: [[TMP13:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK9-NEXT: store i64 [[TMP7]], ptr [[TMP13]], align 8
+// CHECK9-NEXT: [[TMP14:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK9-NEXT: store ptr null, ptr [[TMP14]], align 8
+// CHECK9-NEXT: [[TMP15:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK9-NEXT: store i64 [[TMP9]], ptr [[TMP15]], align 8
+// CHECK9-NEXT: [[TMP16:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK9-NEXT: store i64 [[TMP9]], ptr [[TMP16]], align 8
+// CHECK9-NEXT: [[TMP17:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
+// CHECK9-NEXT: store ptr null, ptr [[TMP17]], align 8
+// CHECK9-NEXT: [[TMP18:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK9-NEXT: store i64 [[TMP1]], ptr [[TMP18]], align 8
+// CHECK9-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK9-NEXT: store i64 [[TMP1]], ptr [[TMP19]], align 8
+// CHECK9-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
+// CHECK9-NEXT: store ptr null, ptr [[TMP20]], align 8
+// CHECK9-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 3
+// CHECK9-NEXT: store i64 [[TMP3]], ptr [[TMP21]], align 8
+// CHECK9-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 3
+// CHECK9-NEXT: store i64 [[TMP3]], ptr [[TMP22]], align 8
+// CHECK9-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 3
+// CHECK9-NEXT: store ptr null, ptr [[TMP23]], align 8
+// CHECK9-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 4
+// CHECK9-NEXT: store ptr [[VLA]], ptr [[TMP24]], align 8
+// CHECK9-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 4
+// CHECK9-NEXT: store ptr [[VLA]], ptr [[TMP25]], align 8
+// CHECK9-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 4
+// CHECK9-NEXT: store i64 [[TMP11]], ptr [[TMP26]], align 8
+// CHECK9-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 4
+// CHECK9-NEXT: store ptr null, ptr [[TMP27]], align 8
+// CHECK9-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK9-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK9-NEXT: [[TMP30:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
+// CHECK9-NEXT: [[TMP31:%.*]] = load i32, ptr [[N]], align 4
+// CHECK9-NEXT: store i32 [[TMP31]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[TMP32:%.*]] = load i32, ptr [[M]], align 4
+// CHECK9-NEXT: store i32 [[TMP32]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK9-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP33]], 0
+// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK9-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK9-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK9-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP34]], 0
+// CHECK9-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK9-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK9-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK9-NEXT: store i64 [[SUB7]], ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK9-NEXT: [[TMP35:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP35]], 1
+// CHECK9-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK9-NEXT: store i32 2, ptr [[TMP36]], align 4
+// CHECK9-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK9-NEXT: store i32 5, ptr [[TMP37]], align 4
+// CHECK9-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK9-NEXT: store ptr [[TMP28]], ptr [[TMP38]], align 8
+// CHECK9-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK9-NEXT: store ptr [[TMP29]], ptr [[TMP39]], align 8
+// CHECK9-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK9-NEXT: store ptr [[TMP30]], ptr [[TMP40]], align 8
+// CHECK9-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK9-NEXT: store ptr @.offload_maptypes, ptr [[TMP41]], align 8
+// CHECK9-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK9-NEXT: store ptr null, ptr [[TMP42]], align 8
+// CHECK9-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK9-NEXT: store ptr null, ptr [[TMP43]], align 8
+// CHECK9-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK9-NEXT: store i64 [[ADD]], ptr [[TMP44]], align 8
+// CHECK9-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK9-NEXT: store i64 0, ptr [[TMP45]], align 8
+// CHECK9-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK9-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP46]], align 4
+// CHECK9-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK9-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP47]], align 4
+// CHECK9-NEXT: [[TMP48:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK9-NEXT: store i32 0, ptr [[TMP48]], align 4
+// CHECK9-NEXT: [[TMP49:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83.region_id, ptr [[KERNEL_ARGS]])
+// CHECK9-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0
+// CHECK9-NEXT: br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK9: omp_offload.failed:
+// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83(i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP1]], i64 [[TMP3]], ptr [[VLA]]) #[[ATTR3:[0-9]+]]
+// CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK9: omp_offload.cont:
+// CHECK9-NEXT: [[TMP51:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4
+// CHECK9-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z5tmainIiLi10ELi2EEiT_(i32 noundef signext [[TMP51]])
+// CHECK9-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4
+// CHECK9-NEXT: [[TMP52:%.*]] = load ptr, ptr [[SAVED_STACK]], align 8
+// CHECK9-NEXT: call void @llvm.stackrestore(ptr [[TMP52]])
+// CHECK9-NEXT: [[TMP53:%.*]] = load i32, ptr [[RETVAL]], align 4
+// CHECK9-NEXT: ret i32 [[TMP53]]
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83
+// CHECK9-SAME: (i64 noundef [[N:%.*]], i64 noundef [[M:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[M_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[M]], ptr [[M_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[VLA1]], ptr [[VLA_ADDR2]], align 8
+// CHECK9-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8
+// CHECK9-NEXT: [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR2]], align 8
+// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 5, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83.omp_outlined, ptr [[N_ADDR]], ptr [[M_ADDR]], i64 [[TMP0]], i64 [[TMP1]], ptr [[TMP2]])
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[M_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[_TMP3:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_5:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[I11:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[J12:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[N]], ptr [[N_ADDR]], align 8
+// CHECK9-NEXT: store ptr [[M]], ptr [[M_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[VLA1]], ptr [[VLA_ADDR2]], align 8
+// CHECK9-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 8
+// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[M_ADDR]], align 8
+// CHECK9-NEXT: [[TMP2:%.*]] = load i64, ptr [[VLA_ADDR]], align 8
+// CHECK9-NEXT: [[TMP3:%.*]] = load i64, ptr [[VLA_ADDR2]], align 8
+// CHECK9-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK9-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK9-NEXT: store i32 [[TMP6]], ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK9-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0
+// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK9-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK9-NEXT: [[SUB6:%.*]] = sub nsw i32 [[TMP8]], 0
+// CHECK9-NEXT: [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
+// CHECK9-NEXT: [[CONV8:%.*]] = sext i32 [[DIV7]] to i64
+// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV8]]
+// CHECK9-NEXT: [[SUB9:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK9-NEXT: store i64 [[SUB9]], ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK9-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK9-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK9-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP9]]
+// CHECK9-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK9: land.lhs.true:
+// CHECK9-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK9-NEXT: [[CMP10:%.*]] = icmp slt i32 0, [[TMP10]]
+// CHECK9-NEXT: br i1 [[CMP10]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK9: omp.precond.then:
+// CHECK9-NEXT: store i64 0, ptr [[DOTOMP_COMB_LB]], align 8
+// CHECK9-NEXT: [[TMP11:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK9-NEXT: store i64 [[TMP11]], ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK9-NEXT: store i64 1, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_init_8(ptr @[[GLOB1:[0-9]+]], i32 [[TMP13]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i64 1, i64 1)
+// CHECK9-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK9-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK9-NEXT: [[CMP13:%.*]] = icmp sgt i64 [[TMP14]], [[TMP15]]
+// CHECK9-NEXT: br i1 [[CMP13]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK9: cond.true:
+// CHECK9-NEXT: [[TMP16:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK9-NEXT: br label [[COND_END:%.*]]
+// CHECK9: cond.false:
+// CHECK9-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK9-NEXT: br label [[COND_END]]
+// CHECK9: cond.end:
+// CHECK9-NEXT: [[COND:%.*]] = phi i64 [ [[TMP16]], [[COND_TRUE]] ], [ [[TMP17]], [[COND_FALSE]] ]
+// CHECK9-NEXT: store i64 [[COND]], ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK9-NEXT: [[TMP18:%.*]] = load i64, ptr [[DOTOMP_COMB_LB]], align 8
+// CHECK9-NEXT: store i64 [[TMP18]], ptr [[DOTOMP_IV]], align 8
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9: omp.inner.for.cond:
+// CHECK9-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK9-NEXT: [[TMP20:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK9-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP19]], [[TMP20]]
+// CHECK9-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9: omp.inner.for.body:
+// CHECK9-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTOMP_COMB_LB]], align 8
+// CHECK9-NEXT: [[TMP22:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 7, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83.omp_outlined.omp_outlined, i64 [[TMP21]], i64 [[TMP22]], ptr [[TMP0]], ptr [[TMP1]], i64 [[TMP2]], i64 [[TMP3]], ptr [[TMP4]])
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9: omp.inner.for.inc:
+// CHECK9-NEXT: [[TMP23:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK9-NEXT: [[TMP24:%.*]] = load i64, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP23]], [[TMP24]]
+// CHECK9-NEXT: store i64 [[ADD]], ptr [[DOTOMP_IV]], align 8
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK9: omp.inner.for.end:
+// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9: omp.loop.exit:
+// CHECK9-NEXT: [[TMP25:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP26]])
+// CHECK9-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK9: omp.precond.end:
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83.omp_outlined.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i64 noundef [[VLA:%.*]], i64 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[M_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[_TMP3:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_5:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[I11:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[J12:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[N]], ptr [[N_ADDR]], align 8
+// CHECK9-NEXT: store ptr [[M]], ptr [[M_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[VLA]], ptr [[VLA_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[VLA1]], ptr [[VLA_ADDR2]], align 8
+// CHECK9-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 8
+// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[M_ADDR]], align 8
+// CHECK9-NEXT: [[TMP2:%.*]] = load i64, ptr [[VLA_ADDR]], align 8
+// CHECK9-NEXT: [[TMP3:%.*]] = load i64, ptr [[VLA_ADDR2]], align 8
+// CHECK9-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK9-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK9-NEXT: store i32 [[TMP6]], ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK9-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0
+// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK9-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK9-NEXT: [[SUB6:%.*]] = sub nsw i32 [[TMP8]], 0
+// CHECK9-NEXT: [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
+// CHECK9-NEXT: [[CONV8:%.*]] = sext i32 [[DIV7]] to i64
+// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV8]]
+// CHECK9-NEXT: [[SUB9:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK9-NEXT: store i64 [[SUB9]], ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK9-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK9-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK9-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP9]]
+// CHECK9-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK9: land.lhs.true:
+// CHECK9-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK9-NEXT: [[CMP10:%.*]] = icmp slt i32 0, [[TMP10]]
+// CHECK9-NEXT: br i1 [[CMP10]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK9: omp.precond.then:
+// CHECK9-NEXT: store i64 0, ptr [[DOTOMP_LB]], align 8
+// CHECK9-NEXT: [[TMP11:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK9-NEXT: store i64 [[TMP11]], ptr [[DOTOMP_UB]], align 8
+// CHECK9-NEXT: [[TMP12:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT: [[TMP13:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT: store i64 [[TMP12]], ptr [[DOTOMP_LB]], align 8
+// CHECK9-NEXT: store i64 [[TMP13]], ptr [[DOTOMP_UB]], align 8
+// CHECK9-NEXT: store i64 1, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_init_8(ptr @[[GLOB2]], i32 [[TMP15]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i64 1, i64 1)
+// CHECK9-NEXT: [[TMP16:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8
+// CHECK9-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK9-NEXT: [[CMP13:%.*]] = icmp sgt i64 [[TMP16]], [[TMP17]]
+// CHECK9-NEXT: br i1 [[CMP13]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK9: cond.true:
+// CHECK9-NEXT: [[TMP18:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK9-NEXT: br label [[COND_END:%.*]]
+// CHECK9: cond.false:
+// CHECK9-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8
+// CHECK9-NEXT: br label [[COND_END]]
+// CHECK9: cond.end:
+// CHECK9-NEXT: [[COND:%.*]] = phi i64 [ [[TMP18]], [[COND_TRUE]] ], [ [[TMP19]], [[COND_FALSE]] ]
+// CHECK9-NEXT: store i64 [[COND]], ptr [[DOTOMP_UB]], align 8
+// CHECK9-NEXT: [[TMP20:%.*]] = load i64, ptr [[DOTOMP_LB]], align 8
+// CHECK9-NEXT: store i64 [[TMP20]], ptr [[DOTOMP_IV]], align 8
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9: omp.inner.for.cond:
+// CHECK9-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK9-NEXT: [[TMP22:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8
+// CHECK9-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP21]], [[TMP22]]
+// CHECK9-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9: omp.inner.for.body:
+// CHECK9-NEXT: [[TMP23:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK9-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK9-NEXT: [[SUB15:%.*]] = sub nsw i32 [[TMP24]], 0
+// CHECK9-NEXT: [[DIV16:%.*]] = sdiv i32 [[SUB15]], 1
+// CHECK9-NEXT: [[MUL17:%.*]] = mul nsw i32 1, [[DIV16]]
+// CHECK9-NEXT: [[CONV18:%.*]] = sext i32 [[MUL17]] to i64
+// CHECK9-NEXT: [[DIV19:%.*]] = sdiv i64 [[TMP23]], [[CONV18]]
+// CHECK9-NEXT: [[MUL20:%.*]] = mul nsw i64 [[DIV19]], 1
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL20]]
+// CHECK9-NEXT: [[CONV21:%.*]] = trunc i64 [[ADD]] to i32
+// CHECK9-NEXT: store i32 [[CONV21]], ptr [[I11]], align 4
+// CHECK9-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK9-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK9-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK9-NEXT: [[SUB22:%.*]] = sub nsw i32 [[TMP27]], 0
+// CHECK9-NEXT: [[DIV23:%.*]] = sdiv i32 [[SUB22]], 1
+// CHECK9-NEXT: [[MUL24:%.*]] = mul nsw i32 1, [[DIV23]]
+// CHECK9-NEXT: [[CONV25:%.*]] = sext i32 [[MUL24]] to i64
+// CHECK9-NEXT: [[DIV26:%.*]] = sdiv i64 [[TMP26]], [[CONV25]]
+// CHECK9-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK9-NEXT: [[SUB27:%.*]] = sub nsw i32 [[TMP28]], 0
+// CHECK9-NEXT: [[DIV28:%.*]] = sdiv i32 [[SUB27]], 1
+// CHECK9-NEXT: [[MUL29:%.*]] = mul nsw i32 1, [[DIV28]]
+// CHECK9-NEXT: [[CONV30:%.*]] = sext i32 [[MUL29]] to i64
+// CHECK9-NEXT: [[MUL31:%.*]] = mul nsw i64 [[DIV26]], [[CONV30]]
+// CHECK9-NEXT: [[SUB32:%.*]] = sub nsw i64 [[TMP25]], [[MUL31]]
+// CHECK9-NEXT: [[MUL33:%.*]] = mul nsw i64 [[SUB32]], 1
+// CHECK9-NEXT: [[ADD34:%.*]] = add nsw i64 0, [[MUL33]]
+// CHECK9-NEXT: [[CONV35:%.*]] = trunc i64 [[ADD34]] to i32
+// CHECK9-NEXT: store i32 [[CONV35]], ptr [[J12]], align 4
+// CHECK9-NEXT: [[TMP29:%.*]] = load i32, ptr [[I11]], align 4
+// CHECK9-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP29]] to i64
+// CHECK9-NEXT: [[TMP30:%.*]] = mul nsw i64 [[IDXPROM]], [[TMP3]]
+// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP30]]
+// CHECK9-NEXT: [[TMP31:%.*]] = load i32, ptr [[J12]], align 4
+// CHECK9-NEXT: [[IDXPROM36:%.*]] = sext i32 [[TMP31]] to i64
+// CHECK9-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i64 [[IDXPROM36]]
+// CHECK9-NEXT: store i32 0, ptr [[ARRAYIDX37]], align 4
+// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK9: omp.body.continue:
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9: omp.inner.for.inc:
+// CHECK9-NEXT: [[TMP32:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK9-NEXT: [[ADD38:%.*]] = add nsw i64 [[TMP32]], 1
+// CHECK9-NEXT: store i64 [[ADD38]], ptr [[DOTOMP_IV]], align 8
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK9: omp.inner.for.end:
+// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9: omp.loop.exit:
+// CHECK9-NEXT: [[TMP33:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP34]])
+// CHECK9-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK9: omp.precond.end:
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@_Z5tmainIiLi10ELi2EEiT_
+// CHECK9-SAME: (i32 noundef signext [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[A:%.*]] = alloca [10 x [2 x i32]], align 4
+// CHECK9-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK9-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK9-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
+// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK9-NEXT: store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4
+// CHECK9-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK9-NEXT: store ptr [[A]], ptr [[TMP0]], align 8
+// CHECK9-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK9-NEXT: store ptr [[A]], ptr [[TMP1]], align 8
+// CHECK9-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK9-NEXT: store ptr null, ptr [[TMP2]], align 8
+// CHECK9-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK9-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK9-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK9-NEXT: store i32 2, ptr [[TMP5]], align 4
+// CHECK9-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK9-NEXT: store i32 1, ptr [[TMP6]], align 4
+// CHECK9-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK9-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 8
+// CHECK9-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK9-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8
+// CHECK9-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK9-NEXT: store ptr @.offload_sizes.1, ptr [[TMP9]], align 8
+// CHECK9-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK9-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP10]], align 8
+// CHECK9-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK9-NEXT: store ptr null, ptr [[TMP11]], align 8
+// CHECK9-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK9-NEXT: store ptr null, ptr [[TMP12]], align 8
+// CHECK9-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK9-NEXT: store i64 20, ptr [[TMP13]], align 8
+// CHECK9-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK9-NEXT: store i64 0, ptr [[TMP14]], align 8
+// CHECK9-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK9-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP15]], align 4
+// CHECK9-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK9-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4
+// CHECK9-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK9-NEXT: store i32 0, ptr [[TMP17]], align 4
+// CHECK9-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69.region_id, ptr [[KERNEL_ARGS]])
+// CHECK9-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK9-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK9: omp_offload.failed:
+// CHECK9-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69(ptr [[A]]) #[[ATTR3]]
+// CHECK9-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK9: omp_offload.cont:
+// CHECK9-NEXT: ret i32 0
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69
+// CHECK9-SAME: (ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69.omp_outlined, ptr [[TMP0]])
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: store i32 19, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP2]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 19
+// CHECK9-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK9: cond.true:
+// CHECK9-NEXT: br label [[COND_END:%.*]]
+// CHECK9: cond.false:
+// CHECK9-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: br label [[COND_END]]
+// CHECK9: cond.end:
+// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ 19, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK9-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9: omp.inner.for.cond:
+// CHECK9-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK9-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9: omp.inner.for.body:
+// CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK9-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69.omp_outlined.omp_outlined, i64 [[TMP9]], i64 [[TMP11]], ptr [[TMP0]])
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9: omp.inner.for.inc:
+// CHECK9-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP13]]
+// CHECK9-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK9: omp.inner.for.end:
+// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9: omp.loop.exit:
+// CHECK9-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69.omp_outlined.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK9-NEXT: store i32 19, ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK9-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK9-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK9-NEXT: store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP4]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP5]], 19
+// CHECK9-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK9: cond.true:
+// CHECK9-NEXT: br label [[COND_END:%.*]]
+// CHECK9: cond.false:
+// CHECK9-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: br label [[COND_END]]
+// CHECK9: cond.end:
+// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ 19, [[COND_TRUE]] ], [ [[TMP6]], [[COND_FALSE]] ]
+// CHECK9-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK9-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9: omp.inner.for.cond:
+// CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK9-NEXT: br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9: omp.inner.for.body:
+// CHECK9-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP10]], 2
+// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK9-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK9-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[DIV4:%.*]] = sdiv i32 [[TMP12]], 2
+// CHECK9-NEXT: [[MUL5:%.*]] = mul nsw i32 [[DIV4]], 2
+// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP11]], [[MUL5]]
+// CHECK9-NEXT: [[MUL6:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK9-NEXT: [[ADD7:%.*]] = add nsw i32 0, [[MUL6]]
+// CHECK9-NEXT: store i32 [[ADD7]], ptr [[J]], align 4
+// CHECK9-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4
+// CHECK9-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
+// CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [2 x i32]], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK9-NEXT: [[TMP14:%.*]] = load i32, ptr [[J]], align 4
+// CHECK9-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP14]] to i64
+// CHECK9-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [2 x i32], ptr [[ARRAYIDX]], i64 0, i64 [[IDXPROM8]]
+// CHECK9-NEXT: store i32 0, ptr [[ARRAYIDX9]], align 4
+// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK9: omp.body.continue:
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9: omp.inner.for.inc:
+// CHECK9-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK9-NEXT: store i32 [[ADD10]], ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK9: omp.inner.for.end:
+// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9: omp.loop.exit:
+// CHECK9-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP4]])
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK9-SAME: () #[[ATTR6:[0-9]+]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@main
+// CHECK11-SAME: (i32 noundef [[ARGC:%.*]], ptr noundef [[ARGV:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[ARGV_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[N:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[M:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[SAVED_STACK:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[__VLA_EXPR0:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[__VLA_EXPR1:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[M_CASTED:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [5 x ptr], align 4
+// CHECK11-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [5 x ptr], align 4
+// CHECK11-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [5 x ptr], align 4
+// CHECK11-NEXT: [[DOTOFFLOAD_SIZES:%.*]] = alloca [5 x i64], align 4
+// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK11-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK11-NEXT: store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4
+// CHECK11-NEXT: store ptr [[ARGV]], ptr [[ARGV_ADDR]], align 4
+// CHECK11-NEXT: store i32 100, ptr [[N]], align 4
+// CHECK11-NEXT: store i32 2, ptr [[M]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load i32, ptr [[N]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[M]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = call ptr @llvm.stacksave()
+// CHECK11-NEXT: store ptr [[TMP2]], ptr [[SAVED_STACK]], align 4
+// CHECK11-NEXT: [[TMP3:%.*]] = mul nuw i32 [[TMP0]], [[TMP1]]
+// CHECK11-NEXT: [[VLA:%.*]] = alloca i32, i32 [[TMP3]], align 4
+// CHECK11-NEXT: store i32 [[TMP0]], ptr [[__VLA_EXPR0]], align 4
+// CHECK11-NEXT: store i32 [[TMP1]], ptr [[__VLA_EXPR1]], align 4
+// CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[N]], align 4
+// CHECK11-NEXT: store i32 [[TMP4]], ptr [[N_CASTED]], align 4
+// CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK11-NEXT: [[TMP6:%.*]] = load i32, ptr [[M]], align 4
+// CHECK11-NEXT: store i32 [[TMP6]], ptr [[M_CASTED]], align 4
+// CHECK11-NEXT: [[TMP7:%.*]] = load i32, ptr [[M_CASTED]], align 4
+// CHECK11-NEXT: [[TMP8:%.*]] = mul nuw i32 [[TMP0]], [[TMP1]]
+// CHECK11-NEXT: [[TMP9:%.*]] = mul nuw i32 [[TMP8]], 4
+// CHECK11-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
+// CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[DOTOFFLOAD_SIZES]], ptr align 4 @.offload_sizes, i32 40, i1 false)
+// CHECK11-NEXT: [[TMP11:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK11-NEXT: store i32 [[TMP5]], ptr [[TMP11]], align 4
+// CHECK11-NEXT: [[TMP12:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK11-NEXT: store i32 [[TMP5]], ptr [[TMP12]], align 4
+// CHECK11-NEXT: [[TMP13:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK11-NEXT: store ptr null, ptr [[TMP13]], align 4
+// CHECK11-NEXT: [[TMP14:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK11-NEXT: store i32 [[TMP7]], ptr [[TMP14]], align 4
+// CHECK11-NEXT: [[TMP15:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK11-NEXT: store i32 [[TMP7]], ptr [[TMP15]], align 4
+// CHECK11-NEXT: [[TMP16:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
+// CHECK11-NEXT: store ptr null, ptr [[TMP16]], align 4
+// CHECK11-NEXT: [[TMP17:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK11-NEXT: store i32 [[TMP0]], ptr [[TMP17]], align 4
+// CHECK11-NEXT: [[TMP18:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK11-NEXT: store i32 [[TMP0]], ptr [[TMP18]], align 4
+// CHECK11-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
+// CHECK11-NEXT: store ptr null, ptr [[TMP19]], align 4
+// CHECK11-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 3
+// CHECK11-NEXT: store i32 [[TMP1]], ptr [[TMP20]], align 4
+// CHECK11-NEXT: [[TMP21:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 3
+// CHECK11-NEXT: store i32 [[TMP1]], ptr [[TMP21]], align 4
+// CHECK11-NEXT: [[TMP22:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 3
+// CHECK11-NEXT: store ptr null, ptr [[TMP22]], align 4
+// CHECK11-NEXT: [[TMP23:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 4
+// CHECK11-NEXT: store ptr [[VLA]], ptr [[TMP23]], align 4
+// CHECK11-NEXT: [[TMP24:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 4
+// CHECK11-NEXT: store ptr [[VLA]], ptr [[TMP24]], align 4
+// CHECK11-NEXT: [[TMP25:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 4
+// CHECK11-NEXT: store i64 [[TMP10]], ptr [[TMP25]], align 4
+// CHECK11-NEXT: [[TMP26:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 4
+// CHECK11-NEXT: store ptr null, ptr [[TMP26]], align 4
+// CHECK11-NEXT: [[TMP27:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK11-NEXT: [[TMP28:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK11-NEXT: [[TMP29:%.*]] = getelementptr inbounds [5 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
+// CHECK11-NEXT: [[TMP30:%.*]] = load i32, ptr [[N]], align 4
+// CHECK11-NEXT: store i32 [[TMP30]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[TMP31:%.*]] = load i32, ptr [[M]], align 4
+// CHECK11-NEXT: store i32 [[TMP31]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK11-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP32]], 0
+// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK11-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK11-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK11-NEXT: [[SUB4:%.*]] = sub nsw i32 [[TMP33]], 0
+// CHECK11-NEXT: [[DIV5:%.*]] = sdiv i32 [[SUB4]], 1
+// CHECK11-NEXT: [[CONV6:%.*]] = sext i32 [[DIV5]] to i64
+// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV6]]
+// CHECK11-NEXT: [[SUB7:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK11-NEXT: store i64 [[SUB7]], ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK11-NEXT: [[TMP34:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK11-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP34]], 1
+// CHECK11-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK11-NEXT: store i32 2, ptr [[TMP35]], align 4
+// CHECK11-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK11-NEXT: store i32 5, ptr [[TMP36]], align 4
+// CHECK11-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK11-NEXT: store ptr [[TMP27]], ptr [[TMP37]], align 4
+// CHECK11-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK11-NEXT: store ptr [[TMP28]], ptr [[TMP38]], align 4
+// CHECK11-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK11-NEXT: store ptr [[TMP29]], ptr [[TMP39]], align 4
+// CHECK11-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK11-NEXT: store ptr @.offload_maptypes, ptr [[TMP40]], align 4
+// CHECK11-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK11-NEXT: store ptr null, ptr [[TMP41]], align 4
+// CHECK11-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK11-NEXT: store ptr null, ptr [[TMP42]], align 4
+// CHECK11-NEXT: [[TMP43:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK11-NEXT: store i64 [[ADD]], ptr [[TMP43]], align 8
+// CHECK11-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK11-NEXT: store i64 0, ptr [[TMP44]], align 8
+// CHECK11-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP45]], align 4
+// CHECK11-NEXT: [[TMP46:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP46]], align 4
+// CHECK11-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK11-NEXT: store i32 0, ptr [[TMP47]], align 4
+// CHECK11-NEXT: [[TMP48:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83.region_id, ptr [[KERNEL_ARGS]])
+// CHECK11-NEXT: [[TMP49:%.*]] = icmp ne i32 [[TMP48]], 0
+// CHECK11-NEXT: br i1 [[TMP49]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK11: omp_offload.failed:
+// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83(i32 [[TMP5]], i32 [[TMP7]], i32 [[TMP0]], i32 [[TMP1]], ptr [[VLA]]) #[[ATTR3:[0-9]+]]
+// CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK11: omp_offload.cont:
+// CHECK11-NEXT: [[TMP50:%.*]] = load i32, ptr [[ARGC_ADDR]], align 4
+// CHECK11-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiLi10ELi2EEiT_(i32 noundef [[TMP50]])
+// CHECK11-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4
+// CHECK11-NEXT: [[TMP51:%.*]] = load ptr, ptr [[SAVED_STACK]], align 4
+// CHECK11-NEXT: call void @llvm.stackrestore(ptr [[TMP51]])
+// CHECK11-NEXT: [[TMP52:%.*]] = load i32, ptr [[RETVAL]], align 4
+// CHECK11-NEXT: ret i32 [[TMP52]]
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83
+// CHECK11-SAME: (i32 noundef [[N:%.*]], i32 noundef [[M:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[VLA]], ptr [[VLA_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[VLA1]], ptr [[VLA_ADDR2]], align 4
+// CHECK11-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load i32, ptr [[VLA_ADDR]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[VLA_ADDR2]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 5, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83.omp_outlined, ptr [[N_ADDR]], ptr [[M_ADDR]], i32 [[TMP0]], i32 [[TMP1]], ptr [[TMP2]])
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83.omp_outlined
+// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[M_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[_TMP3:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_5:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[I11:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[J12:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT: store ptr [[N]], ptr [[N_ADDR]], align 4
+// CHECK11-NEXT: store ptr [[M]], ptr [[M_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[VLA]], ptr [[VLA_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[VLA1]], ptr [[VLA_ADDR2]], align 4
+// CHECK11-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[M_ADDR]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = load i32, ptr [[VLA_ADDR]], align 4
+// CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[VLA_ADDR2]], align 4
+// CHECK11-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK11-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK11-NEXT: store i32 [[TMP6]], ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK11-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0
+// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK11-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK11-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK11-NEXT: [[SUB6:%.*]] = sub nsw i32 [[TMP8]], 0
+// CHECK11-NEXT: [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
+// CHECK11-NEXT: [[CONV8:%.*]] = sext i32 [[DIV7]] to i64
+// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV8]]
+// CHECK11-NEXT: [[SUB9:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK11-NEXT: store i64 [[SUB9]], ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK11-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK11-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK11-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP9]]
+// CHECK11-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK11: land.lhs.true:
+// CHECK11-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK11-NEXT: [[CMP10:%.*]] = icmp slt i32 0, [[TMP10]]
+// CHECK11-NEXT: br i1 [[CMP10]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK11: omp.precond.then:
+// CHECK11-NEXT: store i64 0, ptr [[DOTOMP_COMB_LB]], align 8
+// CHECK11-NEXT: [[TMP11:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK11-NEXT: store i64 [[TMP11]], ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK11-NEXT: store i64 1, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_init_8(ptr @[[GLOB1:[0-9]+]], i32 [[TMP13]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i64 1, i64 1)
+// CHECK11-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK11-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK11-NEXT: [[CMP13:%.*]] = icmp sgt i64 [[TMP14]], [[TMP15]]
+// CHECK11-NEXT: br i1 [[CMP13]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK11: cond.true:
+// CHECK11-NEXT: [[TMP16:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK11-NEXT: br label [[COND_END:%.*]]
+// CHECK11: cond.false:
+// CHECK11-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK11-NEXT: br label [[COND_END]]
+// CHECK11: cond.end:
+// CHECK11-NEXT: [[COND:%.*]] = phi i64 [ [[TMP16]], [[COND_TRUE]] ], [ [[TMP17]], [[COND_FALSE]] ]
+// CHECK11-NEXT: store i64 [[COND]], ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK11-NEXT: [[TMP18:%.*]] = load i64, ptr [[DOTOMP_COMB_LB]], align 8
+// CHECK11-NEXT: store i64 [[TMP18]], ptr [[DOTOMP_IV]], align 8
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK11: omp.inner.for.cond:
+// CHECK11-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK11-NEXT: [[TMP20:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK11-NEXT: [[CMP14:%.*]] = icmp sle i64 [[TMP19]], [[TMP20]]
+// CHECK11-NEXT: br i1 [[CMP14]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK11: omp.inner.for.body:
+// CHECK11-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTOMP_COMB_LB]], align 8
+// CHECK11-NEXT: [[TMP22:%.*]] = trunc i64 [[TMP21]] to i32
+// CHECK11-NEXT: [[TMP23:%.*]] = load i64, ptr [[DOTOMP_COMB_UB]], align 8
+// CHECK11-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
+// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 7, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83.omp_outlined.omp_outlined, i32 [[TMP22]], i32 [[TMP24]], ptr [[TMP0]], ptr [[TMP1]], i32 [[TMP2]], i32 [[TMP3]], ptr [[TMP4]])
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK11: omp.inner.for.inc:
+// CHECK11-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK11-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK11-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP25]], [[TMP26]]
+// CHECK11-NEXT: store i64 [[ADD]], ptr [[DOTOMP_IV]], align 8
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK11: omp.inner.for.end:
+// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK11: omp.loop.exit:
+// CHECK11-NEXT: [[TMP27:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP28]])
+// CHECK11-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK11: omp.precond.end:
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l83.omp_outlined.omp_outlined
+// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[N:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[M:%.*]], i32 noundef [[VLA:%.*]], i32 noundef [[VLA1:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[M_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[_TMP3:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_5:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i64, align 8
+// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[I13:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[J14:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK11-NEXT: store ptr [[N]], ptr [[N_ADDR]], align 4
+// CHECK11-NEXT: store ptr [[M]], ptr [[M_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[VLA]], ptr [[VLA_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[VLA1]], ptr [[VLA_ADDR2]], align 4
+// CHECK11-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[M_ADDR]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = load i32, ptr [[VLA_ADDR]], align 4
+// CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[VLA_ADDR2]], align 4
+// CHECK11-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK11-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK11-NEXT: store i32 [[TMP6]], ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK11-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP7]], 0
+// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK11-NEXT: [[CONV:%.*]] = sext i32 [[DIV]] to i64
+// CHECK11-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK11-NEXT: [[SUB6:%.*]] = sub nsw i32 [[TMP8]], 0
+// CHECK11-NEXT: [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
+// CHECK11-NEXT: [[CONV8:%.*]] = sext i32 [[DIV7]] to i64
+// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV]], [[CONV8]]
+// CHECK11-NEXT: [[SUB9:%.*]] = sub nsw i64 [[MUL]], 1
+// CHECK11-NEXT: store i64 [[SUB9]], ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK11-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK11-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK11-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP9]]
+// CHECK11-NEXT: br i1 [[CMP]], label [[LAND_LHS_TRUE:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK11: land.lhs.true:
+// CHECK11-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK11-NEXT: [[CMP10:%.*]] = icmp slt i32 0, [[TMP10]]
+// CHECK11-NEXT: br i1 [[CMP10]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END]]
+// CHECK11: omp.precond.then:
+// CHECK11-NEXT: store i64 0, ptr [[DOTOMP_LB]], align 8
+// CHECK11-NEXT: [[TMP11:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK11-NEXT: store i64 [[TMP11]], ptr [[DOTOMP_UB]], align 8
+// CHECK11-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK11-NEXT: [[CONV11:%.*]] = zext i32 [[TMP12]] to i64
+// CHECK11-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK11-NEXT: [[CONV12:%.*]] = zext i32 [[TMP13]] to i64
+// CHECK11-NEXT: store i64 [[CONV11]], ptr [[DOTOMP_LB]], align 8
+// CHECK11-NEXT: store i64 [[CONV12]], ptr [[DOTOMP_UB]], align 8
+// CHECK11-NEXT: store i64 1, ptr [[DOTOMP_STRIDE]], align 8
+// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_init_8(ptr @[[GLOB2]], i32 [[TMP15]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i64 1, i64 1)
+// CHECK11-NEXT: [[TMP16:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8
+// CHECK11-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK11-NEXT: [[CMP15:%.*]] = icmp sgt i64 [[TMP16]], [[TMP17]]
+// CHECK11-NEXT: br i1 [[CMP15]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK11: cond.true:
+// CHECK11-NEXT: [[TMP18:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_5]], align 8
+// CHECK11-NEXT: br label [[COND_END:%.*]]
+// CHECK11: cond.false:
+// CHECK11-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8
+// CHECK11-NEXT: br label [[COND_END]]
+// CHECK11: cond.end:
+// CHECK11-NEXT: [[COND:%.*]] = phi i64 [ [[TMP18]], [[COND_TRUE]] ], [ [[TMP19]], [[COND_FALSE]] ]
+// CHECK11-NEXT: store i64 [[COND]], ptr [[DOTOMP_UB]], align 8
+// CHECK11-NEXT: [[TMP20:%.*]] = load i64, ptr [[DOTOMP_LB]], align 8
+// CHECK11-NEXT: store i64 [[TMP20]], ptr [[DOTOMP_IV]], align 8
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK11: omp.inner.for.cond:
+// CHECK11-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK11-NEXT: [[TMP22:%.*]] = load i64, ptr [[DOTOMP_UB]], align 8
+// CHECK11-NEXT: [[CMP16:%.*]] = icmp sle i64 [[TMP21]], [[TMP22]]
+// CHECK11-NEXT: br i1 [[CMP16]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK11: omp.inner.for.body:
+// CHECK11-NEXT: [[TMP23:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK11-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK11-NEXT: [[SUB17:%.*]] = sub nsw i32 [[TMP24]], 0
+// CHECK11-NEXT: [[DIV18:%.*]] = sdiv i32 [[SUB17]], 1
+// CHECK11-NEXT: [[MUL19:%.*]] = mul nsw i32 1, [[DIV18]]
+// CHECK11-NEXT: [[CONV20:%.*]] = sext i32 [[MUL19]] to i64
+// CHECK11-NEXT: [[DIV21:%.*]] = sdiv i64 [[TMP23]], [[CONV20]]
+// CHECK11-NEXT: [[MUL22:%.*]] = mul nsw i64 [[DIV21]], 1
+// CHECK11-NEXT: [[ADD:%.*]] = add nsw i64 0, [[MUL22]]
+// CHECK11-NEXT: [[CONV23:%.*]] = trunc i64 [[ADD]] to i32
+// CHECK11-NEXT: store i32 [[CONV23]], ptr [[I13]], align 4
+// CHECK11-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK11-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK11-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK11-NEXT: [[SUB24:%.*]] = sub nsw i32 [[TMP27]], 0
+// CHECK11-NEXT: [[DIV25:%.*]] = sdiv i32 [[SUB24]], 1
+// CHECK11-NEXT: [[MUL26:%.*]] = mul nsw i32 1, [[DIV25]]
+// CHECK11-NEXT: [[CONV27:%.*]] = sext i32 [[MUL26]] to i64
+// CHECK11-NEXT: [[DIV28:%.*]] = sdiv i64 [[TMP26]], [[CONV27]]
+// CHECK11-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK11-NEXT: [[SUB29:%.*]] = sub nsw i32 [[TMP28]], 0
+// CHECK11-NEXT: [[DIV30:%.*]] = sdiv i32 [[SUB29]], 1
+// CHECK11-NEXT: [[MUL31:%.*]] = mul nsw i32 1, [[DIV30]]
+// CHECK11-NEXT: [[CONV32:%.*]] = sext i32 [[MUL31]] to i64
+// CHECK11-NEXT: [[MUL33:%.*]] = mul nsw i64 [[DIV28]], [[CONV32]]
+// CHECK11-NEXT: [[SUB34:%.*]] = sub nsw i64 [[TMP25]], [[MUL33]]
+// CHECK11-NEXT: [[MUL35:%.*]] = mul nsw i64 [[SUB34]], 1
+// CHECK11-NEXT: [[ADD36:%.*]] = add nsw i64 0, [[MUL35]]
+// CHECK11-NEXT: [[CONV37:%.*]] = trunc i64 [[ADD36]] to i32
+// CHECK11-NEXT: store i32 [[CONV37]], ptr [[J14]], align 4
+// CHECK11-NEXT: [[TMP29:%.*]] = load i32, ptr [[I13]], align 4
+// CHECK11-NEXT: [[TMP30:%.*]] = mul nsw i32 [[TMP29]], [[TMP3]]
+// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 [[TMP30]]
+// CHECK11-NEXT: [[TMP31:%.*]] = load i32, ptr [[J14]], align 4
+// CHECK11-NEXT: [[ARRAYIDX38:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX]], i32 [[TMP31]]
+// CHECK11-NEXT: store i32 0, ptr [[ARRAYIDX38]], align 4
+// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK11: omp.body.continue:
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK11: omp.inner.for.inc:
+// CHECK11-NEXT: [[TMP32:%.*]] = load i64, ptr [[DOTOMP_IV]], align 8
+// CHECK11-NEXT: [[ADD39:%.*]] = add nsw i64 [[TMP32]], 1
+// CHECK11-NEXT: store i64 [[ADD39]], ptr [[DOTOMP_IV]], align 8
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK11: omp.inner.for.end:
+// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK11: omp.loop.exit:
+// CHECK11-NEXT: [[TMP33:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP34:%.*]] = load i32, ptr [[TMP33]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP34]])
+// CHECK11-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK11: omp.precond.end:
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@_Z5tmainIiLi10ELi2EEiT_
+// CHECK11-SAME: (i32 noundef [[ARGC:%.*]]) #[[ATTR5:[0-9]+]] comdat {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[ARGC_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[A:%.*]] = alloca [10 x [2 x i32]], align 4
+// CHECK11-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
+// CHECK11-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
+// CHECK11-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
+// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK11-NEXT: store i32 [[ARGC]], ptr [[ARGC_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK11-NEXT: store ptr [[A]], ptr [[TMP0]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK11-NEXT: store ptr [[A]], ptr [[TMP1]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK11-NEXT: store ptr null, ptr [[TMP2]], align 4
+// CHECK11-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK11-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK11-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK11-NEXT: store i32 2, ptr [[TMP5]], align 4
+// CHECK11-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK11-NEXT: store i32 1, ptr [[TMP6]], align 4
+// CHECK11-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK11-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 4
+// CHECK11-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK11-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 4
+// CHECK11-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK11-NEXT: store ptr @.offload_sizes.1, ptr [[TMP9]], align 4
+// CHECK11-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK11-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP10]], align 4
+// CHECK11-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK11-NEXT: store ptr null, ptr [[TMP11]], align 4
+// CHECK11-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK11-NEXT: store ptr null, ptr [[TMP12]], align 4
+// CHECK11-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK11-NEXT: store i64 20, ptr [[TMP13]], align 8
+// CHECK11-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK11-NEXT: store i64 0, ptr [[TMP14]], align 8
+// CHECK11-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP15]], align 4
+// CHECK11-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK11-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4
+// CHECK11-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK11-NEXT: store i32 0, ptr [[TMP17]], align 4
+// CHECK11-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69.region_id, ptr [[KERNEL_ARGS]])
+// CHECK11-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK11-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK11: omp_offload.failed:
+// CHECK11-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69(ptr [[A]]) #[[ATTR3]]
+// CHECK11-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK11: omp_offload.cont:
+// CHECK11-NEXT: ret i32 0
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69
+// CHECK11-SAME: (ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69.omp_outlined, ptr [[TMP0]])
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69.omp_outlined
+// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: store i32 19, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP2]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 19
+// CHECK11-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK11: cond.true:
+// CHECK11-NEXT: br label [[COND_END:%.*]]
+// CHECK11: cond.false:
+// CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: br label [[COND_END]]
+// CHECK11: cond.end:
+// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ 19, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK11-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK11: omp.inner.for.cond:
+// CHECK11-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK11-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK11: omp.inner.for.body:
+// CHECK11-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69.omp_outlined.omp_outlined, i32 [[TMP8]], i32 [[TMP9]], ptr [[TMP0]])
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK11: omp.inner.for.inc:
+// CHECK11-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], [[TMP11]]
+// CHECK11-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK11: omp.inner.for.end:
+// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK11: omp.loop.exit:
+// CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiLi10ELi2EEiT__l69.omp_outlined.omp_outlined
+// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(80) [[A:%.*]]) #[[ATTR2]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK11-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK11-NEXT: store i32 19, ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[TMP1]], ptr [[DOTOMP_LB]], align 4
+// CHECK11-NEXT: store i32 [[TMP2]], ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP4]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP5]], 19
+// CHECK11-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK11: cond.true:
+// CHECK11-NEXT: br label [[COND_END:%.*]]
+// CHECK11: cond.false:
+// CHECK11-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: br label [[COND_END]]
+// CHECK11: cond.end:
+// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ 19, [[COND_TRUE]] ], [ [[TMP6]], [[COND_FALSE]] ]
+// CHECK11-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK11-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK11: omp.inner.for.cond:
+// CHECK11-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK11-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK11: omp.inner.for.body:
+// CHECK11-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP10]], 2
+// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV]], 1
+// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK11-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK11-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[DIV3:%.*]] = sdiv i32 [[TMP12]], 2
+// CHECK11-NEXT: [[MUL4:%.*]] = mul nsw i32 [[DIV3]], 2
+// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP11]], [[MUL4]]
+// CHECK11-NEXT: [[MUL5:%.*]] = mul nsw i32 [[SUB]], 1
+// CHECK11-NEXT: [[ADD6:%.*]] = add nsw i32 0, [[MUL5]]
+// CHECK11-NEXT: store i32 [[ADD6]], ptr [[J]], align 4
+// CHECK11-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4
+// CHECK11-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x [2 x i32]], ptr [[TMP0]], i32 0, i32 [[TMP13]]
+// CHECK11-NEXT: [[TMP14:%.*]] = load i32, ptr [[J]], align 4
+// CHECK11-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [2 x i32], ptr [[ARRAYIDX]], i32 0, i32 [[TMP14]]
+// CHECK11-NEXT: store i32 0, ptr [[ARRAYIDX7]], align 4
+// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK11: omp.body.continue:
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK11: omp.inner.for.inc:
+// CHECK11-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK11-NEXT: store i32 [[ADD8]], ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK11: omp.inner.for.end:
+// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK11: omp.loop.exit:
+// CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP4]])
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK11-SAME: () #[[ATTR6:[0-9]+]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK11-NEXT: ret void
+//
diff --git a/clang/test/OpenMP/teams_generic_loop_private_codegen.cpp b/clang/test/OpenMP/teams_generic_loop_private_codegen.cpp
new file mode 100644
index 00000000000000..23a6a306c628c5
--- /dev/null
+++ b/clang/test/OpenMP/teams_generic_loop_private_codegen.cpp
@@ -0,0 +1,1918 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix=CHECK3
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix=CHECK3
+
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix=CHECK9
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefix=CHECK9
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+struct St {
+ int a, b;
+ St() : a(0), b(0) {}
+ St(const St &st) : a(st.a + st.b), b(0) {}
+ ~St() {}
+};
+
+volatile int g = 1212;
+volatile int &g1 = g;
+
+template <class T>
+struct S {
+ T f;
+ S(T a) : f(a + g) {}
+ S() : f(g) {}
+ S(const S &s, St t = St()) : f(s.f + t.a) {}
+ operator T() { return T(); }
+ ~S() {}
+};
+
+
+template <typename T>
+T tmain() {
+ S<T> test;
+ T t_var = T();
+ T vec[] = {1, 2};
+ S<T> s_arr[] = {1, 2};
+ S<T> &var = test;
+#pragma omp target
+#pragma omp teams loop private(t_var, vec, s_arr, var)
+ for (int i = 0; i < 2; ++i) {
+ vec[i] = t_var;
+ s_arr[i] = var;
+ }
+ return T();
+}
+
+S<float> test;
+int t_var = 333;
+int vec[] = {1, 2};
+S<float> s_arr[] = {1, 2};
+S<float> var(3);
+
+int main() {
+ static int sivar;
+#ifdef LAMBDA
+ [&]() {
+#pragma omp target
+#pragma omp teams loop private(g, g1, sivar)
+ for (int i = 0; i < 2; ++i) {
+
+ // Skip global, bound tid and loop vars
+
+ g = 1;
+ g1 = 1;
+ sivar = 2;
+
+ // Skip global, bound tid and loop vars
+ [&]() {
+ g = 2;
+ g1 = 2;
+ sivar = 4;
+
+ }();
+ }
+ }();
+ return 0;
+#else
+#pragma omp target
+#pragma omp teams loop private(t_var, vec, s_arr, var, sivar)
+ for (int i = 0; i < 2; ++i) {
+ vec[i] = t_var;
+ s_arr[i] = var;
+ sivar += i;
+ }
+ return tmain<int>();
+#endif
+}
+
+
+
+// Skip global, bound tid and loop vars
+
+// private(s_arr)
+
+// private(var)
+
+
+// Skip global, bound tid and loop vars
+
+// private(s_arr)
+
+// private(var)
+
+
+
+
+// Skip global, bound tid and loop vars
+
+// private(s_arr)
+
+
+// private(var)
+
+
+// Skip global, bound tid and loop vars
+// prev lb and ub
+// iter variables
+
+// private(s_arr)
+
+
+// private(var)
+
+
+
+#endif
+// CHECK1-LABEL: define {{[^@]+}}@__cxx_global_var_init
+// CHECK1-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) @test)
+// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN1SIfED1Ev, ptr @test, ptr @__dso_handle) #[[ATTR2:[0-9]+]]
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SIfEC1Ev
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: call void @_ZN1SIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SIfED1Ev
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR2]]
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SIfEC2Ev
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @g, align 4
+// CHECK1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
+// CHECK1-NEXT: store float [[CONV]], ptr [[F]], align 4
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SIfED2Ev
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__cxx_global_var_init.1
+// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) @s_arr, float noundef 1.000000e+00)
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) getelementptr inbounds ([[STRUCT_S:%.*]], ptr @s_arr, i64 1), float noundef 2.000000e+00)
+// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @__cxx_global_array_dtor, ptr null, ptr @__dso_handle) #[[ATTR2]]
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SIfEC1Ef
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], float noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca float, align 4
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: store float [[A]], ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: call void @_ZN1SIfEC2Ef(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]], float noundef [[TMP0]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__cxx_global_array_dtor
+// CHECK1-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK1: arraydestroy.body:
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ getelementptr inbounds ([[STRUCT_S:%.*]], ptr @s_arr, i64 2), [[ENTRY:%.*]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1
+// CHECK1-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK1-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], @s_arr
+// CHECK1-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE1:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK1: arraydestroy.done1:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SIfEC2Ef
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], float noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca float, align 4
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: store float [[A]], ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load volatile i32, ptr @g, align 4
+// CHECK1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to float
+// CHECK1-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[CONV]]
+// CHECK1-NEXT: store float [[ADD]], ptr [[F]], align 4
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@__cxx_global_var_init.2
+// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) @var, float noundef 3.000000e+00)
+// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN1SIfED1Ev, ptr @var, ptr @__dso_handle) #[[ATTR2]]
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@main
+// CHECK1-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK1-NEXT: store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK1-NEXT: store i32 0, ptr [[TMP1]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK1-NEXT: store ptr null, ptr [[TMP2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK1-NEXT: store ptr null, ptr [[TMP3]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK1-NEXT: store ptr null, ptr [[TMP4]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK1-NEXT: store ptr null, ptr [[TMP5]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP7]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK1-NEXT: store i64 2, ptr [[TMP8]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK1-NEXT: store i64 0, ptr [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP10]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP11]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK1-NEXT: store i32 0, ptr [[TMP12]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96.region_id, ptr [[KERNEL_ARGS]])
+// CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+// CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK1: omp_offload.failed:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96() #[[ATTR2]]
+// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK1: omp_offload.cont:
+// CHECK1-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z5tmainIiET_v()
+// CHECK1-NEXT: ret i32 [[CALL]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96
+// CHECK1-SAME: () #[[ATTR4:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96.omp_outlined)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK1-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S], align 4
+// CHECK1-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S:%.*]], align 4
+// CHECK1-NEXT: [[SIVAR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
+// CHECK1-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i64 2
+// CHECK1-NEXT: br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK1: arrayctor.loop:
+// CHECK1-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]])
+// CHECK1-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYCTOR_CUR]], i64 1
+// CHECK1-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK1-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK1: arrayctor.cont:
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]])
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 1
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// CHECK1: omp.inner.for.cond.cleanup:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]])
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP14]])
+// CHECK1-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR2]]
+// CHECK1-NEXT: [[ARRAY_BEGIN2:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN2]], i64 2
+// CHECK1-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK1: arraydestroy.body:
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP15]], [[OMP_LOOP_EXIT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1
+// CHECK1-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK1-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN2]]
+// CHECK1-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE3:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK1: arraydestroy.done3:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK1-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S], align 4
+// CHECK1-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S:%.*]], align 4
+// CHECK1-NEXT: [[SIVAR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
+// CHECK1-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i64 2
+// CHECK1-NEXT: br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK1: arrayctor.loop:
+// CHECK1-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]])
+// CHECK1-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYCTOR_CUR]], i64 1
+// CHECK1-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK1-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK1: arrayctor.cont:
+// CHECK1-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]])
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 1
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// CHECK1: omp.inner.for.cond.cleanup:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[T_VAR]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP11]] to i64
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC]], i64 0, i64 [[IDXPROM]]
+// CHECK1-NEXT: store i32 [[TMP10]], ptr [[ARRAYIDX]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[IDXPROM3:%.*]] = sext i32 [[TMP12]] to i64
+// CHECK1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i64 0, i64 [[IDXPROM3]]
+// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ARRAYIDX4]], ptr align 4 [[VAR]], i64 4, i1 false)
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[SIVAR]], align 4
+// CHECK1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP14]], [[TMP13]]
+// CHECK1-NEXT: store i32 [[ADD5]], ptr [[SIVAR]], align 4
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK1-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP17]])
+// CHECK1-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR2]]
+// CHECK1-NEXT: [[ARRAY_BEGIN7:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN7]], i64 2
+// CHECK1-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK1: arraydestroy.body:
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP18]], [[OMP_LOOP_EXIT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1
+// CHECK1-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK1-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN7]]
+// CHECK1-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE8:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK1: arraydestroy.done8:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v
+// CHECK1-SAME: () #[[ATTR6:[0-9]+]] comdat {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4
+// CHECK1-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK1-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 4
+// CHECK1-NEXT: [[VAR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
+// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
+// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
+// CHECK1-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 0
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef signext 1)
+// CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i64 1
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2)
+// CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8
+// CHECK1-NEXT: store ptr undef, ptr [[_TMP1]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK1-NEXT: store i32 2, ptr [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK1-NEXT: store i32 0, ptr [[TMP1]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK1-NEXT: store ptr null, ptr [[TMP2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK1-NEXT: store ptr null, ptr [[TMP3]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK1-NEXT: store ptr null, ptr [[TMP4]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK1-NEXT: store ptr null, ptr [[TMP5]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP7]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK1-NEXT: store i64 2, ptr [[TMP8]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK1-NEXT: store i64 0, ptr [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP10]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP11]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK1-NEXT: store i32 0, ptr [[TMP12]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.region_id, ptr [[KERNEL_ARGS]])
+// CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+// CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK1: omp_offload.failed:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56() #[[ATTR2]]
+// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK1: omp_offload.cont:
+// CHECK1-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i64 2
+// CHECK1-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK1: arraydestroy.body:
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP15]], [[OMP_OFFLOAD_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1
+// CHECK1-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK1-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]]
+// CHECK1-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK1: arraydestroy.done2:
+// CHECK1-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR2]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[RETVAL]], align 4
+// CHECK1-NEXT: ret i32 [[TMP16]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SIiEC1Ev
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: call void @_ZN1SIiEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SIiEC1Ei
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef signext [[A:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: call void @_ZN1SIiEC2Ei(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef signext [[TMP0]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56
+// CHECK1-SAME: () #[[ATTR4]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK1-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 4
+// CHECK1-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4
+// CHECK1-NEXT: [[_TMP2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr undef, ptr [[_TMP1]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK1-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i64 2
+// CHECK1-NEXT: br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK1: arrayctor.loop:
+// CHECK1-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]])
+// CHECK1-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYCTOR_CUR]], i64 1
+// CHECK1-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK1-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK1: arrayctor.cont:
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]])
+// CHECK1-NEXT: store ptr [[VAR]], ptr [[_TMP2]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 1
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK1-NEXT: br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// CHECK1: omp.inner.for.cond.cleanup:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]])
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP14]])
+// CHECK1-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR2]]
+// CHECK1-NEXT: [[ARRAY_BEGIN4:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN4]], i64 2
+// CHECK1-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK1: arraydestroy.body:
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP15]], [[OMP_LOOP_EXIT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1
+// CHECK1-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK1-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN4]]
+// CHECK1-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE5:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK1: arraydestroy.done5:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK1-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 4
+// CHECK1-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4
+// CHECK1-NEXT: [[_TMP3:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store ptr undef, ptr [[_TMP1]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK1-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i64 2
+// CHECK1-NEXT: br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK1: arrayctor.loop:
+// CHECK1-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]])
+// CHECK1-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYCTOR_CUR]], i64 1
+// CHECK1-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK1-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK1: arrayctor.cont:
+// CHECK1-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]])
+// CHECK1-NEXT: store ptr [[VAR]], ptr [[_TMP3]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 1
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP4:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK1-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// CHECK1: omp.inner.for.cond.cleanup:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[T_VAR]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP11]] to i64
+// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC]], i64 0, i64 [[IDXPROM]]
+// CHECK1-NEXT: store i32 [[TMP10]], ptr [[ARRAYIDX]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[_TMP3]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[IDXPROM5:%.*]] = sext i32 [[TMP13]] to i64
+// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i64 0, i64 [[IDXPROM5]]
+// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ARRAYIDX6]], ptr align 4 [[TMP12]], i64 4, i1 false)
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP14]], 1
+// CHECK1-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP16]])
+// CHECK1-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR2]]
+// CHECK1-NEXT: [[ARRAY_BEGIN8:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN8]], i64 2
+// CHECK1-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK1: arraydestroy.body:
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP17]], [[OMP_LOOP_EXIT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK1-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1
+// CHECK1-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK1-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN8]]
+// CHECK1-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE9:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK1: arraydestroy.done9:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SIiED1Ev
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR2]]
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SIiEC2Ev
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @g, align 4
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[F]], align 4
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SIiEC2Ei
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef signext [[A:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load volatile i32, ptr @g, align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[F]], align 4
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN1SIiED2Ev
+// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_teams_generic_loop_private_codegen.cpp
+// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void @__cxx_global_var_init()
+// CHECK1-NEXT: call void @__cxx_global_var_init.1()
+// CHECK1-NEXT: call void @__cxx_global_var_init.2()
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__cxx_global_var_init
+// CHECK3-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) @test)
+// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN1SIfED1Ev, ptr @test, ptr @__dso_handle) #[[ATTR2:[0-9]+]]
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN1SIfEC1Ev
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: call void @_ZN1SIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN1SIfED1Ev
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR2]]
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN1SIfEC2Ev
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @g, align 4
+// CHECK3-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
+// CHECK3-NEXT: store float [[CONV]], ptr [[F]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN1SIfED2Ev
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__cxx_global_var_init.1
+// CHECK3-SAME: () #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) @s_arr, float noundef 1.000000e+00)
+// CHECK3-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) getelementptr inbounds ([[STRUCT_S:%.*]], ptr @s_arr, i32 1), float noundef 2.000000e+00)
+// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @__cxx_global_array_dtor, ptr null, ptr @__dso_handle) #[[ATTR2]]
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN1SIfEC1Ef
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], float noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca float, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: store float [[A]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: call void @_ZN1SIfEC2Ef(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]], float noundef [[TMP0]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__cxx_global_array_dtor
+// CHECK3-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
+// CHECK3-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK3: arraydestroy.body:
+// CHECK3-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ getelementptr inbounds ([[STRUCT_S:%.*]], ptr @s_arr, i32 2), [[ENTRY:%.*]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK3-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1
+// CHECK3-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK3-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], @s_arr
+// CHECK3-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE1:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK3: arraydestroy.done1:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN1SIfEC2Ef
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], float noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca float, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: store float [[A]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load volatile i32, ptr @g, align 4
+// CHECK3-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to float
+// CHECK3-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[CONV]]
+// CHECK3-NEXT: store float [[ADD]], ptr [[F]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@__cxx_global_var_init.2
+// CHECK3-SAME: () #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) @var, float noundef 3.000000e+00)
+// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN1SIfED1Ev, ptr @var, ptr @__dso_handle) #[[ATTR2]]
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@main
+// CHECK3-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK3-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 0, ptr [[TMP1]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr null, ptr [[TMP2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK3-NEXT: store ptr null, ptr [[TMP3]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK3-NEXT: store ptr null, ptr [[TMP4]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK3-NEXT: store ptr null, ptr [[TMP5]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK3-NEXT: store ptr null, ptr [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK3-NEXT: store ptr null, ptr [[TMP7]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK3-NEXT: store i64 2, ptr [[TMP8]], align 8
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK3-NEXT: store i64 0, ptr [[TMP9]], align 8
+// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP10]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP11]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK3-NEXT: store i32 0, ptr [[TMP12]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96.region_id, ptr [[KERNEL_ARGS]])
+// CHECK3-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+// CHECK3-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK3: omp_offload.failed:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96() #[[ATTR2]]
+// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK3: omp_offload.cont:
+// CHECK3-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiET_v()
+// CHECK3-NEXT: ret i32 [[CALL]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96
+// CHECK3-SAME: () #[[ATTR4:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96.omp_outlined)
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK3-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S], align 4
+// CHECK3-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S:%.*]], align 4
+// CHECK3-NEXT: [[SIVAR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
+// CHECK3-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i32 2
+// CHECK3-NEXT: br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK3: arrayctor.loop:
+// CHECK3-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK3-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]])
+// CHECK3-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYCTOR_CUR]], i32 1
+// CHECK3-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK3-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK3: arrayctor.cont:
+// CHECK3-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]])
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 1
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK3-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// CHECK3: omp.inner.for.cond.cleanup:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96.omp_outlined.omp_outlined, i32 [[TMP7]], i32 [[TMP8]])
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP12]])
+// CHECK3-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR2]]
+// CHECK3-NEXT: [[ARRAY_BEGIN2:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN2]], i32 2
+// CHECK3-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK3: arraydestroy.body:
+// CHECK3-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP13]], [[OMP_LOOP_EXIT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK3-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1
+// CHECK3-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK3-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN2]]
+// CHECK3-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE3:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK3: arraydestroy.done3:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l96.omp_outlined.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK3-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S], align 4
+// CHECK3-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S:%.*]], align 4
+// CHECK3-NEXT: [[SIVAR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
+// CHECK3-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i32 2
+// CHECK3-NEXT: br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK3: arrayctor.loop:
+// CHECK3-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK3-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]])
+// CHECK3-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYCTOR_CUR]], i32 1
+// CHECK3-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK3-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK3: arrayctor.cont:
+// CHECK3-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]])
+// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 1
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK3-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// CHECK3: omp.inner.for.cond.cleanup:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[T_VAR]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC]], i32 0, i32 [[TMP11]]
+// CHECK3-NEXT: store i32 [[TMP10]], ptr [[ARRAYIDX]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[I]], align 4
+// CHECK3-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 [[TMP12]]
+// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX2]], ptr align 4 [[VAR]], i32 4, i1 false)
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[SIVAR]], align 4
+// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP14]], [[TMP13]]
+// CHECK3-NEXT: store i32 [[ADD3]], ptr [[SIVAR]], align 4
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK3-NEXT: store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP16:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP17]])
+// CHECK3-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR2]]
+// CHECK3-NEXT: [[ARRAY_BEGIN5:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN5]], i32 2
+// CHECK3-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK3: arraydestroy.body:
+// CHECK3-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP18]], [[OMP_LOOP_EXIT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK3-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1
+// CHECK3-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK3-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN5]]
+// CHECK3-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE6:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK3: arraydestroy.done6:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIiET_v
+// CHECK3-SAME: () #[[ATTR6:[0-9]+]] comdat {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TEST:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4
+// CHECK3-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK3-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 4
+// CHECK3-NEXT: [[VAR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP1:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]])
+// CHECK3-NEXT: store i32 0, ptr [[T_VAR]], align 4
+// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
+// CHECK3-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_BEGIN]], i32 noundef 1)
+// CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYINIT_BEGIN]], i32 1
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2)
+// CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4
+// CHECK3-NEXT: store ptr undef, ptr [[_TMP1]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 2, ptr [[TMP0]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 0, ptr [[TMP1]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr null, ptr [[TMP2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK3-NEXT: store ptr null, ptr [[TMP3]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK3-NEXT: store ptr null, ptr [[TMP4]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK3-NEXT: store ptr null, ptr [[TMP5]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK3-NEXT: store ptr null, ptr [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK3-NEXT: store ptr null, ptr [[TMP7]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK3-NEXT: store i64 2, ptr [[TMP8]], align 8
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK3-NEXT: store i64 0, ptr [[TMP9]], align 8
+// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP10]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP11]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK3-NEXT: store i32 0, ptr [[TMP12]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB3]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.region_id, ptr [[KERNEL_ARGS]])
+// CHECK3-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+// CHECK3-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK3: omp_offload.failed:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56() #[[ATTR2]]
+// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK3: omp_offload.cont:
+// CHECK3-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK3-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i32 2
+// CHECK3-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK3: arraydestroy.body:
+// CHECK3-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP15]], [[OMP_OFFLOAD_CONT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK3-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1
+// CHECK3-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK3-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN]]
+// CHECK3-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE2:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK3: arraydestroy.done2:
+// CHECK3-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[TEST]]) #[[ATTR2]]
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[RETVAL]], align 4
+// CHECK3-NEXT: ret i32 [[TMP16]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN1SIiEC1Ev
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: call void @_ZN1SIiEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN1SIiEC1Ei
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: call void @_ZN1SIiEC2Ei(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef [[TMP0]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56
+// CHECK3-SAME: () #[[ATTR4]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined)
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP1:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK3-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 4
+// CHECK3-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4
+// CHECK3-NEXT: [[_TMP2:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr undef, ptr [[_TMP1]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK3-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i32 2
+// CHECK3-NEXT: br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK3: arrayctor.loop:
+// CHECK3-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]])
+// CHECK3-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYCTOR_CUR]], i32 1
+// CHECK3-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK3-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK3: arrayctor.cont:
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]])
+// CHECK3-NEXT: store ptr [[VAR]], ptr [[_TMP2]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 1
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK3-NEXT: br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// CHECK3: omp.inner.for.cond.cleanup:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined.omp_outlined, i32 [[TMP7]], i32 [[TMP8]])
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP10]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP12]])
+// CHECK3-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR2]]
+// CHECK3-NEXT: [[ARRAY_BEGIN4:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN4]], i32 2
+// CHECK3-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK3: arraydestroy.body:
+// CHECK3-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP13]], [[OMP_LOOP_EXIT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK3-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1
+// CHECK3-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK3-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN4]]
+// CHECK3-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE5:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK3: arraydestroy.done5:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR4]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[_TMP1:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK3-NEXT: [[S_ARR:%.*]] = alloca [2 x %struct.S.0], align 4
+// CHECK3-NEXT: [[VAR:%.*]] = alloca [[STRUCT_S_0:%.*]], align 4
+// CHECK3-NEXT: [[_TMP2:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store ptr undef, ptr [[_TMP1]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK3-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i32 2
+// CHECK3-NEXT: br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK3: arrayctor.loop:
+// CHECK3-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]])
+// CHECK3-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYCTOR_CUR]], i32 1
+// CHECK3-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK3-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK3: arrayctor.cont:
+// CHECK3-NEXT: call void @_ZN1SIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]])
+// CHECK3-NEXT: store ptr [[VAR]], ptr [[_TMP2]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 1
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK3-NEXT: br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
+// CHECK3: omp.inner.for.cond.cleanup:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[T_VAR]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC]], i32 0, i32 [[TMP11]]
+// CHECK3-NEXT: store i32 [[TMP10]], ptr [[ARRAYIDX]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = load ptr, ptr [[_TMP2]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[I]], align 4
+// CHECK3-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 [[TMP13]]
+// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX4]], ptr align 4 [[TMP12]], i32 4, i1 false)
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP14]], 1
+// CHECK3-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP16]])
+// CHECK3-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[VAR]]) #[[ATTR2]]
+// CHECK3-NEXT: [[ARRAY_BEGIN6:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN6]], i32 2
+// CHECK3-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK3: arraydestroy.body:
+// CHECK3-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ [[TMP17]], [[OMP_LOOP_EXIT]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK3-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i32 -1
+// CHECK3-NEXT: call void @_ZN1SIiED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK3-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], [[ARRAY_BEGIN6]]
+// CHECK3-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE7:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK3: arraydestroy.done7:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN1SIiED1Ev
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: call void @_ZN1SIiED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR2]]
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN1SIiEC2Ev
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @g, align 4
+// CHECK3-NEXT: store i32 [[TMP0]], ptr [[F]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN1SIiEC2Ei
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load volatile i32, ptr @g, align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP1]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[F]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN1SIiED2Ev
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_teams_generic_loop_private_codegen.cpp
+// CHECK3-SAME: () #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: call void @__cxx_global_var_init()
+// CHECK3-NEXT: call void @__cxx_global_var_init.1()
+// CHECK3-NEXT: call void @__cxx_global_var_init.2()
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK3-SAME: () #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@__cxx_global_var_init
+// CHECK9-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: call void @_ZN1SIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) @test)
+// CHECK9-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN1SIfED1Ev, ptr @test, ptr @__dso_handle) #[[ATTR2:[0-9]+]]
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@_ZN1SIfEC1Ev
+// CHECK9-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK9-NEXT: call void @_ZN1SIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]])
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@_ZN1SIfED1Ev
+// CHECK9-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK9-NEXT: call void @_ZN1SIfED2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR2]]
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@_ZN1SIfEC2Ev
+// CHECK9-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK9-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK9-NEXT: [[TMP0:%.*]] = load volatile i32, ptr @g, align 4
+// CHECK9-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
+// CHECK9-NEXT: store float [[CONV]], ptr [[F]], align 4
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@_ZN1SIfED2Ev
+// CHECK9-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@__cxx_global_var_init.1
+// CHECK9-SAME: () #[[ATTR0]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) @s_arr, float noundef 1.000000e+00)
+// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) getelementptr inbounds ([[STRUCT_S:%.*]], ptr @s_arr, i64 1), float noundef 2.000000e+00)
+// CHECK9-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @__cxx_global_array_dtor, ptr null, ptr @__dso_handle) #[[ATTR2]]
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@_ZN1SIfEC1Ef
+// CHECK9-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], float noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca float, align 4
+// CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK9-NEXT: store float [[A]], ptr [[A_ADDR]], align 4
+// CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4
+// CHECK9-NEXT: call void @_ZN1SIfEC2Ef(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]], float noundef [[TMP0]])
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@__cxx_global_array_dtor
+// CHECK9-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK9-NEXT: br label [[ARRAYDESTROY_BODY:%.*]]
+// CHECK9: arraydestroy.body:
+// CHECK9-NEXT: [[ARRAYDESTROY_ELEMENTPAST:%.*]] = phi ptr [ getelementptr inbounds ([[STRUCT_S:%.*]], ptr @s_arr, i64 2), [[ENTRY:%.*]] ], [ [[ARRAYDESTROY_ELEMENT:%.*]], [[ARRAYDESTROY_BODY]] ]
+// CHECK9-NEXT: [[ARRAYDESTROY_ELEMENT]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDESTROY_ELEMENTPAST]], i64 -1
+// CHECK9-NEXT: call void @_ZN1SIfED1Ev(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYDESTROY_ELEMENT]]) #[[ATTR2]]
+// CHECK9-NEXT: [[ARRAYDESTROY_DONE:%.*]] = icmp eq ptr [[ARRAYDESTROY_ELEMENT]], @s_arr
+// CHECK9-NEXT: br i1 [[ARRAYDESTROY_DONE]], label [[ARRAYDESTROY_DONE1:%.*]], label [[ARRAYDESTROY_BODY]]
+// CHECK9: arraydestroy.done1:
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@_ZN1SIfEC2Ef
+// CHECK9-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], float noundef [[A:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca float, align 4
+// CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK9-NEXT: store float [[A]], ptr [[A_ADDR]], align 4
+// CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK9-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK9-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4
+// CHECK9-NEXT: [[TMP1:%.*]] = load volatile i32, ptr @g, align 4
+// CHECK9-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to float
+// CHECK9-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[CONV]]
+// CHECK9-NEXT: store float [[ADD]], ptr [[F]], align 4
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@__cxx_global_var_init.2
+// CHECK9-SAME: () #[[ATTR0]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: call void @_ZN1SIfEC1Ef(ptr noundef nonnull align 4 dereferenceable(4) @var, float noundef 3.000000e+00)
+// CHECK9-NEXT: [[TMP0:%.*]] = call i32 @__cxa_atexit(ptr @_ZN1SIfED1Ev, ptr @var, ptr @__dso_handle) #[[ATTR2]]
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@main
+// CHECK9-SAME: () #[[ATTR3:[0-9]+]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_ANON:%.*]], align 1
+// CHECK9-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK9-NEXT: call void @"_ZZ4mainENK3$_0clEv"(ptr noundef nonnull align 1 dereferenceable(1) [[REF_TMP]])
+// CHECK9-NEXT: ret i32 0
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l75
+// CHECK9-SAME: (i64 noundef [[G1:%.*]]) #[[ATTR5:[0-9]+]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[G1_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[TMP:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: store i64 [[G1]], ptr [[G1_ADDR]], align 8
+// CHECK9-NEXT: store ptr [[G1_ADDR]], ptr [[TMP]], align 8
+// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3:[0-9]+]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l75.omp_outlined)
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l75.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR5]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[G:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[G1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[_TMP2:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[SIVAR:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr undef, ptr [[_TMP1]], align 8
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: store ptr [[G1]], ptr [[_TMP2]], align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK9-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 1
+// CHECK9-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK9: cond.true:
+// CHECK9-NEXT: br label [[COND_END:%.*]]
+// CHECK9: cond.false:
+// CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: br label [[COND_END]]
+// CHECK9: cond.end:
+// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
+// CHECK9-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9: omp.inner.for.cond:
+// CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
+// CHECK9-NEXT: br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9: omp.inner.for.body:
+// CHECK9-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK9-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l75.omp_outlined.omp_outlined, i64 [[TMP8]], i64 [[TMP10]])
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9: omp.inner.for.inc:
+// CHECK9-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP12]]
+// CHECK9-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK9: omp.inner.for.end:
+// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9: omp.loop.exit:
+// CHECK9-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP1]])
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l75.omp_outlined.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]]) #[[ATTR5]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[G:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[G1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[_TMP3:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[SIVAR:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_ANON_0:%.*]], align 8
+// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT: store ptr undef, ptr [[_TMP1]], align 8
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK9-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: [[TMP0:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK9-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT: [[CONV2:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK9-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK9-NEXT: store i32 [[CONV2]], ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: store ptr [[G1]], ptr [[_TMP3]], align 8
+// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP3]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK9-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 1
+// CHECK9-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK9: cond.true:
+// CHECK9-NEXT: br label [[COND_END:%.*]]
+// CHECK9: cond.false:
+// CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: br label [[COND_END]]
+// CHECK9: cond.end:
+// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
+// CHECK9-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK9-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9: omp.inner.for.cond:
+// CHECK9-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: [[CMP4:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
+// CHECK9-NEXT: br i1 [[CMP4]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9: omp.inner.for.body:
+// CHECK9-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP9]], 1
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK9-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK9-NEXT: store i32 1, ptr [[G]], align 4
+// CHECK9-NEXT: [[TMP10:%.*]] = load ptr, ptr [[_TMP3]], align 8
+// CHECK9-NEXT: store volatile i32 1, ptr [[TMP10]], align 4
+// CHECK9-NEXT: store i32 2, ptr [[SIVAR]], align 4
+// CHECK9-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 0
+// CHECK9-NEXT: store ptr [[G]], ptr [[TMP11]], align 8
+// CHECK9-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 1
+// CHECK9-NEXT: [[TMP13:%.*]] = load ptr, ptr [[_TMP3]], align 8
+// CHECK9-NEXT: store ptr [[TMP13]], ptr [[TMP12]], align 8
+// CHECK9-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 2
+// CHECK9-NEXT: store ptr [[SIVAR]], ptr [[TMP14]], align 8
+// CHECK9-NEXT: call void @"_ZZZ4mainENK3$_0clEvENKUlvE_clEv"(ptr noundef nonnull align 8 dereferenceable(24) [[REF_TMP]])
+// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK9: omp.body.continue:
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9: omp.inner.for.inc:
+// CHECK9-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP15]], 1
+// CHECK9-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK9: omp.inner.for.end:
+// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9: omp.loop.exit:
+// CHECK9-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP3]])
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_teams_generic_loop_private_codegen.cpp
+// CHECK9-SAME: () #[[ATTR0]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: call void @__cxx_global_var_init()
+// CHECK9-NEXT: call void @__cxx_global_var_init.1()
+// CHECK9-NEXT: call void @__cxx_global_var_init.2()
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK9-SAME: () #[[ATTR0]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK9-NEXT: ret void
+//
diff --git a/clang/test/OpenMP/teams_generic_loop_reduction_codegen.cpp b/clang/test/OpenMP/teams_generic_loop_reduction_codegen.cpp
new file mode 100644
index 00000000000000..bafba6a2e103bc
--- /dev/null
+++ b/clang/test/OpenMP/teams_generic_loop_reduction_codegen.cpp
@@ -0,0 +1,1524 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK3
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK3
+
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCHECK -verify -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCHECK -fopenmp-simd -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK9
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK9
+
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp-simd -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+template <typename T>
+T tmain() {
+ T t_var = T();
+ T vec[] = {1, 2};
+#pragma omp target
+#pragma omp teams loop reduction(+: t_var)
+ for (int i = 0; i < 2; ++i) {
+ t_var += (T) i;
+ }
+ return T();
+}
+
+int main() {
+ static int sivar;
+#ifdef LAMBDA
+
+ [&]() {
+#pragma omp target
+#pragma omp teams loop reduction(+: sivar)
+ for (int i = 0; i < 2; ++i) {
+
+ // Skip global and bound tid vars
+
+
+
+ // Skip global and bound tid vars, and prev lb and ub vars
+ // skip loop vars
+
+
+ sivar += i;
+
+ [&]() {
+
+ sivar += 4;
+
+ }();
+ }
+ }();
+ return 0;
+#else
+#pragma omp target
+#pragma omp teams loop reduction(+: sivar)
+ for (int i = 0; i < 2; ++i) {
+ sivar += i;
+ }
+ return tmain<int>();
+#endif
+}
+
+
+
+
+// Skip global and bound tid vars
+
+
+// Skip global and bound tid vars, and prev lb and ub
+// skip loop vars
+
+
+
+
+// Skip global and bound tid vars
+
+
+// Skip global and bound tid vars, and prev lb and ub vars
+// skip loop vars
+
+#endif
+// CHECK1-LABEL: define {{[^@]+}}@main
+// CHECK1-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[SIVAR_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr @_ZZ4mainE5sivar, align 4
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[SIVAR_CASTED]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[SIVAR_CASTED]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT: store i64 [[TMP1]], ptr [[TMP2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT: store i64 [[TMP1]], ptr [[TMP3]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK1-NEXT: store ptr null, ptr [[TMP4]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK1-NEXT: store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK1-NEXT: store i32 1, ptr [[TMP8]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK1-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK1-NEXT: store ptr [[TMP6]], ptr [[TMP10]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK1-NEXT: store ptr @.offload_sizes, ptr [[TMP11]], align 8
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK1-NEXT: store ptr @.offload_maptypes, ptr [[TMP12]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP13]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP14]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK1-NEXT: store i64 2, ptr [[TMP15]], align 8
+// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK1-NEXT: store i64 0, ptr [[TMP16]], align 8
+// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK1-NEXT: store i32 0, ptr [[TMP19]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB4:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.region_id, ptr [[KERNEL_ARGS]])
+// CHECK1-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
+// CHECK1-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK1: omp_offload.failed:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68(i64 [[TMP1]]) #[[ATTR2:[0-9]+]]
+// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK1: omp_offload.cont:
+// CHECK1-NEXT: [[CALL:%.*]] = call noundef signext i32 @_Z5tmainIiET_v()
+// CHECK1-NEXT: ret i32 [[CALL]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68
+// CHECK1-SAME: (i64 noundef [[SIVAR:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[SIVAR_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB4]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined, ptr [[SIVAR_ADDR]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[SIVAR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[SIVAR1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[SIVAR]], ptr [[SIVAR_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[SIVAR1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP2]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 1
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB4]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined, i64 [[TMP9]], i64 [[TMP11]], ptr [[SIVAR1]])
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP13]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]])
+// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK1-NEXT: ]
+// CHECK1: .omp.reduction.case1:
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[SIVAR1]], align 4
+// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK1-NEXT: store i32 [[ADD3]], ptr [[TMP0]], align 4
+// CHECK1-NEXT: call void @__kmpc_end_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK1: .omp.reduction.case2:
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[SIVAR1]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP18]] monotonic, align 4
+// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK1: .omp.reduction.default:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[SIVAR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[SIVAR2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[SIVAR]], ptr [[SIVAR_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[SIVAR2]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP4]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP5]], 1
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP6]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK1-NEXT: br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[SIVAR2]], align 4
+// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP12]], [[TMP11]]
+// CHECK1-NEXT: store i32 [[ADD4]], ptr [[SIVAR2]], align 4
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK1-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP4]])
+// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[SIVAR2]], ptr [[TMP14]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK1-NEXT: ]
+// CHECK1: .omp.reduction.case1:
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[SIVAR2]], align 4
+// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK1-NEXT: store i32 [[ADD6]], ptr [[TMP0]], align 4
+// CHECK1-NEXT: call void @__kmpc_end_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK1: .omp.reduction.case2:
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[SIVAR2]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP18]] monotonic, align 4
+// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK1: .omp.reduction.default:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined.omp.reduction.reduction_func
+// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp.reduction.reduction_func
+// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_Z5tmainIiET_v
+// CHECK1-SAME: () #[[ATTR5:[0-9]+]] comdat {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK1-NEXT: [[T_VAR_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[T_VAR]], align 4
+// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i64 8, i1 false)
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[T_VAR]], align 4
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[T_VAR_CASTED]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT: store i64 [[TMP1]], ptr [[TMP2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT: store i64 [[TMP1]], ptr [[TMP3]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK1-NEXT: store ptr null, ptr [[TMP4]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK1-NEXT: store i32 2, ptr [[TMP7]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK1-NEXT: store i32 1, ptr [[TMP8]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK1-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK1-NEXT: store ptr [[TMP6]], ptr [[TMP10]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK1-NEXT: store ptr @.offload_sizes.1, ptr [[TMP11]], align 8
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK1-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP12]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP13]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP14]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK1-NEXT: store i64 2, ptr [[TMP15]], align 8
+// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK1-NEXT: store i64 0, ptr [[TMP16]], align 8
+// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK1-NEXT: store i32 0, ptr [[TMP19]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB4]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.region_id, ptr [[KERNEL_ARGS]])
+// CHECK1-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
+// CHECK1-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK1: omp_offload.failed:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32(i64 [[TMP1]]) #[[ATTR2]]
+// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK1: omp_offload.cont:
+// CHECK1-NEXT: ret i32 0
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32
+// CHECK1-SAME: (i64 noundef [[T_VAR:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[T_VAR_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB4]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined, ptr [[T_VAR_ADDR]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[T_VAR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[T_VAR1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[T_VAR]], ptr [[T_VAR_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[T_VAR1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP2]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 1
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK1-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB4]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined, i64 [[TMP9]], i64 [[TMP11]], ptr [[T_VAR1]])
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP13]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[T_VAR1]], ptr [[TMP14]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK1-NEXT: ]
+// CHECK1: .omp.reduction.case1:
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[T_VAR1]], align 4
+// CHECK1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK1-NEXT: store i32 [[ADD3]], ptr [[TMP0]], align 4
+// CHECK1-NEXT: call void @__kmpc_end_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK1: .omp.reduction.case2:
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[T_VAR1]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP18]] monotonic, align 4
+// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK1: .omp.reduction.default:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[T_VAR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[T_VAR2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[T_VAR]], ptr [[T_VAR_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK1-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[T_VAR2]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP4]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP5]], 1
+// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP6]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK1-NEXT: br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[T_VAR2]], align 4
+// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP12]], [[TMP11]]
+// CHECK1-NEXT: store i32 [[ADD4]], ptr [[T_VAR2]], align 4
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK1-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP4]])
+// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[T_VAR2]], ptr [[TMP14]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK1-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK1-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK1-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK1-NEXT: ]
+// CHECK1: .omp.reduction.case1:
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[T_VAR2]], align 4
+// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK1-NEXT: store i32 [[ADD6]], ptr [[TMP0]], align 4
+// CHECK1-NEXT: call void @__kmpc_end_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK1: .omp.reduction.case2:
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[T_VAR2]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP18]] monotonic, align 4
+// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK1: .omp.reduction.default:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func
+// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func
+// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK1-SAME: () #[[ATTR7:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@main
+// CHECK3-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[SIVAR_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK3-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr @_ZZ4mainE5sivar, align 4
+// CHECK3-NEXT: store i32 [[TMP0]], ptr [[SIVAR_CASTED]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[SIVAR_CASTED]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[TMP2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[TMP3]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK3-NEXT: store ptr null, ptr [[TMP4]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 1, ptr [[TMP8]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK3-NEXT: store ptr [[TMP6]], ptr [[TMP10]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK3-NEXT: store ptr @.offload_sizes, ptr [[TMP11]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK3-NEXT: store ptr @.offload_maptypes, ptr [[TMP12]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK3-NEXT: store ptr null, ptr [[TMP13]], align 4
+// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK3-NEXT: store ptr null, ptr [[TMP14]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK3-NEXT: store i64 2, ptr [[TMP15]], align 8
+// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK3-NEXT: store i64 0, ptr [[TMP16]], align 8
+// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4
+// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK3-NEXT: store i32 0, ptr [[TMP19]], align 4
+// CHECK3-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB4:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.region_id, ptr [[KERNEL_ARGS]])
+// CHECK3-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
+// CHECK3-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK3: omp_offload.failed:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68(i32 [[TMP1]]) #[[ATTR2:[0-9]+]]
+// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK3: omp_offload.cont:
+// CHECK3-NEXT: [[CALL:%.*]] = call noundef i32 @_Z5tmainIiET_v()
+// CHECK3-NEXT: ret i32 [[CALL]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68
+// CHECK3-SAME: (i32 noundef [[SIVAR:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[SIVAR_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB4]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined, ptr [[SIVAR_ADDR]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[SIVAR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[SIVAR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[SIVAR]], ptr [[SIVAR_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[SIVAR1]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP2]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 1
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK3-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB4]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined, i32 [[TMP8]], i32 [[TMP9]], ptr [[SIVAR1]])
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], [[TMP11]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]])
+// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK3-NEXT: store ptr [[SIVAR1]], ptr [[TMP12]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK3-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK3-NEXT: ]
+// CHECK3: .omp.reduction.case1:
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[SIVAR1]], align 4
+// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP14]], [[TMP15]]
+// CHECK3-NEXT: store i32 [[ADD3]], ptr [[TMP0]], align 4
+// CHECK3-NEXT: call void @__kmpc_end_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK3: .omp.reduction.case2:
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[SIVAR1]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP16]] monotonic, align 4
+// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK3: .omp.reduction.default:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[SIVAR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[SIVAR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[SIVAR]], ptr [[SIVAR_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[SIVAR1]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP4]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP5]], 1
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP6]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK3-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[SIVAR1]], align 4
+// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP12]], [[TMP11]]
+// CHECK3-NEXT: store i32 [[ADD3]], ptr [[SIVAR1]], align 4
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK3-NEXT: store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP4]])
+// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK3-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK3-NEXT: ]
+// CHECK3: .omp.reduction.case1:
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[SIVAR1]], align 4
+// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK3-NEXT: store i32 [[ADD5]], ptr [[TMP0]], align 4
+// CHECK3-NEXT: call void @__kmpc_end_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK3: .omp.reduction.case2:
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[SIVAR1]], align 4
+// CHECK3-NEXT: [[TMP19:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP18]] monotonic, align 4
+// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK3: .omp.reduction.default:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp_outlined.omp.reduction.reduction_func
+// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
+// CHECK3-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l68.omp_outlined.omp.reduction.reduction_func
+// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
+// CHECK3-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_Z5tmainIiET_v
+// CHECK3-SAME: () #[[ATTR5:[0-9]+]] comdat {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[T_VAR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[VEC:%.*]] = alloca [2 x i32], align 4
+// CHECK3-NEXT: [[T_VAR_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK3-NEXT: store i32 0, ptr [[T_VAR]], align 4
+// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC]], ptr align 4 @__const._Z5tmainIiET_v.vec, i32 8, i1 false)
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr [[T_VAR]], align 4
+// CHECK3-NEXT: store i32 [[TMP0]], ptr [[T_VAR_CASTED]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[TMP2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[TMP3]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK3-NEXT: store ptr null, ptr [[TMP4]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 2, ptr [[TMP7]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 1, ptr [[TMP8]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK3-NEXT: store ptr [[TMP6]], ptr [[TMP10]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK3-NEXT: store ptr @.offload_sizes.1, ptr [[TMP11]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK3-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP12]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK3-NEXT: store ptr null, ptr [[TMP13]], align 4
+// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK3-NEXT: store ptr null, ptr [[TMP14]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK3-NEXT: store i64 2, ptr [[TMP15]], align 8
+// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK3-NEXT: store i64 0, ptr [[TMP16]], align 8
+// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP17]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4
+// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK3-NEXT: store i32 0, ptr [[TMP19]], align 4
+// CHECK3-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB4]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.region_id, ptr [[KERNEL_ARGS]])
+// CHECK3-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
+// CHECK3-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK3: omp_offload.failed:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32(i32 [[TMP1]]) #[[ATTR2]]
+// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK3: omp_offload.cont:
+// CHECK3-NEXT: ret i32 0
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32
+// CHECK3-SAME: (i32 noundef [[T_VAR:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[T_VAR_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB4]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined, ptr [[T_VAR_ADDR]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[T_VAR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[T_VAR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[T_VAR]], ptr [[T_VAR_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[T_VAR1]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP2]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 1
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK3-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB4]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined, i32 [[TMP8]], i32 [[TMP9]], ptr [[T_VAR1]])
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], [[TMP11]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK3-NEXT: store ptr [[T_VAR1]], ptr [[TMP12]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK3-NEXT: switch i32 [[TMP13]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK3-NEXT: ]
+// CHECK3: .omp.reduction.case1:
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[T_VAR1]], align 4
+// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP14]], [[TMP15]]
+// CHECK3-NEXT: store i32 [[ADD3]], ptr [[TMP0]], align 4
+// CHECK3-NEXT: call void @__kmpc_end_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK3: .omp.reduction.case2:
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[T_VAR1]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP16]] monotonic, align 4
+// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK3: .omp.reduction.default:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[T_VAR:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[T_VAR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[T_VAR1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[T_VAR]], ptr [[T_VAR_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[T_VAR1]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP4]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP5]], 1
+// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP6]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK3-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[T_VAR1]], align 4
+// CHECK3-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP12]], [[TMP11]]
+// CHECK3-NEXT: store i32 [[ADD3]], ptr [[T_VAR1]], align 4
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP13]], 1
+// CHECK3-NEXT: store i32 [[ADD4]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP4]])
+// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
+// CHECK3-NEXT: store ptr [[T_VAR1]], ptr [[TMP14]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i32 4, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK3-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK3-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK3-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK3-NEXT: ]
+// CHECK3: .omp.reduction.case1:
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[T_VAR1]], align 4
+// CHECK3-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK3-NEXT: store i32 [[ADD5]], ptr [[TMP0]], align 4
+// CHECK3-NEXT: call void @__kmpc_end_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK3: .omp.reduction.case2:
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[T_VAR1]], align 4
+// CHECK3-NEXT: [[TMP19:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP18]] monotonic, align 4
+// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK3: .omp.reduction.default:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp_outlined.omp.reduction.reduction_func
+// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
+// CHECK3-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l32.omp_outlined.omp.reduction.reduction_func
+// CHECK3-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR3]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
+// CHECK3-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK3-SAME: () #[[ATTR7:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@main
+// CHECK9-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_ANON:%.*]], align 1
+// CHECK9-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK9-NEXT: call void @"_ZZ4mainENK3$_0clEv"(ptr noundef nonnull align 1 dereferenceable(1) [[REF_TMP]])
+// CHECK9-NEXT: ret i32 0
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45
+// CHECK9-SAME: (i64 noundef [[SIVAR:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[SIVAR_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8
+// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB4:[0-9]+]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined, ptr [[SIVAR_ADDR]])
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[SIVAR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[SIVAR1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[SIVAR]], ptr [[SIVAR_ADDR]], align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 8
+// CHECK9-NEXT: store i32 0, ptr [[SIVAR1]], align 4
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP2]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 1
+// CHECK9-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK9: cond.true:
+// CHECK9-NEXT: br label [[COND_END:%.*]]
+// CHECK9: cond.false:
+// CHECK9-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: br label [[COND_END]]
+// CHECK9: cond.end:
+// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK9-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9: omp.inner.for.cond:
+// CHECK9-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK9-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9: omp.inner.for.body:
+// CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+// CHECK9-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB4]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined.omp_outlined, i64 [[TMP9]], i64 [[TMP11]], ptr [[SIVAR1]])
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9: omp.inner.for.inc:
+// CHECK9-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP13]]
+// CHECK9-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK9: omp.inner.for.end:
+// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9: omp.loop.exit:
+// CHECK9-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]])
+// CHECK9-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK9-NEXT: store ptr [[SIVAR1]], ptr [[TMP14]], align 8
+// CHECK9-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3:[0-9]+]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK9-NEXT: switch i32 [[TMP15]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK9-NEXT: ]
+// CHECK9: .omp.reduction.case1:
+// CHECK9-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK9-NEXT: [[TMP17:%.*]] = load i32, ptr [[SIVAR1]], align 4
+// CHECK9-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
+// CHECK9-NEXT: store i32 [[ADD3]], ptr [[TMP0]], align 4
+// CHECK9-NEXT: call void @__kmpc_end_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK9-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK9: .omp.reduction.case2:
+// CHECK9-NEXT: [[TMP18:%.*]] = load i32, ptr [[SIVAR1]], align 4
+// CHECK9-NEXT: [[TMP19:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP18]] monotonic, align 4
+// CHECK9-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK9: .omp.reduction.default:
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SIVAR:%.*]]) #[[ATTR2]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[SIVAR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[SIVAR2:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_ANON_0:%.*]], align 8
+// CHECK9-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[SIVAR]], ptr [[SIVAR_ADDR]], align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 8
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK9-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: [[TMP1:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT: [[CONV:%.*]] = trunc i64 [[TMP1]] to i32
+// CHECK9-NEXT: [[TMP2:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT: [[CONV1:%.*]] = trunc i64 [[TMP2]] to i32
+// CHECK9-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK9-NEXT: store i32 [[CONV1]], ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: store i32 0, ptr [[SIVAR2]], align 4
+// CHECK9-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2]], i32 [[TMP4]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP5]], 1
+// CHECK9-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK9: cond.true:
+// CHECK9-NEXT: br label [[COND_END:%.*]]
+// CHECK9: cond.false:
+// CHECK9-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: br label [[COND_END]]
+// CHECK9: cond.end:
+// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ 1, [[COND_TRUE]] ], [ [[TMP6]], [[COND_FALSE]] ]
+// CHECK9-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK9-NEXT: store i32 [[TMP7]], ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9: omp.inner.for.cond:
+// CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: [[CMP3:%.*]] = icmp sle i32 [[TMP8]], [[TMP9]]
+// CHECK9-NEXT: br i1 [[CMP3]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9: omp.inner.for.body:
+// CHECK9-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP10]], 1
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK9-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK9-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4
+// CHECK9-NEXT: [[TMP12:%.*]] = load i32, ptr [[SIVAR2]], align 4
+// CHECK9-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP12]], [[TMP11]]
+// CHECK9-NEXT: store i32 [[ADD4]], ptr [[SIVAR2]], align 4
+// CHECK9-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 0
+// CHECK9-NEXT: store ptr [[SIVAR2]], ptr [[TMP13]], align 8
+// CHECK9-NEXT: call void @"_ZZZ4mainENK3$_0clEvENKUlvE_clEv"(ptr noundef nonnull align 8 dereferenceable(8) [[REF_TMP]])
+// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK9: omp.body.continue:
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9: omp.inner.for.inc:
+// CHECK9-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP14]], 1
+// CHECK9-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK9: omp.inner.for.end:
+// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9: omp.loop.exit:
+// CHECK9-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP4]])
+// CHECK9-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK9-NEXT: store ptr [[SIVAR2]], ptr [[TMP15]], align 8
+// CHECK9-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK9-NEXT: switch i32 [[TMP16]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK9-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK9-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK9-NEXT: ]
+// CHECK9: .omp.reduction.case1:
+// CHECK9-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK9-NEXT: [[TMP18:%.*]] = load i32, ptr [[SIVAR2]], align 4
+// CHECK9-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK9-NEXT: store i32 [[ADD6]], ptr [[TMP0]], align 4
+// CHECK9-NEXT: call void @__kmpc_end_reduce_nowait(ptr @[[GLOB3]], i32 [[TMP4]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK9-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK9: .omp.reduction.case2:
+// CHECK9-NEXT: [[TMP19:%.*]] = load i32, ptr [[SIVAR2]], align 4
+// CHECK9-NEXT: [[TMP20:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP19]] monotonic, align 4
+// CHECK9-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK9: .omp.reduction.default:
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined.omp_outlined.omp.reduction.reduction_func
+// CHECK9-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK9-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK9-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK9-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// CHECK9-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// CHECK9-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// CHECK9-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK9-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK9-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l45.omp_outlined.omp.reduction.reduction_func
+// CHECK9-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK9-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK9-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK9-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// CHECK9-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// CHECK9-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// CHECK9-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK9-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK9-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@.omp_offloading.requires_reg
+// CHECK9-SAME: () #[[ATTR6:[0-9]+]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: call void @__tgt_register_requires(i64 1)
+// CHECK9-NEXT: ret void
+//
More information about the cfe-commits
mailing list