[clang] [flang] [llvm] [openmp] [OpenMP][Offload] Add offload runtime support for dyn_groupprivate clause (PR #152831)
Kevin Sala Penades via cfe-commits
cfe-commits at lists.llvm.org
Wed Mar 11 23:05:23 PDT 2026
https://github.com/kevinsala updated https://github.com/llvm/llvm-project/pull/152831
>From 099c502bdf02ed9bc34bbfc70a6e786746ecee90 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Fri, 8 Aug 2025 10:43:52 -0700
Subject: [PATCH 01/38] [OpenMP] Add parser/semantic support for
dyn_groupprivate clause
---
clang/include/clang/AST/OpenMPClause.h | 155 ++++++++++++++++++
clang/include/clang/AST/RecursiveASTVisitor.h | 8 +
.../clang/Basic/DiagnosticSemaKinds.td | 3 +
clang/include/clang/Basic/OpenMPKinds.def | 9 +
clang/include/clang/Basic/OpenMPKinds.h | 10 ++
clang/include/clang/Sema/SemaOpenMP.h | 7 +
clang/lib/AST/OpenMPClause.cpp | 21 +++
clang/lib/AST/StmtProfile.cpp | 6 +
clang/lib/Basic/OpenMPKinds.cpp | 17 ++
clang/lib/Parse/ParseOpenMP.cpp | 47 +++++-
clang/lib/Sema/SemaOpenMP.cpp | 73 ++++++++-
clang/lib/Sema/TreeTransform.h | 26 +++
clang/lib/Serialization/ASTReader.cpp | 16 ++
clang/lib/Serialization/ASTWriter.cpp | 11 ++
.../target_dyn_groupprivate_messages.cpp | 87 ++++++++++
...target_teams_dyn_groupprivate_messages.cpp | 87 ++++++++++
clang/tools/libclang/CIndex.cpp | 5 +
llvm/include/llvm/Frontend/OpenMP/OMP.td | 25 +++
18 files changed, 605 insertions(+), 8 deletions(-)
create mode 100644 clang/test/OpenMP/target_dyn_groupprivate_messages.cpp
create mode 100644 clang/test/OpenMP/target_teams_dyn_groupprivate_messages.cpp
diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h
index 1118d3e062e68..a3983120df069 100644
--- a/clang/include/clang/AST/OpenMPClause.h
+++ b/clang/include/clang/AST/OpenMPClause.h
@@ -9768,6 +9768,161 @@ class OMPXDynCGroupMemClause
Expr *getSize() const { return getStmtAs<Expr>(); }
};
+/// This represents 'dyn_groupprivate' clause in '#pragma omp target ...'
+/// and '#pragma omp teams ...' directives.
+///
+/// \code
+/// #pragma omp target [...] dyn_groupprivate(a,b: N)
+/// \endcode
+/// In this example the clause has the optional modifiers 'a' and 'b' and the
+/// size expression 'N'. The clause stores up to two modifiers together with
+/// their source locations.
+class OMPDynGroupprivateClause : public OMPClause,
+                                 public OMPClauseWithPreInit {
+  friend class OMPClauseReader;
+
+  /// Location of '('.
+  SourceLocation LParenLoc;
+
+  /// Modifiers for 'dyn_groupprivate' clause.
+  enum { FIRST, SECOND, NUM_MODIFIERS };
+  OpenMPDynGroupprivateClauseModifier Modifiers[NUM_MODIFIERS];
+
+  /// Locations of modifiers.
+  SourceLocation ModifiersLoc[NUM_MODIFIERS];
+
+  /// The size of the dyn_groupprivate.
+  Expr *Size = nullptr;
+
+  /// Set the first dyn_groupprivate modifier.
+  ///
+  /// \param M The modifier.
+  void setFirstDynGroupprivateModifier(OpenMPDynGroupprivateClauseModifier M) {
+    Modifiers[FIRST] = M;
+  }
+
+  /// Set the second dyn_groupprivate modifier.
+  ///
+  /// \param M The modifier.
+  void setSecondDynGroupprivateModifier(OpenMPDynGroupprivateClauseModifier M) {
+    Modifiers[SECOND] = M;
+  }
+
+  /// Set location of the first dyn_groupprivate modifier.
+  ///
+  /// \param Loc Location of the first modifier.
+  void setFirstDynGroupprivateModifierLoc(SourceLocation Loc) {
+    ModifiersLoc[FIRST] = Loc;
+  }
+
+  /// Set location of the second dyn_groupprivate modifier.
+  ///
+  /// \param Loc Location of the second modifier.
+  void setSecondDynGroupprivateModifierLoc(SourceLocation Loc) {
+    ModifiersLoc[SECOND] = Loc;
+  }
+
+  /// Set the next unset dyn_groupprivate modifier: the first slot if it is
+  /// still unknown, otherwise the second slot (which must still be unknown).
+  ///
+  /// \param M The modifier.
+  void setDynGroupprivateModifer(OpenMPDynGroupprivateClauseModifier M) {
+    if (Modifiers[FIRST] == OMPC_DYN_GROUPPRIVATE_unknown) {
+      Modifiers[FIRST] = M;
+      return;
+    }
+    assert(Modifiers[SECOND] == OMPC_DYN_GROUPPRIVATE_unknown &&
+           "both dyn_groupprivate modifiers are already set");
+    Modifiers[SECOND] = M;
+  }
+
+  /// Sets the location of '('.
+  ///
+  /// \param Loc Location of '('.
+  void setLParenLoc(SourceLocation Loc) { LParenLoc = Loc; }
+
+  /// Set size.
+  ///
+  /// \param E Size.
+  void setSize(Expr *E) { Size = E; }
+
+public:
+  /// Build 'dyn_groupprivate' clause with a size expression \a Size.
+  ///
+  /// \param StartLoc Starting location of the clause.
+  /// \param LParenLoc Location of '('.
+  /// \param EndLoc Ending location of the clause.
+  /// \param Size Size.
+  /// \param HelperSize Statement installed as the clause's pre-init statement.
+  /// \param CaptureRegion Capture region passed along with \a HelperSize to
+  /// setPreInitStmt().
+  /// \param M1 The first modifier applied to 'dyn_groupprivate' clause.
+  /// \param M1Loc Location of the first modifier.
+  /// \param M2 The second modifier applied to 'dyn_groupprivate' clause.
+  /// \param M2Loc Location of the second modifier.
+  OMPDynGroupprivateClause(SourceLocation StartLoc, SourceLocation LParenLoc,
+                           SourceLocation EndLoc, Expr *Size, Stmt *HelperSize,
+                           OpenMPDirectiveKind CaptureRegion,
+                           OpenMPDynGroupprivateClauseModifier M1,
+                           SourceLocation M1Loc,
+                           OpenMPDynGroupprivateClauseModifier M2,
+                           SourceLocation M2Loc)
+      : OMPClause(llvm::omp::OMPC_dyn_groupprivate, StartLoc, EndLoc),
+        OMPClauseWithPreInit(this), LParenLoc(LParenLoc), Size(Size) {
+    setPreInitStmt(HelperSize, CaptureRegion);
+    Modifiers[FIRST] = M1;
+    Modifiers[SECOND] = M2;
+    ModifiersLoc[FIRST] = M1Loc;
+    ModifiersLoc[SECOND] = M2Loc;
+  }
+
+  /// Build an empty clause. Both modifiers start out as 'unknown'; the
+  /// remaining fields are filled in through the private setters.
+  explicit OMPDynGroupprivateClause()
+      : OMPClause(llvm::omp::OMPC_dyn_groupprivate, SourceLocation(),
+                  SourceLocation()),
+        OMPClauseWithPreInit(this) {
+    Modifiers[FIRST] = OMPC_DYN_GROUPPRIVATE_unknown;
+    Modifiers[SECOND] = OMPC_DYN_GROUPPRIVATE_unknown;
+  }
+
+  /// Get the first modifier of the clause.
+  OpenMPDynGroupprivateClauseModifier getFirstDynGroupprivateModifier() const {
+    return Modifiers[FIRST];
+  }
+
+  /// Get the second modifier of the clause.
+  OpenMPDynGroupprivateClauseModifier getSecondDynGroupprivateModifier() const {
+    return Modifiers[SECOND];
+  }
+
+  /// Get location of '('.
+  SourceLocation getLParenLoc() const { return LParenLoc; }
+
+  /// Get the first modifier location.
+  SourceLocation getFirstDynGroupprivateModifierLoc() const {
+    return ModifiersLoc[FIRST];
+  }
+
+  /// Get the second modifier location.
+  SourceLocation getSecondDynGroupprivateModifierLoc() const {
+    return ModifiersLoc[SECOND];
+  }
+
+  /// Get size.
+  Expr *getSize() { return Size; }
+
+  /// Get size.
+  const Expr *getSize() const { return Size; }
+
+  /// The size expression is the only child of this clause.
+  child_range children() {
+    return child_range(reinterpret_cast<Stmt **>(&Size),
+                       reinterpret_cast<Stmt **>(&Size) + 1);
+  }
+
+  const_child_range children() const {
+    auto Children = const_cast<OMPDynGroupprivateClause *>(this)->children();
+    return const_child_range(Children.begin(), Children.end());
+  }
+
+  /// Returns an empty range.
+  child_range used_children() {
+    return child_range(child_iterator(), child_iterator());
+  }
+  const_child_range used_children() const {
+    return const_child_range(const_child_iterator(), const_child_iterator());
+  }
+
+  static bool classof(const OMPClause *T) {
+    return T->getClauseKind() == llvm::omp::OMPC_dyn_groupprivate;
+  }
+};
+
+
/// This represents the 'doacross' clause for the '#pragma omp ordered'
/// directive.
///
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index 5cb2f57edffe4..129115d56fe82 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -4060,6 +4060,14 @@ bool RecursiveASTVisitor<Derived>::VisitOMPXDynCGroupMemClause(
return true;
}
+/// Visits a 'dyn_groupprivate' clause: first its pre-init data via
+/// VisitOMPClauseWithPreInit, then its size expression via TraverseStmt.
+/// Returns true once both steps have been attempted through TRY_TO.
+template <typename Derived>
+bool RecursiveASTVisitor<Derived>::VisitOMPDynGroupprivateClause(
+ OMPDynGroupprivateClause *C) {
+ TRY_TO(VisitOMPClauseWithPreInit(C));
+ TRY_TO(TraverseStmt(C->getSize()));
+ return true;
+}
+
template <typename Derived>
bool RecursiveASTVisitor<Derived>::VisitOMPDoacrossClause(
OMPDoacrossClause *C) {
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index b285309e0b3ca..edfa2d229789a 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -11995,6 +11995,9 @@ def err_omp_unexpected_schedule_modifier : Error<
"modifier '%0' cannot be used along with modifier '%1'">;
def err_omp_schedule_nonmonotonic_static : Error<
"'nonmonotonic' modifier can only be specified with 'dynamic' or 'guided' schedule kind">;
+def err_omp_unexpected_dyn_groupprivate_modifier
+ : Error<"modifier '%0' cannot be used along with modifier '%1' in "
+ "dyn_groupprivate">;
def err_omp_simple_clause_incompatible_with_ordered : Error<
"'%0' clause with '%1' modifier cannot be specified if an 'ordered' clause is specified">;
def err_omp_ordered_simd : Error<
diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def
index 9d6f816eea91f..3321e19cae9b1 100644
--- a/clang/include/clang/Basic/OpenMPKinds.def
+++ b/clang/include/clang/Basic/OpenMPKinds.def
@@ -83,6 +83,9 @@
#ifndef OPENMP_GRAINSIZE_MODIFIER
#define OPENMP_GRAINSIZE_MODIFIER(Name)
#endif
+#ifndef OPENMP_DYN_GROUPPRIVATE_MODIFIER
+#define OPENMP_DYN_GROUPPRIVATE_MODIFIER(Name)
+#endif
#ifndef OPENMP_NUMTASKS_MODIFIER
#define OPENMP_NUMTASKS_MODIFIER(Name)
#endif
@@ -227,6 +230,11 @@ OPENMP_BIND_KIND(thread)
// Modifiers for the 'grainsize' clause.
OPENMP_GRAINSIZE_MODIFIER(strict)
+// Modifiers for the 'dyn_groupprivate' clause.
+OPENMP_DYN_GROUPPRIVATE_MODIFIER(cgroup)
+OPENMP_DYN_GROUPPRIVATE_MODIFIER(strict)
+OPENMP_DYN_GROUPPRIVATE_MODIFIER(fallback)
+
// Modifiers for the 'num_tasks' clause.
OPENMP_NUMTASKS_MODIFIER(strict)
@@ -245,6 +253,7 @@ OPENMP_DOACROSS_MODIFIER(source_omp_cur_iteration)
#undef OPENMP_NUMTASKS_MODIFIER
#undef OPENMP_NUMTHREADS_MODIFIER
+#undef OPENMP_DYN_GROUPPRIVATE_MODIFIER
#undef OPENMP_GRAINSIZE_MODIFIER
#undef OPENMP_BIND_KIND
#undef OPENMP_ADJUST_ARGS_KIND
diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h
index f40db4c13c55a..3e164bf1adf22 100644
--- a/clang/include/clang/Basic/OpenMPKinds.h
+++ b/clang/include/clang/Basic/OpenMPKinds.h
@@ -217,6 +217,16 @@ enum OpenMPGrainsizeClauseModifier {
OMPC_GRAINSIZE_unknown
};
+/// Modifiers of the 'dyn_groupprivate' clause. The enumerators are generated
+/// from the OPENMP_DYN_GROUPPRIVATE_MODIFIER entries of OpenMPKinds.def and
+/// are followed by the OMPC_DYN_GROUPPRIVATE_unknown sentinel.
+enum OpenMPDynGroupprivateClauseModifier {
+#define OPENMP_DYN_GROUPPRIVATE_MODIFIER(Name) OMPC_DYN_GROUPPRIVATE_##Name,
+#include "clang/Basic/OpenMPKinds.def"
+ OMPC_DYN_GROUPPRIVATE_unknown
+};
+
+/// Number of allowed dyn_groupprivate-modifiers. This equals the value of the
+/// trailing 'unknown' sentinel, which is declared after all real modifiers.
+static constexpr unsigned NumberOfOMPDynGroupprivateClauseModifiers =
+ OMPC_DYN_GROUPPRIVATE_unknown;
+
enum OpenMPNumTasksClauseModifier {
#define OPENMP_NUMTASKS_MODIFIER(Name) OMPC_NUMTASKS_##Name,
#include "clang/Basic/OpenMPKinds.def"
diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h
index 91c3d4bd5210e..3b161ff3c7d45 100644
--- a/clang/include/clang/Sema/SemaOpenMP.h
+++ b/clang/include/clang/Sema/SemaOpenMP.h
@@ -1385,6 +1385,13 @@ class SemaOpenMP : public SemaBase {
SourceLocation LParenLoc,
SourceLocation EndLoc);
+ /// Called on a well-formed 'dyn_groupprivate' clause.
+ ///
+ /// \param M1 First modifier of the clause.
+ /// \param M2 Second modifier of the clause.
+ /// \param Size Size expression of the clause.
+ /// \param StartLoc Starting location of the clause.
+ /// \param LParenLoc Location of '('.
+ /// \param M1Loc Location of the first modifier.
+ /// \param M2Loc Location of the second modifier.
+ /// \param EndLoc Ending location of the clause.
+ OMPClause *ActOnOpenMPDynGroupprivateClause(
+ OpenMPDynGroupprivateClauseModifier M1,
+ OpenMPDynGroupprivateClauseModifier M2, Expr *Size,
+ SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation M1Loc,
+ SourceLocation M2Loc, SourceLocation EndLoc);
+
/// Called on well-formed 'doacross' clause.
OMPClause *
ActOnOpenMPDoacrossClause(OpenMPDoacrossClauseModifier DepType,
diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp
index de8b5996818de..a6e79f1e2230b 100644
--- a/clang/lib/AST/OpenMPClause.cpp
+++ b/clang/lib/AST/OpenMPClause.cpp
@@ -104,6 +104,8 @@ const OMPClauseWithPreInit *OMPClauseWithPreInit::get(const OMPClause *C) {
return static_cast<const OMPFilterClause *>(C);
case OMPC_ompx_dyn_cgroup_mem:
return static_cast<const OMPXDynCGroupMemClause *>(C);
+ case OMPC_dyn_groupprivate:
+ return static_cast<const OMPDynGroupprivateClause *>(C);
case OMPC_default:
case OMPC_proc_bind:
case OMPC_safelen:
@@ -2725,6 +2727,25 @@ void OMPClausePrinter::VisitOMPXDynCGroupMemClause(
OS << ")";
}
+/// Prints the clause as 'dyn_groupprivate([modifier[, modifier]: ]size)'.
+void OMPClausePrinter::VisitOMPDynGroupprivateClause(
+    OMPDynGroupprivateClause *Node) {
+  const OpenMPDynGroupprivateClauseModifier FirstMod =
+      Node->getFirstDynGroupprivateModifier();
+  const OpenMPDynGroupprivateClauseModifier SecondMod =
+      Node->getSecondDynGroupprivateModifier();
+
+  OS << "dyn_groupprivate(";
+  // The modifier list is only emitted when the first modifier is set; the
+  // second modifier is never printed on its own.
+  if (FirstMod != OMPC_DYN_GROUPPRIVATE_unknown) {
+    OS << getOpenMPSimpleClauseTypeName(OMPC_dyn_groupprivate, FirstMod);
+    if (SecondMod != OMPC_DYN_GROUPPRIVATE_unknown)
+      OS << ", "
+         << getOpenMPSimpleClauseTypeName(OMPC_dyn_groupprivate, SecondMod);
+    OS << ": ";
+  }
+  Node->getSize()->printPretty(OS, nullptr, Policy, 0);
+  OS << ')';
+}
+
+
void OMPClausePrinter::VisitOMPDoacrossClause(OMPDoacrossClause *Node) {
OS << "doacross(";
OpenMPDoacrossClauseModifier DepType = Node->getDependenceType();
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index c61450e19f1b6..6b1a016649547 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -957,6 +957,12 @@ void OMPClauseProfiler::VisitOMPXDynCGroupMemClause(
if (Expr *Size = C->getSize())
Profiler->VisitStmt(Size);
}
+// Profiles a 'dyn_groupprivate' clause: the pre-init part first, then the
+// size expression when one is present.
+// NOTE(review): the 'Vist' spelling below presumably matches an existing
+// helper declared elsewhere in this file — confirm before renaming.
+void OMPClauseProfiler::VisitOMPDynGroupprivateClause(
+ const OMPDynGroupprivateClause *C) {
+ VistOMPClauseWithPreInit(C);
+ if (auto *Size = C->getSize())
+ Profiler->VisitStmt(Size);
+}
void OMPClauseProfiler::VisitOMPDoacrossClause(const OMPDoacrossClause *C) {
VisitOMPClauseList(C);
}
diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp
index d3d393bd09396..5a2b578cfb33e 100644
--- a/clang/lib/Basic/OpenMPKinds.cpp
+++ b/clang/lib/Basic/OpenMPKinds.cpp
@@ -171,6 +171,13 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind, StringRef Str,
return OMPC_GRAINSIZE_unknown;
return Type;
}
+ case OMPC_dyn_groupprivate: {
+ return llvm::StringSwitch<unsigned>(Str)
+#define OPENMP_DYN_GROUPPRIVATE_MODIFIER(Name) \
+ .Case(#Name, OMPC_DYN_GROUPPRIVATE_##Name)
+#include "clang/Basic/OpenMPKinds.def"
+ .Default(OMPC_DYN_GROUPPRIVATE_unknown);
+ }
case OMPC_num_tasks: {
unsigned Type = llvm::StringSwitch<unsigned>(Str)
#define OPENMP_NUMTASKS_MODIFIER(Name) .Case(#Name, OMPC_NUMTASKS_##Name)
@@ -508,6 +515,16 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind,
#include "clang/Basic/OpenMPKinds.def"
}
llvm_unreachable("Invalid OpenMP 'grainsize' clause modifier");
+ case OMPC_dyn_groupprivate:
+ switch (Type) {
+ case OMPC_DYN_GROUPPRIVATE_unknown:
+ return "unknown";
+#define OPENMP_DYN_GROUPPRIVATE_MODIFIER(Name) \
+ case OMPC_DYN_GROUPPRIVATE_##Name: \
+ return #Name;
+#include "clang/Basic/OpenMPKinds.def"
+ }
+ llvm_unreachable("Invalid OpenMP 'dyn_groupprivate' clause modifier");
case OMPC_num_tasks:
switch (Type) {
case OMPC_NUMTASKS_unknown:
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index aa6a0c61a2c17..fe3c20b765e52 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -3039,6 +3039,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
case OMPC_align:
case OMPC_message:
case OMPC_ompx_dyn_cgroup_mem:
+ case OMPC_dyn_groupprivate:
// OpenMP [2.5, Restrictions]
// At most one num_threads clause can appear on the directive.
// OpenMP [2.8.1, simd construct, Restrictions]
@@ -3077,7 +3078,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
PP.LookAhead(/*N=*/0).isNot(tok::l_paren))
Clause = ParseOpenMPClause(CKind, WrongDirective);
else if (CKind == OMPC_grainsize || CKind == OMPC_num_tasks ||
- CKind == OMPC_num_threads)
+ CKind == OMPC_num_threads || CKind == OMPC_dyn_groupprivate)
Clause = ParseOpenMPSingleExprWithArgClause(DKind, CKind, WrongDirective);
else
Clause = ParseOpenMPSingleExprClause(CKind, WrongDirective);
@@ -3835,6 +3836,40 @@ OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPDirectiveKind DKind,
Arg.push_back(OMPC_GRAINSIZE_unknown);
KLoc.emplace_back();
}
+ } else if (Kind == OMPC_dyn_groupprivate) {
+ enum { Modifier1, Modifier2, NumberOfElements };
+ Arg.resize(NumberOfElements);
+ KLoc.resize(NumberOfElements);
+ Arg[Modifier1] = OMPC_DYN_GROUPPRIVATE_unknown;
+ Arg[Modifier2] = OMPC_DYN_GROUPPRIVATE_unknown;
+ unsigned Modifier = getOpenMPSimpleClauseType(
+ Kind, Tok.isAnnotation() ? "" : PP.getSpelling(Tok), getLangOpts());
+
+ if (Modifier < OMPC_DYN_GROUPPRIVATE_unknown) {
+ // Parse 'modifier'
+ Arg[Modifier1] = Modifier;
+ KLoc[Modifier1] = Tok.getLocation();
+ if (Tok.isNot(tok::r_paren) && Tok.isNot(tok::comma) &&
+ Tok.isNot(tok::annot_pragma_openmp_end))
+ ConsumeAnyToken();
+ if (Tok.is(tok::comma)) {
+ // Parse ',' 'modifier'
+ ConsumeAnyToken();
+ Modifier = getOpenMPSimpleClauseType(
+ Kind, Tok.isAnnotation() ? "" : PP.getSpelling(Tok), getLangOpts());
+ Arg[Modifier2] = Modifier;
+ KLoc[Modifier2] = Tok.getLocation();
+ if (Tok.isNot(tok::r_paren) && Tok.isNot(tok::comma) &&
+ Tok.isNot(tok::annot_pragma_openmp_end))
+ ConsumeAnyToken();
+ }
+ // Parse ':'
+ if (Tok.is(tok::colon))
+ ConsumeAnyToken();
+ else
+ Diag(Tok, diag::warn_pragma_expected_colon)
+ << "dyn_groupprivate modifier";
+ }
} else if (Kind == OMPC_num_tasks) {
// Parse optional <num_tasks modifier> ':'
OpenMPNumTasksClauseModifier Modifier =
@@ -3909,11 +3944,11 @@ OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPDirectiveKind DKind,
}
}
- bool NeedAnExpression = (Kind == OMPC_schedule && DelimLoc.isValid()) ||
- (Kind == OMPC_dist_schedule && DelimLoc.isValid()) ||
- Kind == OMPC_if || Kind == OMPC_device ||
- Kind == OMPC_grainsize || Kind == OMPC_num_tasks ||
- Kind == OMPC_num_threads;
+ bool NeedAnExpression =
+ (Kind == OMPC_schedule && DelimLoc.isValid()) ||
+ (Kind == OMPC_dist_schedule && DelimLoc.isValid()) || Kind == OMPC_if ||
+ Kind == OMPC_device || Kind == OMPC_grainsize || Kind == OMPC_num_tasks ||
+ Kind == OMPC_num_threads || Kind == OMPC_dyn_groupprivate;
if (NeedAnExpression) {
SourceLocation ELoc = Tok.getLocation();
ExprResult LHS(
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 4ecc9b0d4c5c8..c63310757513e 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -15560,6 +15560,7 @@ OMPClause *SemaOpenMP::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind,
case OMPC_holds:
Res = ActOnOpenMPHoldsClause(Expr, StartLoc, LParenLoc, EndLoc);
break;
+ case OMPC_dyn_groupprivate:
case OMPC_grainsize:
case OMPC_num_tasks:
case OMPC_num_threads:
@@ -15686,6 +15687,8 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause(
case OMPC_num_teams:
case OMPC_thread_limit:
case OMPC_ompx_dyn_cgroup_mem:
+ case OMPC_dyn_groupprivate:
+ // TODO: This may need to consider teams too.
if (Leafs[0] == OMPD_target)
return OMPD_target;
break;
@@ -16646,7 +16649,7 @@ OMPClause *SemaOpenMP::ActOnOpenMPSingleExprWithArgClause(
SourceLocation EndLoc) {
OMPClause *Res = nullptr;
switch (Kind) {
- case OMPC_schedule:
+ case OMPC_schedule: {
enum { Modifier1, Modifier2, ScheduleKind, NumberOfElements };
assert(Argument.size() == NumberOfElements &&
ArgumentLoc.size() == NumberOfElements);
@@ -16656,7 +16659,7 @@ OMPClause *SemaOpenMP::ActOnOpenMPSingleExprWithArgClause(
static_cast<OpenMPScheduleClauseKind>(Argument[ScheduleKind]), Expr,
StartLoc, LParenLoc, ArgumentLoc[Modifier1], ArgumentLoc[Modifier2],
ArgumentLoc[ScheduleKind], DelimLoc, EndLoc);
- break;
+ } break;
case OMPC_if:
assert(Argument.size() == 1 && ArgumentLoc.size() == 1);
Res = ActOnOpenMPIfClause(static_cast<OpenMPDirectiveKind>(Argument.back()),
@@ -16703,6 +16706,16 @@ OMPClause *SemaOpenMP::ActOnOpenMPSingleExprWithArgClause(
static_cast<OpenMPNumTasksClauseModifier>(Argument.back()), Expr,
StartLoc, LParenLoc, ArgumentLoc.back(), EndLoc);
break;
+ case OMPC_dyn_groupprivate: {
+ enum { Modifier1, Modifier2, NumberOfElements };
+ assert(Argument.size() == NumberOfElements &&
+ ArgumentLoc.size() == NumberOfElements);
+ Res = ActOnOpenMPDynGroupprivateClause(
+ static_cast<OpenMPDynGroupprivateClauseModifier>(Argument[Modifier1]),
+ static_cast<OpenMPDynGroupprivateClauseModifier>(Argument[Modifier2]),
+ Expr, StartLoc, LParenLoc, ArgumentLoc[Modifier1],
+ ArgumentLoc[Modifier2], EndLoc);
+ } break;
case OMPC_num_threads:
assert(Argument.size() == 1 && ArgumentLoc.size() == 1 &&
"Modifier for num_threads clause and its location are expected.");
@@ -17056,6 +17069,7 @@ OMPClause *SemaOpenMP::ActOnOpenMPClause(OpenMPClauseKind Kind,
case OMPC_affinity:
case OMPC_when:
case OMPC_ompx_dyn_cgroup_mem:
+ case OMPC_dyn_groupprivate:
default:
llvm_unreachable("Clause is not allowed.");
}
@@ -24143,6 +24157,61 @@ OMPClause *SemaOpenMP::ActOnOpenMPXDynCGroupMemClause(Expr *Size,
ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, EndLoc);
}
+// Semantic analysis for the 'dyn_groupprivate' clause.
+//
+// Validates the (up to two) modifiers and the size expression, and on success
+// returns a newly allocated OMPDynGroupprivateClause. Returns nullptr when a
+// modifier is unrecognized, when the two modifiers conflict, or when the size
+// expression fails the non-negative integer check.
+OMPClause *SemaOpenMP::ActOnOpenMPDynGroupprivateClause(
+ OpenMPDynGroupprivateClauseModifier M1,
+ OpenMPDynGroupprivateClauseModifier M2, Expr *Size, SourceLocation StartLoc,
+ SourceLocation LParenLoc, SourceLocation M1Loc, SourceLocation M2Loc,
+ SourceLocation EndLoc) {
+
+ // A valid modifier location paired with an 'unknown' modifier means that
+ // something was written in a modifier position but was not recognized.
+ // Report the accepted values at the first offending location.
+ if ((M1Loc.isValid() && M1 == OMPC_DYN_GROUPPRIVATE_unknown) ||
+ (M2Loc.isValid() && M2 == OMPC_DYN_GROUPPRIVATE_unknown)) {
+ std::string Values = getListOfPossibleValues(
+ OMPC_dyn_groupprivate, /*First=*/0, OMPC_DYN_GROUPPRIVATE_unknown);
+ Diag((M1Loc.isValid() && M1 == OMPC_DYN_GROUPPRIVATE_unknown) ? M1Loc
+ : M2Loc,
+ diag::err_omp_unexpected_clause_value)
+ << Values << getOpenMPClauseName(OMPC_dyn_groupprivate);
+ return nullptr;
+ }
+
+ // Reject a repeated modifier as well as the 'strict'/'fallback' pair in
+ // either order. The diagnostic is reported at the second modifier and names
+ // the second modifier first, then the first one.
+ if ((M1Loc.isValid() && M2Loc.isValid() && M1 == M2) ||
+ (M1 == OMPC_DYN_GROUPPRIVATE_strict &&
+ M2 == OMPC_DYN_GROUPPRIVATE_fallback) ||
+ (M1 == OMPC_DYN_GROUPPRIVATE_fallback &&
+ M2 == OMPC_DYN_GROUPPRIVATE_strict)) {
+
+ Diag(M2Loc, diag::err_omp_unexpected_dyn_groupprivate_modifier)
+ << getOpenMPSimpleClauseTypeName(OMPC_dyn_groupprivate, M2)
+ << getOpenMPSimpleClauseTypeName(OMPC_dyn_groupprivate, M1);
+ return nullptr;
+ }
+
+ Expr *ValExpr = Size;
+ Stmt *HelperValStmt = nullptr;
+
+ // The dyn_groupprivate expression must evaluate to a non-negative integer
+ // value; zero is accepted because StrictlyPositive is false below.
+ // NOTE(review): the original comment cited "OpenMP [2.5, Restrictions]",
+ // which looks copied from another clause — confirm the spec reference.
+ if (!isNonNegativeIntegerValue(ValExpr, SemaRef, OMPC_dyn_groupprivate,
+ /*StrictlyPositive=*/false))
+ return nullptr;
+
+ // When a capture region applies and the current context is not dependent,
+ // capture the size expression and build the matching pre-init statement.
+ OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective();
+ OpenMPDirectiveKind CaptureRegion = getOpenMPCaptureRegionForClause(
+ DKind, OMPC_dyn_groupprivate, getLangOpts().OpenMP);
+ if (CaptureRegion != OMPD_unknown &&
+ !SemaRef.CurContext->isDependentContext()) {
+ ValExpr = SemaRef.MakeFullExpr(ValExpr).get();
+ llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
+ ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get();
+ HelperValStmt = buildPreInits(getASTContext(), Captures);
+ }
+
+ return new (getASTContext()) OMPDynGroupprivateClause(
+ StartLoc, LParenLoc, EndLoc, ValExpr, HelperValStmt, CaptureRegion, M1,
+ M1Loc, M2, M2Loc);
+}
+
OMPClause *SemaOpenMP::ActOnOpenMPDoacrossClause(
OpenMPDoacrossClauseModifier DepType, SourceLocation DepLoc,
SourceLocation ColonLoc, ArrayRef<Expr *> VarList, SourceLocation StartLoc,
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 758012f894a41..6222d63374824 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -2430,6 +2430,19 @@ class TreeTransform {
LParenLoc, EndLoc);
}
+ /// Build a new OpenMP 'dyn_groupprivate' clause.
+ ///
+ /// By default, performs semantic analysis to build the new OpenMP clause.
+ /// Subclasses may override this routine to provide different behavior.
+ ///
+ /// All arguments are forwarded unchanged to
+ /// SemaOpenMP::ActOnOpenMPDynGroupprivateClause, whose result is returned.
+ OMPClause *RebuildOMPDynGroupprivateClause(
+ OpenMPDynGroupprivateClauseModifier M1,
+ OpenMPDynGroupprivateClauseModifier M2, Expr *Size,
+ SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation M1Loc,
+ SourceLocation M2Loc, SourceLocation EndLoc) {
+ return getSema().OpenMP().ActOnOpenMPDynGroupprivateClause(
+ M1, M2, Size, StartLoc, LParenLoc, M1Loc, M2Loc, EndLoc);
+ }
+
/// Build a new OpenMP 'ompx_attribute' clause.
///
/// By default, performs semantic analysis to build the new OpenMP clause.
@@ -11691,6 +11704,19 @@ OMPClause *TreeTransform<Derived>::TransformOMPXDynCGroupMemClause(
Size.get(), C->getBeginLoc(), C->getLParenLoc(), C->getEndLoc());
}
+/// Transforms a 'dyn_groupprivate' clause: the size expression is
+/// transformed, while the modifiers and all source locations are carried over
+/// from the original clause. Returns nullptr if transforming the size
+/// expression fails.
+template <typename Derived>
+OMPClause *TreeTransform<Derived>::TransformOMPDynGroupprivateClause(
+ OMPDynGroupprivateClause *C) {
+ ExprResult Size = getDerived().TransformExpr(C->getSize());
+ if (Size.isInvalid())
+ return nullptr;
+ return getDerived().RebuildOMPDynGroupprivateClause(
+ C->getFirstDynGroupprivateModifier(),
+ C->getSecondDynGroupprivateModifier(), Size.get(), C->getBeginLoc(),
+ C->getLParenLoc(), C->getFirstDynGroupprivateModifierLoc(),
+ C->getSecondDynGroupprivateModifierLoc(), C->getEndLoc());
+}
+
template <typename Derived>
OMPClause *
TreeTransform<Derived>::TransformOMPDoacrossClause(OMPDoacrossClause *C) {
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 30e0973149594..84a30fcfcea59 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -11417,6 +11417,9 @@ OMPClause *OMPClauseReader::readClause() {
case llvm::omp::OMPC_ompx_dyn_cgroup_mem:
C = new (Context) OMPXDynCGroupMemClause();
break;
+ case llvm::omp::OMPC_dyn_groupprivate:
+ C = new (Context) OMPDynGroupprivateClause();
+ break;
case llvm::omp::OMPC_doacross: {
unsigned NumVars = Record.readInt();
unsigned NumLoops = Record.readInt();
@@ -12583,6 +12586,19 @@ void OMPClauseReader::VisitOMPXDynCGroupMemClause(OMPXDynCGroupMemClause *C) {
C->setLParenLoc(Record.readSourceLocation());
}
+// Deserializes a 'dyn_groupprivate' clause. The read order (pre-init data,
+// first modifier, second modifier, size expression, '(' location, first and
+// second modifier locations) must stay in sync with
+// OMPClauseWriter::VisitOMPDynGroupprivateClause.
+void OMPClauseReader::VisitOMPDynGroupprivateClause(
+ OMPDynGroupprivateClause *C) {
+ VisitOMPClauseWithPreInit(C);
+ C->setFirstDynGroupprivateModifier(
+ static_cast<OpenMPDynGroupprivateClauseModifier>(Record.readInt()));
+ C->setSecondDynGroupprivateModifier(
+ static_cast<OpenMPDynGroupprivateClauseModifier>(Record.readInt()));
+ C->setSize(Record.readSubExpr());
+ C->setLParenLoc(Record.readSourceLocation());
+ C->setFirstDynGroupprivateModifierLoc(Record.readSourceLocation());
+ C->setSecondDynGroupprivateModifierLoc(Record.readSourceLocation());
+}
+
void OMPClauseReader::VisitOMPDoacrossClause(OMPDoacrossClause *C) {
C->setLParenLoc(Record.readSourceLocation());
C->setDependenceType(
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 1a20fc9595dce..42d1ab91a6879 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -8586,6 +8586,17 @@ void OMPClauseWriter::VisitOMPXDynCGroupMemClause(OMPXDynCGroupMemClause *C) {
Record.AddSourceLocation(C->getLParenLoc());
}
+// Serializes a 'dyn_groupprivate' clause. The write order (pre-init data,
+// first modifier, second modifier, size expression, '(' location, first and
+// second modifier locations) must stay in sync with
+// OMPClauseReader::VisitOMPDynGroupprivateClause.
+void OMPClauseWriter::VisitOMPDynGroupprivateClause(
+ OMPDynGroupprivateClause *C) {
+ VisitOMPClauseWithPreInit(C);
+ Record.push_back(C->getFirstDynGroupprivateModifier());
+ Record.push_back(C->getSecondDynGroupprivateModifier());
+ Record.AddStmt(C->getSize());
+ Record.AddSourceLocation(C->getLParenLoc());
+ Record.AddSourceLocation(C->getFirstDynGroupprivateModifierLoc());
+ Record.AddSourceLocation(C->getSecondDynGroupprivateModifierLoc());
+}
+
void OMPClauseWriter::VisitOMPDoacrossClause(OMPDoacrossClause *C) {
Record.push_back(C->varlist_size());
Record.push_back(C->getNumLoops());
diff --git a/clang/test/OpenMP/target_dyn_groupprivate_messages.cpp b/clang/test/OpenMP/target_dyn_groupprivate_messages.cpp
new file mode 100644
index 0000000000000..d5d855ee33e1f
--- /dev/null
+++ b/clang/test/OpenMP/target_dyn_groupprivate_messages.cpp
@@ -0,0 +1,87 @@
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 %s -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 %s -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp %s -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -Wuninitialized
+
+// Empty function used as the statement associated with each directive below.
+void foo() {
+}
+
+// Returns argc converted to bool; used below to pass a bool-typed expression
+// as the clause argument.
+bool foobool(int argc) {
+ return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+// Exercises the 'dyn_groupprivate' clause on '#pragma omp target' inside a
+// function template: malformed parentheses, missing or invalid expressions,
+// a duplicated clause, and finally one well-formed use.
+template <class T, class S> // expected-note {{declared here}}
+int tmain(T argc, S **argv) {
+ T z;
+ #pragma omp target dyn_groupprivate // expected-error {{expected '(' after 'dyn_groupprivate'}}
+ foo();
+ #pragma omp target dyn_groupprivate ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp target dyn_groupprivate () // expected-error {{expected expression}}
+ foo();
+ #pragma omp target dyn_groupprivate (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp target dyn_groupprivate (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
+ foo();
+ #pragma omp target dyn_groupprivate (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+ foo();
+ #pragma omp target dyn_groupprivate (foobool(argc)), dyn_groupprivate (true) // expected-error {{directive '#pragma omp target' cannot contain more than one 'dyn_groupprivate' clause}}
+ foo();
+ #pragma omp target dyn_groupprivate (S) // expected-error {{'S' does not refer to a value}}
+ foo();
+ #pragma omp target dyn_groupprivate (argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp target dyn_groupprivate (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ // Well-formed use: no diagnostic expected.
+ #pragma omp target dyn_groupprivate(argc+z)
+ foo();
+ return 0;
+}
+
+// Non-template variant of the checks in tmain, extended with a negative
+// size value and the modifier-related diagnostics. Also instantiates tmain.
+int main(int argc, char **argv) {
+constexpr int n = -1;
+int z;
+ #pragma omp target dyn_groupprivate // expected-error {{expected '(' after 'dyn_groupprivate'}}
+ foo();
+ #pragma omp target dyn_groupprivate ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp target dyn_groupprivate () // expected-error {{expected expression}}
+ foo();
+ #pragma omp target dyn_groupprivate (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp target dyn_groupprivate (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
+ foo();
+ #pragma omp target dyn_groupprivate (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+ foo();
+ #pragma omp target dyn_groupprivate (foobool(argc)), dyn_groupprivate (true) // expected-error {{directive '#pragma omp target' cannot contain more than one 'dyn_groupprivate' clause}}
+ foo();
+ #pragma omp target dyn_groupprivate (S1) // expected-error {{'S1' does not refer to a value}}
+ foo();
+ #pragma omp target dyn_groupprivate (argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp target dyn_groupprivate (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp target dyn_groupprivate (1 0) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp target dyn_groupprivate(dyn_groupprivate(tmain(argc, argv) // expected-error2 {{expected ')'}} expected-note2 {{to match this '('}} expected-note {{in instantiation of function template specialization 'tmain<int, char>' requested here}}
+ foo();
+ // A negative constant size is rejected; the clause requires a non-negative
+ // value.
+ #pragma omp target dyn_groupprivate(-1) // expected-error {{argument to 'dyn_groupprivate' clause must be a non-negative integer value}}
+ foo();
+ // A misspelled modifier name is diagnosed as an undeclared identifier.
+ #pragma omp target dyn_groupprivate(cgrou) // expected-error {{use of undeclared identifier 'cgrou'}}
+ foo();
+ #pragma omp target dyn_groupprivate(cgrou: argc) // expected-error {{use of undeclared identifier 'cgrou'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ // Repeated or mutually exclusive modifiers.
+ #pragma omp target dyn_groupprivate(cgroup,cgroup: argc) // expected-error {{modifier 'cgroup' cannot be used along with modifier 'cgroup' in dyn_groupprivate}}
+ foo();
+ #pragma omp target dyn_groupprivate(fallback,strict: argc) // expected-error {{modifier 'strict' cannot be used along with modifier 'fallback' in dyn_groupprivate}}
+ foo();
+ #pragma omp target dyn_groupprivate(strict,fallback: argc) // expected-error {{modifier 'fallback' cannot be used along with modifier 'strict' in dyn_groupprivate}}
+ foo();
+ // A ':' without any preceding modifier.
+ #pragma omp target dyn_groupprivate(: argc) // expected-error {{expected ')'}} expected-error {{expected expression}} expected-note {{to match this '('}}
+ foo();
+
+ return tmain(argc, argv);
+}
+
diff --git a/clang/test/OpenMP/target_teams_dyn_groupprivate_messages.cpp b/clang/test/OpenMP/target_teams_dyn_groupprivate_messages.cpp
new file mode 100644
index 0000000000000..422dff547355c
--- /dev/null
+++ b/clang/test/OpenMP/target_teams_dyn_groupprivate_messages.cpp
@@ -0,0 +1,87 @@
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 %s -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 %s -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp %s -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd %s -Wuninitialized
+
+// Empty callee; a call to it is the statement that follows every pragma under
+// test in this file.
+void foo() {
+}
+
+// Returns argc converted to bool; used below as a clause argument in the
+// duplicate-clause cases.
+bool foobool(int argc) {
+ return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+// Template half of the test: each '#pragma omp target teams' below carries a
+// 'dyn_groupprivate' clause plus the expected-* directives checked by the
+// -verify RUN lines. The last pragma (argc+z) carries no expected-* directive.
+template <class T, class S> // expected-note {{declared here}}
+int tmain(T argc, S **argv) {
+ T z;
+ #pragma omp target teams dyn_groupprivate // expected-error {{expected '(' after 'dyn_groupprivate'}}
+ foo();
+ #pragma omp target teams dyn_groupprivate ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp target teams dyn_groupprivate () // expected-error {{expected expression}}
+ foo();
+ #pragma omp target teams dyn_groupprivate (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp target teams dyn_groupprivate (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target teams' are ignored}}
+ foo();
+ #pragma omp target teams dyn_groupprivate (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+ foo();
+ #pragma omp target teams dyn_groupprivate (foobool(argc)), dyn_groupprivate (true) // expected-error {{directive '#pragma omp target teams' cannot contain more than one 'dyn_groupprivate' clause}}
+ foo();
+ #pragma omp target teams dyn_groupprivate (S) // expected-error {{'S' does not refer to a value}}
+ foo();
+ #pragma omp target teams dyn_groupprivate (argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp target teams dyn_groupprivate (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp target teams dyn_groupprivate(argc+z)
+ foo();
+ return 0;
+}
+
+// Non-template half of the test: repeats the malformed-clause cases from
+// tmain and adds literal, negative-constant and modifier cases (cgroup,
+// fallback, strict). The nested-clause pragma also names tmain(argc, argv);
+// its expected-note records the 'tmain<int, char>' instantiation requested
+// there. The function ends by returning tmain(argc, argv).
+int main(int argc, char **argv) {
+constexpr int n = -1;
+int z;
+ #pragma omp target teams dyn_groupprivate // expected-error {{expected '(' after 'dyn_groupprivate'}}
+ foo();
+ #pragma omp target teams dyn_groupprivate ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp target teams dyn_groupprivate () // expected-error {{expected expression}}
+ foo();
+ #pragma omp target teams dyn_groupprivate (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp target teams dyn_groupprivate (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target teams' are ignored}}
+ foo();
+ #pragma omp target teams dyn_groupprivate (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+ foo();
+ #pragma omp target teams dyn_groupprivate (foobool(argc)), dyn_groupprivate (true) // expected-error {{directive '#pragma omp target teams' cannot contain more than one 'dyn_groupprivate' clause}}
+ foo();
+ #pragma omp target teams dyn_groupprivate (S1) // expected-error {{'S1' does not refer to a value}}
+ foo();
+ #pragma omp target teams dyn_groupprivate (argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp target teams dyn_groupprivate (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp target teams dyn_groupprivate (1 0) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp target teams dyn_groupprivate(dyn_groupprivate(tmain(argc, argv) // expected-error2 {{expected ')'}} expected-note2 {{to match this '('}} expected-note {{in instantiation of function template specialization 'tmain<int, char>' requested here}}
+ foo();
+ #pragma omp target teams dyn_groupprivate(-1) // expected-error {{argument to 'dyn_groupprivate' clause must be a non-negative integer value}}
+ foo();
+ #pragma omp target teams dyn_groupprivate(cgrou) // expected-error {{use of undeclared identifier 'cgrou'}}
+ foo();
+ #pragma omp target teams dyn_groupprivate(cgrou: argc) // expected-error {{use of undeclared identifier 'cgrou'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp target teams dyn_groupprivate(cgroup,cgroup: argc) // expected-error {{modifier 'cgroup' cannot be used along with modifier 'cgroup' in dyn_groupprivate}}
+ foo();
+ #pragma omp target teams dyn_groupprivate(fallback,strict: argc) // expected-error {{modifier 'strict' cannot be used along with modifier 'fallback' in dyn_groupprivate}}
+ foo();
+ #pragma omp target teams dyn_groupprivate(strict,fallback: argc) // expected-error {{modifier 'fallback' cannot be used along with modifier 'strict' in dyn_groupprivate}}
+ foo();
+ #pragma omp target teams dyn_groupprivate(: argc) // expected-error {{expected ')'}} expected-error {{expected expression}} expected-note {{to match this '('}}
+ foo();
+
+ return tmain(argc, argv);
+}
+
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index 9089984fa4a54..32d75d4cd8d38 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -2808,6 +2808,11 @@ void OMPClauseEnqueue::VisitOMPXDynCGroupMemClause(
VisitOMPClauseWithPreInit(C);
Visitor->AddStmt(C->getSize());
}
+// Enqueue the children of a 'dyn_groupprivate' clause: after the
+// VisitOMPClauseWithPreInit call, the clause's size expression is handed to
+// the visitor through AddStmt.
+void OMPClauseEnqueue::VisitOMPDynGroupprivateClause(
+ const OMPDynGroupprivateClause *C) {
+ VisitOMPClauseWithPreInit(C);
+ Visitor->AddStmt(C->getSize());
+}
void OMPClauseEnqueue::VisitOMPDoacrossClause(const OMPDoacrossClause *C) {
VisitOMPClauseList(C);
}
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index 1b94657dfae1e..78442a63b3ac4 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -178,6 +178,9 @@ def OMPC_Doacross : Clause<[Spelling<"doacross">]> {
def OMPC_DynamicAllocators : Clause<[Spelling<"dynamic_allocators">]> {
let clangClass = "OMPDynamicAllocatorsClause";
}
+// Clause record for the "dyn_groupprivate" spelling; clangClass names the
+// Clang class OMPDynGroupprivateClause. No flangClass is set on this record.
+def OMPC_DynGroupprivate : Clause<[Spelling<"dyn_groupprivate">]> {
+ let clangClass = "OMPDynGroupprivateClause";
+}
def OMPC_Enter : Clause<[Spelling<"enter">]> {
let flangClass = "OmpObjectList";
}
@@ -1104,6 +1107,7 @@ def OMP_Target : Directive<[Spelling<"target">]> {
let allowedOnceClauses = [
VersionedClause<OMPC_DefaultMap>,
VersionedClause<OMPC_Device>,
+ VersionedClause<OMPC_DynGroupprivate>,
VersionedClause<OMPC_If>,
VersionedClause<OMPC_NoWait>,
VersionedClause<OMPC_OMPX_Bare>,
@@ -1254,6 +1258,7 @@ def OMP_Teams : Directive<[Spelling<"teams">]> {
];
let allowedOnceClauses = [
VersionedClause<OMPC_Default>,
+ VersionedClause<OMPC_DynGroupprivate>,
VersionedClause<OMPC_If, 52>,
VersionedClause<OMPC_NumTeams>,
VersionedClause<OMPC_ThreadLimit>,
@@ -1522,6 +1527,7 @@ def OMP_target_loop : Directive<[Spelling<"target loop">]> {
let allowedOnceClauses = [
VersionedClause<OMPC_Bind, 50>,
VersionedClause<OMPC_Collapse>,
+ VersionedClause<OMPC_DynGroupprivate>,
VersionedClause<OMPC_Order>,
VersionedClause<OMPC_ThreadLimit>,
VersionedClause<OMPC_OMPX_DynCGroupMem>,
@@ -1983,6 +1989,7 @@ def OMP_TargetParallel : Directive<[Spelling<"target parallel">]> {
let allowedOnceClauses = [
VersionedClause<OMPC_DefaultMap>,
VersionedClause<OMPC_Device>,
+ VersionedClause<OMPC_DynGroupprivate>,
VersionedClause<OMPC_NumThreads>,
VersionedClause<OMPC_OMPX_DynCGroupMem>,
VersionedClause<OMPC_ProcBind>,
@@ -2086,6 +2093,7 @@ def OMP_TargetParallelFor : Directive<[Spelling<"target parallel for">]> {
VersionedClause<OMPC_UsesAllocators, 50>,
];
let allowedOnceClauses = [
+ VersionedClause<OMPC_DynGroupprivate>,
VersionedClause<OMPC_OMPX_DynCGroupMem>,
VersionedClause<OMPC_ThreadLimit, 51>,
];
@@ -2126,6 +2134,7 @@ def OMP_TargetParallelForSimd
VersionedClause<OMPC_UsesAllocators, 50>,
];
let allowedOnceClauses = [
+ VersionedClause<OMPC_DynGroupprivate>,
VersionedClause<OMPC_OMPX_DynCGroupMem>,
VersionedClause<OMPC_ThreadLimit, 51>,
];
@@ -2155,6 +2164,7 @@ def OMP_target_parallel_loop : Directive<[Spelling<"target parallel loop">]> {
VersionedClause<OMPC_Collapse>,
VersionedClause<OMPC_Default>,
VersionedClause<OMPC_DefaultMap>,
+ VersionedClause<OMPC_DynGroupprivate>,
VersionedClause<OMPC_NoWait>,
VersionedClause<OMPC_NumThreads>,
VersionedClause<OMPC_OMPX_DynCGroupMem>,
@@ -2189,6 +2199,7 @@ def OMP_TargetSimd : Directive<[Spelling<"target simd">]> {
VersionedClause<OMPC_Collapse>,
VersionedClause<OMPC_DefaultMap>,
VersionedClause<OMPC_Device>,
+ VersionedClause<OMPC_DynGroupprivate>,
VersionedClause<OMPC_NumThreads>,
VersionedClause<OMPC_OMPX_DynCGroupMem>,
VersionedClause<OMPC_Order, 50>,
@@ -2220,6 +2231,7 @@ def OMP_TargetTeams : Directive<[Spelling<"target teams">]> {
VersionedClause<OMPC_Default>,
VersionedClause<OMPC_DefaultMap>,
VersionedClause<OMPC_Device>,
+ VersionedClause<OMPC_DynGroupprivate>,
VersionedClause<OMPC_NoWait>,
VersionedClause<OMPC_NumTeams>,
VersionedClause<OMPC_OMPX_DynCGroupMem>,
@@ -2252,6 +2264,7 @@ def OMP_TargetTeamsDistribute
VersionedClause<OMPC_DefaultMap>,
VersionedClause<OMPC_Device>,
VersionedClause<OMPC_DistSchedule>,
+ VersionedClause<OMPC_DynGroupprivate>,
VersionedClause<OMPC_NoWait>,
VersionedClause<OMPC_NumTeams>,
VersionedClause<OMPC_OMPX_DynCGroupMem>,
@@ -2284,6 +2297,7 @@ def OMP_TargetTeamsDistributeParallelDo
VersionedClause<OMPC_DefaultMap>,
VersionedClause<OMPC_Device>,
VersionedClause<OMPC_DistSchedule>,
+ VersionedClause<OMPC_DynGroupprivate>,
VersionedClause<OMPC_NoWait>,
VersionedClause<OMPC_NumTeams>,
VersionedClause<OMPC_NumThreads>,
@@ -2322,6 +2336,7 @@ def OMP_TargetTeamsDistributeParallelDoSimd
VersionedClause<OMPC_DefaultMap>,
VersionedClause<OMPC_Device>,
VersionedClause<OMPC_DistSchedule>,
+ VersionedClause<OMPC_DynGroupprivate>,
VersionedClause<OMPC_NoWait>,
VersionedClause<OMPC_NumTeams>,
VersionedClause<OMPC_NumThreads>,
@@ -2367,6 +2382,7 @@ def OMP_TargetTeamsDistributeParallelFor
VersionedClause<OMPC_UsesAllocators, 50>,
];
let allowedOnceClauses = [
+ VersionedClause<OMPC_DynGroupprivate>,
VersionedClause<OMPC_OMPX_DynCGroupMem>,
];
let leafConstructs =
@@ -2409,6 +2425,7 @@ def OMP_TargetTeamsDistributeParallelForSimd
VersionedClause<OMPC_UsesAllocators, 50>,
];
let allowedOnceClauses = [
+ VersionedClause<OMPC_DynGroupprivate>,
VersionedClause<OMPC_OMPX_DynCGroupMem>,
];
let leafConstructs =
@@ -2441,6 +2458,7 @@ def OMP_TargetTeamsDistributeSimd
VersionedClause<OMPC_DefaultMap>,
VersionedClause<OMPC_Device>,
VersionedClause<OMPC_DistSchedule>,
+ VersionedClause<OMPC_DynGroupprivate>,
VersionedClause<OMPC_NoWait>,
VersionedClause<OMPC_NumTeams>,
VersionedClause<OMPC_OMPX_DynCGroupMem>,
@@ -2474,6 +2492,7 @@ def OMP_target_teams_loop : Directive<[Spelling<"target teams loop">]> {
VersionedClause<OMPC_Bind, 50>,
VersionedClause<OMPC_Collapse>,
VersionedClause<OMPC_Default>,
+ VersionedClause<OMPC_DynGroupprivate>,
VersionedClause<OMPC_NoWait>,
VersionedClause<OMPC_NumTeams>,
VersionedClause<OMPC_OMPX_DynCGroupMem>,
@@ -2532,6 +2551,7 @@ def OMP_TeamsDistribute : Directive<[Spelling<"teams distribute">]> {
VersionedClause<OMPC_ThreadLimit>,
];
let allowedOnceClauses = [
+ VersionedClause<OMPC_DynGroupprivate>,
VersionedClause<OMPC_If>,
VersionedClause<OMPC_Order, 50>,
];
@@ -2555,6 +2575,7 @@ def OMP_TeamsDistributeParallelDo
VersionedClause<OMPC_Collapse>,
VersionedClause<OMPC_Default>,
VersionedClause<OMPC_DistSchedule>,
+ VersionedClause<OMPC_DynGroupprivate>,
VersionedClause<OMPC_NumTeams>,
VersionedClause<OMPC_NumThreads>,
VersionedClause<OMPC_Order, 50>,
@@ -2584,6 +2605,7 @@ def OMP_TeamsDistributeParallelDoSimd
VersionedClause<OMPC_Collapse>,
VersionedClause<OMPC_Default>,
VersionedClause<OMPC_DistSchedule>,
+ VersionedClause<OMPC_DynGroupprivate>,
VersionedClause<OMPC_NumTeams>,
VersionedClause<OMPC_NumThreads>,
VersionedClause<OMPC_Order, 50>,
@@ -2632,6 +2654,7 @@ def OMP_TeamsDistributeParallelForSimd
VersionedClause<OMPC_Collapse>,
VersionedClause<OMPC_Default>,
VersionedClause<OMPC_DistSchedule>,
+ VersionedClause<OMPC_DynGroupprivate>,
VersionedClause<OMPC_FirstPrivate>,
VersionedClause<OMPC_If>,
VersionedClause<OMPC_LastPrivate>,
@@ -2673,6 +2696,7 @@ def OMP_TeamsDistributeSimd : Directive<[Spelling<"teams distribute simd">]> {
VersionedClause<OMPC_Collapse>,
VersionedClause<OMPC_Default>,
VersionedClause<OMPC_DistSchedule>,
+ VersionedClause<OMPC_DynGroupprivate>,
VersionedClause<OMPC_NumTeams>,
VersionedClause<OMPC_Order, 50>,
VersionedClause<OMPC_SafeLen>,
@@ -2696,6 +2720,7 @@ def OMP_teams_loop : Directive<[Spelling<"teams loop">]> {
VersionedClause<OMPC_Bind, 50>,
VersionedClause<OMPC_Collapse>,
VersionedClause<OMPC_Default>,
+ VersionedClause<OMPC_DynGroupprivate>,
VersionedClause<OMPC_NumTeams>,
VersionedClause<OMPC_Order>,
VersionedClause<OMPC_ThreadLimit>,
>From fa3c7425ae9e5ffea83841f2be61b0f494b99038 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Fri, 8 Aug 2025 11:25:14 -0700
Subject: [PATCH 02/38] [OpenMP][Offload] Add offload runtime support for
dyn_groupprivate clause
---
offload/DeviceRTL/include/DeviceTypes.h | 4 +
offload/DeviceRTL/include/Interface.h | 2 +-
offload/DeviceRTL/include/State.h | 2 +-
offload/DeviceRTL/src/Kernel.cpp | 14 +-
offload/DeviceRTL/src/State.cpp | 48 +++++-
offload/include/Shared/APITypes.h | 6 +-
offload/include/Shared/Environment.h | 4 +-
offload/include/device.h | 3 +
offload/include/omptarget.h | 7 +-
offload/libomptarget/OpenMP/API.cpp | 14 ++
offload/libomptarget/device.cpp | 6 +
offload/libomptarget/exports | 1 +
.../amdgpu/dynamic_hsa/hsa_ext_amd.h | 1 +
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 34 +++--
.../common/include/PluginInterface.h | 33 +++-
.../common/src/PluginInterface.cpp | 86 ++++++++---
.../plugins-nextgen/cuda/dynamic_cuda/cuda.h | 1 +
offload/plugins-nextgen/cuda/src/rtl.cpp | 37 +++--
offload/plugins-nextgen/host/src/rtl.cpp | 4 +-
.../offloading/dyn_groupprivate_strict.cpp | 141 ++++++++++++++++++
openmp/runtime/src/include/omp.h.var | 10 ++
openmp/runtime/src/kmp_csupport.cpp | 9 ++
openmp/runtime/src/kmp_stub.cpp | 16 ++
23 files changed, 418 insertions(+), 65 deletions(-)
create mode 100644 offload/test/offloading/dyn_groupprivate_strict.cpp
diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h
index 2e5d92380f040..a43b506d6879e 100644
--- a/offload/DeviceRTL/include/DeviceTypes.h
+++ b/offload/DeviceRTL/include/DeviceTypes.h
@@ -163,4 +163,8 @@ typedef enum omp_allocator_handle_t {
///}
+/// Access-group selector accepted by omp_get_dyn_groupprivate_ptr() and
+/// omp_get_dyn_groupprivate_size(); cgroup is the only value defined here.
+enum omp_access_t {
+ omp_access_cgroup = 0,
+};
+
#endif
diff --git a/offload/DeviceRTL/include/Interface.h b/offload/DeviceRTL/include/Interface.h
index c4bfaaa2404b4..672afea206785 100644
--- a/offload/DeviceRTL/include/Interface.h
+++ b/offload/DeviceRTL/include/Interface.h
@@ -222,7 +222,7 @@ struct KernelEnvironmentTy;
int8_t __kmpc_is_spmd_exec_mode();
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment);
void __kmpc_target_deinit();
diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h
index db396dae6e445..17c3c6f2d3e42 100644
--- a/offload/DeviceRTL/include/State.h
+++ b/offload/DeviceRTL/include/State.h
@@ -116,7 +116,7 @@ extern Local<ThreadStateTy **> ThreadStates;
/// Initialize the state machinery. Must be called by all threads.
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment);
/// Return the kernel and kernel launch environment associated with the current
/// kernel. The former is static and contains compile time information that
diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp
index 467e44a65276c..58e9a09105a76 100644
--- a/offload/DeviceRTL/src/Kernel.cpp
+++ b/offload/DeviceRTL/src/Kernel.cpp
@@ -34,8 +34,8 @@ enum OMPTgtExecModeFlags : unsigned char {
};
static void
-inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
+initializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment) {
// Order is important here.
synchronize::init(IsSPMD);
mapping::init(IsSPMD);
@@ -80,17 +80,17 @@ extern "C" {
/// \param Ident Source location identification, can be NULL.
///
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment) {
ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration;
bool IsSPMD = Configuration.ExecMode & OMP_TGT_EXEC_MODE_SPMD;
bool UseGenericStateMachine = Configuration.UseGenericStateMachine;
if (IsSPMD) {
- inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
- KernelLaunchEnvironment);
+ initializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
+ KernelLaunchEnvironment);
synchronize::threadsAligned(atomic::relaxed);
} else {
- inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
- KernelLaunchEnvironment);
+ initializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
+ KernelLaunchEnvironment);
// No need to wait since only the main threads will execute user
// code and workers will run into a barrier right away.
}
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
index 62b03e7bba720..9e2a9999167b4 100644
--- a/offload/DeviceRTL/src/State.cpp
+++ b/offload/DeviceRTL/src/State.cpp
@@ -158,6 +158,34 @@ void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}
+/// Per-team view of the dynamic cgroup (dyn_groupprivate) memory of a kernel.
+///
+/// The memory is either the native buffer handed to init() or this team's
+/// slice of the fallback buffer published in the kernel launch environment.
+struct DynCGroupMemTy {
+  /// Set up the state from the kernel launch environment \p KLE.
+  ///
+  /// Without a launch environment the size stays 0 and the pointer stays
+  /// null. Otherwise the size is KLE->DynCGroupMemSize and the base pointer
+  /// is either this team's slice of KLE->DynCGroupMemFallback (slices are
+  /// \c Size bytes apart, indexed by omp_get_team_num()) or
+  /// \p NativeDynCGroup when no fallback buffer is set.
+  void init(KernelLaunchEnvironmentTy *KLE, void *NativeDynCGroup) {
+    Size = 0;
+    Ptr = nullptr;
+    IsFallback = false;
+    if (!KLE)
+      return;
+
+    Size = KLE->DynCGroupMemSize;
+    if (void *Fallback = KLE->DynCGroupMemFallback) {
+      Ptr = static_cast<char *>(Fallback) + Size * omp_get_team_num();
+      IsFallback = true;
+    } else {
+      Ptr = static_cast<char *>(NativeDynCGroup);
+    }
+  }
+
+  /// Return the address \p Offset bytes into the memory, or nullptr when no
+  /// base pointer was set up. Offsetting a null base would be undefined
+  /// behavior and would hand callers a bogus non-null pointer.
+  char *getPtr(size_t Offset) const { return Ptr ? Ptr + Offset : nullptr; }
+
+  /// Whether the memory comes from the fallback buffer.
+  bool isFallback() const { return IsFallback; }
+
+  /// Size recorded from the kernel launch environment (0 without one).
+  size_t getSize() const { return Size; }
+
+private:
+  char *Ptr;
+  size_t Size;
+  bool IsFallback;
+};
+
+/// Dynamic cgroup memory state of the running kernel. The object carries no
+/// initializer (loader_uninitialized); its fields are assigned by
+/// DynCGroupMem.init(), which state::init() calls.
+[[clang::loader_uninitialized]] static Local<DynCGroupMemTy> DynCGroupMem;
+
} // namespace
void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
@@ -246,13 +274,18 @@ int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
} // namespace
void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
+ KernelLaunchEnvironmentTy *KLE) {
SharedMemorySmartStack.init(IsSPMD);
+
+ if (KLE == reinterpret_cast<KernelLaunchEnvironmentTy *>(~0))
+ KLE = nullptr;
+
if (mapping::isInitialThreadInLevel0(IsSPMD)) {
+ DynCGroupMem.init(KLE, DynamicSharedBuffer);
TeamState.init(IsSPMD);
ThreadStates = nullptr;
KernelEnvironmentPtr = &KernelEnvironment;
- KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment;
+ KernelLaunchEnvironmentPtr = KLE;
}
}
@@ -430,6 +463,17 @@ int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
int omp_get_initial_device(void) { return -1; }
int omp_is_initial_device(void) { return 0; }
+
+/// Return a pointer \p Offset bytes into the dynamic cgroup memory tracked by
+/// DynCGroupMem.
+///
+/// \param Offset     Byte offset passed on to DynCGroupMem.getPtr().
+/// \param IsFallback Optional output; when non-null it receives
+///                   DynCGroupMem.isFallback().
+///
+/// The unnamed omp_access_t argument is ignored.
+void *omp_get_dyn_groupprivate_ptr(size_t Offset, int *IsFallback,
+                                   omp_access_t) {
+  if (IsFallback != nullptr)
+    *IsFallback = DynCGroupMem.isFallback();
+  return DynCGroupMem.getPtr(Offset);
+}
+
+/// Return the size recorded for the dynamic cgroup memory, i.e.
+/// DynCGroupMem.getSize(). The unnamed omp_access_t argument is ignored.
+size_t omp_get_dyn_groupprivate_size(omp_access_t) {
+ return DynCGroupMem.getSize();
+}
}
extern "C" {
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 978b53d5d69b9..0ef2dd162292b 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -97,8 +97,10 @@ struct KernelArgsTy {
struct {
uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
- uint64_t Unused : 62;
- } Flags = {0, 0, 0};
+ uint64_t AllowDynCGroupMemFallback : 1; // Allow a fallback for the dynamic
+ // cgroup memory.
+ uint64_t Unused : 61;
+ } Flags = {0, 0, 0, 0};
// The number of teams (for x,y,z dimension).
uint32_t NumTeams[3] = {0, 0, 0};
// The number of threads (for x,y,z dimension).
diff --git a/offload/include/Shared/Environment.h b/offload/include/Shared/Environment.h
index 2a283bd6fa4ed..0670ac1090da4 100644
--- a/offload/include/Shared/Environment.h
+++ b/offload/include/Shared/Environment.h
@@ -93,9 +93,11 @@ struct KernelEnvironmentTy {
};
struct KernelLaunchEnvironmentTy {
+ void *ReductionBuffer = nullptr;
+ void *DynCGroupMemFallback = nullptr;
uint32_t ReductionCnt = 0;
uint32_t ReductionIterCnt = 0;
- void *ReductionBuffer = nullptr;
+ uint32_t DynCGroupMemSize = 0;
};
#endif // OMPTARGET_SHARED_ENVIRONMENT_H
diff --git a/offload/include/device.h b/offload/include/device.h
index f4b10abbaa3fd..0e93cf8ec1a8b 100644
--- a/offload/include/device.h
+++ b/offload/include/device.h
@@ -158,6 +158,9 @@ struct DeviceTy {
/// Indicate that there are pending images for this device or not.
void setHasPendingImages(bool V) { HasPendingImages = V; }
+ /// Get the maximum shared memory per team for any kernel.
+ uint64_t getMaxSharedTeamMemory();
+
private:
/// Deinitialize the device (and plugin).
void deinit();
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 6971780c7bdb5..45bb74ec367d6 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -107,7 +107,7 @@ enum TargetAllocTy : int32_t {
inline KernelArgsTy CTorDTorKernelArgs = {1, 0, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr,
- 0, {0,0,0}, {1, 0, 0}, {1, 0, 0}, 0};
+ 0, {0,0,0,0}, {1, 0, 0}, {1, 0, 0}, 0};
struct DeviceTy;
@@ -273,10 +273,15 @@ struct __tgt_target_non_contig {
extern "C" {
#endif
+/// Access-group selector for omp_get_groupprivate_limit(); cgroup is the only
+/// value defined here.
+// NOTE(review): DeviceRTL/include/DeviceTypes.h declares an enum with the same
+// name and value -- presumably the two must stay in sync; confirm.
+typedef enum {
+ omp_access_cgroup = 0,
+} omp_access_t;
+
void ompx_dump_mapping_tables(void);
int omp_get_num_devices(void);
int omp_get_device_num(void);
int omp_get_initial_device(void);
+size_t omp_get_groupprivate_limit(int device_num, omp_access_t access_group = omp_access_cgroup);
void *omp_target_alloc(size_t Size, int DeviceNum);
void omp_target_free(void *DevicePtr, int DeviceNum);
int omp_target_is_present(const void *Ptr, int DeviceNum);
diff --git a/offload/libomptarget/OpenMP/API.cpp b/offload/libomptarget/OpenMP/API.cpp
index 4576f9bd06121..1ed4192157fc8 100644
--- a/offload/libomptarget/OpenMP/API.cpp
+++ b/offload/libomptarget/OpenMP/API.cpp
@@ -98,6 +98,20 @@ EXTERN int omp_get_initial_device(void) {
return HostDevice;
}
+/// Return the groupprivate memory limit for device \p DeviceNum.
+///
+/// Returns 0 when \p DeviceNum equals omp_get_initial_device(). For any other
+/// device number the device is looked up through PM->getDevice(); a failed
+/// lookup ends in FATAL_MESSAGE, otherwise the result of
+/// DeviceTy::getMaxSharedTeamMemory() is returned. \p AccessGroup is not
+/// consulted.
+EXTERN size_t omp_get_groupprivate_limit(int DeviceNum,
+ omp_access_t AccessGroup) {
+ TIMESCOPE();
+ OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
+ if (DeviceNum == omp_get_initial_device())
+ return 0;
+
+ auto DeviceOrErr = PM->getDevice(DeviceNum);
+ if (!DeviceOrErr)
+ FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
+
+ return DeviceOrErr->getMaxSharedTeamMemory();
+}
+
EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) +
";size=" + std::to_string(Size));
diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp
index f88e30ae9e76b..31bfc7d092424 100644
--- a/offload/libomptarget/device.cpp
+++ b/offload/libomptarget/device.cpp
@@ -281,3 +281,9 @@ bool DeviceTy::useAutoZeroCopy() {
return false;
return RTL->use_auto_zero_copy(RTLDeviceID);
}
+
+/// Forward a DEVICE_QUERY_MAX_SHARED_TEAM_MEM query for this device
+/// (RTLDeviceID) to the plugin and return the answer.
+// NOTE(review): GenericPluginTy::query_device_info is declared to return
+// int64_t while this function returns uint64_t, so a negative result would
+// come back as a very large value -- confirm the plugin never returns one.
+uint64_t DeviceTy::getMaxSharedTeamMemory() {
+ using DeviceQueryKind = llvm::omp::target::plugin::DeviceQueryKind;
+ return RTL->query_device_info(
+ RTLDeviceID, DeviceQueryKind::DEVICE_QUERY_MAX_SHARED_TEAM_MEM);
+}
diff --git a/offload/libomptarget/exports b/offload/libomptarget/exports
index 2406776c1fb5f..b5a1401564d58 100644
--- a/offload/libomptarget/exports
+++ b/offload/libomptarget/exports
@@ -40,6 +40,7 @@ VERS1.0 {
omp_get_num_devices;
omp_get_device_num;
omp_get_initial_device;
+ omp_get_groupprivate_limit;
omp_target_alloc;
omp_target_free;
omp_target_is_present;
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
index 3117763e35896..2cf156e576c5f 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -52,6 +52,7 @@ typedef enum {
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6,
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7,
HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15,
+ HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE = 16,
} hsa_amd_memory_pool_info_t;
typedef enum {
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 12c7cc62905c9..fa373c2029f0c 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -273,7 +273,6 @@ struct AMDGPUMemoryPoolTy {
if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags))
return Err;
-
return Plugin::success();
}
@@ -543,6 +542,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
return Err;
}
+ StaticBlockMemSize = GroupSize;
+
// Make sure it is a kernel symbol.
if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
return Plugin::error(ErrorCode::INVALID_BINARY,
@@ -566,8 +567,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
/// Launch the AMDGPU kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
- uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
- KernelLaunchParamsTy LaunchParams,
+ uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
+ KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
/// Print more elaborate kernel launch info for AMDGPU
@@ -2020,6 +2021,20 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = checkIfAPU())
return Err;
+ // Retrieve the size of the group memory.
+ for (const auto *Pool : AllMemoryPools) {
+ if (Pool->isGroup()) {
+ size_t Size = 0;
+ if (auto Err = Pool->getAttr(HSA_AMD_MEMORY_POOL_INFO_SIZE, Size))
+ return Err;
+ MaxBlockSharedMemSize = Size;
+ break;
+ }
+ }
+
+ // Supports block shared memory natively.
+ HasNativeBlockSharedMem = true;
+
return Plugin::success();
}
@@ -2856,7 +2871,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
KernelArgsTy KernelArgs = {};
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
if (auto Err = AMDGPUKernel.launchImpl(
- *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
+ *this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs,
KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err;
@@ -3357,6 +3372,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
+ uint32_t DynBlockMemSize,
KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
@@ -3374,13 +3390,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs))
return Err;
- // Account for user requested dynamic shared memory.
- uint32_t GroupSize = getGroupSize();
- if (uint32_t MaxDynCGroupMem = std::max(
- KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) {
- GroupSize += MaxDynCGroupMem;
- }
-
uint64_t StackSize;
if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
return Err;
@@ -3434,7 +3443,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
- GroupSize, StackSize, ArgsMemoryManager);
+ getStaticBlockMemSize() + DynBlockMemSize,
+ StackSize, ArgsMemoryManager);
}
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 162b149ab483e..3357ccfe0c9b5 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -226,6 +226,10 @@ struct InfoTreeNode {
}
};
+/// Kinds of information that can be requested through
+/// GenericPluginTy::query_device_info().
+enum class DeviceQueryKind {
+ DEVICE_QUERY_MAX_SHARED_TEAM_MEM = 0,
+};
+
/// Class wrapping a __tgt_device_image and its offload entry table on a
/// specific device. This class is responsible for storing and managing
/// the offload entries for an image on a device.
@@ -312,13 +316,16 @@ struct GenericKernelTy {
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
virtual Error launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
- KernelArgsTy &KernelArgs,
+ uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
/// Get the kernel name.
const char *getName() const { return Name.c_str(); }
+ /// Get the size of the static per-block memory consumed by the kernel.
+ /// Returns the StaticBlockMemSize member.
+ uint32_t getStaticBlockMemSize() const { return StaticBlockMemSize; }
+
/// Get the kernel image.
DeviceImageTy &getImage() const {
assert(ImagePtr && "Kernel is not initialized!");
@@ -331,9 +338,9 @@ struct GenericKernelTy {
}
/// Return a device pointer to a new kernel launch environment.
- Expected<KernelLaunchEnvironmentTy *>
- getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, uint32_t Version,
- AsyncInfoWrapperTy &AsyncInfo) const;
+ Expected<KernelLaunchEnvironmentTy *> getKernelLaunchEnvironment(
+ GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs,
+ void *FallbackBlockMem, AsyncInfoWrapperTy &AsyncInfo) const;
/// Indicate whether an execution mode is valid.
static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
@@ -425,6 +432,9 @@ struct GenericKernelTy {
/// The maximum number of threads which the kernel could leverage.
uint32_t MaxNumThreads;
+ /// The size of the static memory per block.
+ uint32_t StaticBlockMemSize = 0;
+
/// The kernel environment, including execution flags.
KernelEnvironmentTy KernelEnvironment;
@@ -731,6 +741,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// this id is not unique between different plugins; they may overlap.
int32_t getDeviceId() const { return DeviceId; }
+ /// Get the total shared memory per block that can be used in any kernel.
+ uint32_t getMaxBlockSharedMemSize() const { return MaxBlockSharedMemSize; }
+
+ /// Indicate whether the device has native block shared memory.
+ bool hasNativeBlockSharedMem() const { return HasNativeBlockSharedMem; }
+
/// Set the context of the device if needed, before calling device-specific
/// functions. Plugins may implement this function as a no-op if not needed.
virtual Error setContext() = 0;
@@ -1132,6 +1148,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
std::atomic<bool> OmptInitialized;
#endif
+ /// The total per-block shared memory that a kernel may use.
+ uint32_t MaxBlockSharedMemSize = 0;
+
+ /// Whether the device has native block shared memory.
+ bool HasNativeBlockSharedMem = false;
+
private:
DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
@@ -1347,6 +1369,9 @@ struct GenericPluginTy {
/// Prints information about the given devices supported by the plugin.
void print_device_info(int32_t DeviceId);
+ /// Retrieve information about the given device.
+ int64_t query_device_info(int32_t DeviceId, DeviceQueryKind Query);
+
/// Creates an event in the given plugin if supported.
int32_t create_event(int32_t DeviceId, void **EventPtr);
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 81b9d423e13d8..2997585e1660f 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -477,20 +477,20 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
Expected<KernelLaunchEnvironmentTy *>
GenericKernelTy::getKernelLaunchEnvironment(
- GenericDeviceTy &GenericDevice, uint32_t Version,
- AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+ GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs,
+ void *FallbackBlockMem, AsyncInfoWrapperTy &AsyncInfoWrapper) const {
// Ctor/Dtor have no arguments, replaying uses the original kernel launch
// environment. Older versions of the compiler do not generate a kernel
// launch environment.
if (GenericDevice.Plugin.getRecordReplay().isReplaying() ||
- Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
+ KernelArgs.Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
return nullptr;
- if (!KernelEnvironment.Configuration.ReductionDataSize ||
- !KernelEnvironment.Configuration.ReductionBufferLength)
+ if ((!KernelEnvironment.Configuration.ReductionDataSize ||
+ !KernelEnvironment.Configuration.ReductionBufferLength) &&
+ KernelArgs.DynCGroupMem == 0)
return reinterpret_cast<KernelLaunchEnvironmentTy *>(~0);
- // TODO: Check if the kernel needs a launch environment.
auto AllocOrErr = GenericDevice.dataAlloc(sizeof(KernelLaunchEnvironmentTy),
/*HostPtr=*/nullptr,
TargetAllocTy::TARGET_ALLOC_DEVICE);
@@ -504,7 +504,9 @@ GenericKernelTy::getKernelLaunchEnvironment(
/// async data transfer.
auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment;
LocalKLE = KernelLaunchEnvironment;
- {
+
+ if (KernelEnvironment.Configuration.ReductionDataSize &&
+ KernelEnvironment.Configuration.ReductionBufferLength) {
auto AllocOrErr = GenericDevice.dataAlloc(
KernelEnvironment.Configuration.ReductionDataSize *
KernelEnvironment.Configuration.ReductionBufferLength,
@@ -514,8 +516,13 @@ GenericKernelTy::getKernelLaunchEnvironment(
LocalKLE.ReductionBuffer = *AllocOrErr;
// Remember to free the memory later.
AsyncInfoWrapper.freeAllocationAfterSynchronization(*AllocOrErr);
+ } else {
+ LocalKLE.ReductionBuffer = nullptr;
}
+ LocalKLE.DynCGroupMemSize = KernelArgs.DynCGroupMem;
+ LocalKLE.DynCGroupMemFallback = FallbackBlockMem;
+
INFO(OMP_INFOTYPE_DATA_TRANSFER, GenericDevice.getDeviceId(),
"Copying data from host to device, HstPtr=" DPxMOD ", TgtPtr=" DPxMOD
", Size=%" PRId64 ", Name=KernelLaunchEnv\n",
@@ -556,8 +563,45 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
llvm::SmallVector<void *, 16> Args;
llvm::SmallVector<void *, 16> Ptrs;
+ uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0],
+ KernelArgs.ThreadLimit[1],
+ KernelArgs.ThreadLimit[2]};
+ uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1],
+ KernelArgs.NumTeams[2]};
+ if (!isBareMode()) {
+ NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
+ NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
+ NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
+ }
+
+ uint32_t MaxBlockMemSize = GenericDevice.getMaxBlockSharedMemSize();
+ uint32_t DynBlockMemSize = KernelArgs.DynCGroupMem;
+ uint32_t TotalBlockMemSize = StaticBlockMemSize + DynBlockMemSize;
+ if (StaticBlockMemSize > MaxBlockMemSize)
+ return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+ "Static block memory size exceeds maximum");
+ else if (!KernelArgs.Flags.AllowDynCGroupMemFallback &&
+ TotalBlockMemSize > MaxBlockMemSize)
+ return Plugin::error(
+ ErrorCode::INVALID_ARGUMENT,
+ "Static and dynamic block memory size exceeds maximum");
+
+ void *FallbackBlockMem = nullptr;
+ if (DynBlockMemSize && (!GenericDevice.hasNativeBlockSharedMem() ||
+ TotalBlockMemSize > MaxBlockMemSize)) {
+ auto AllocOrErr = GenericDevice.dataAlloc(
+ NumBlocks[0] * DynBlockMemSize,
+ /*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE);
+ if (!AllocOrErr)
+ return AllocOrErr.takeError();
+
+ FallbackBlockMem = *AllocOrErr;
+ AsyncInfoWrapper.freeAllocationAfterSynchronization(FallbackBlockMem);
+ DynBlockMemSize = 0;
+ }
+
auto KernelLaunchEnvOrErr = getKernelLaunchEnvironment(
- GenericDevice, KernelArgs.Version, AsyncInfoWrapper);
+ GenericDevice, KernelArgs, FallbackBlockMem, AsyncInfoWrapper);
if (!KernelLaunchEnvOrErr)
return KernelLaunchEnvOrErr.takeError();
@@ -573,17 +617,6 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
Args, Ptrs, *KernelLaunchEnvOrErr);
}
- uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0],
- KernelArgs.ThreadLimit[1],
- KernelArgs.ThreadLimit[2]};
- uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1],
- KernelArgs.NumTeams[2]};
- if (!isBareMode()) {
- NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
- NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
- NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
- }
-
// Record the kernel description after we modified the argument count and num
// blocks/threads.
RecordReplayTy &RecordReplay = GenericDevice.Plugin.getRecordReplay();
@@ -599,8 +632,8 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks))
return Err;
- return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs,
- LaunchParams, AsyncInfoWrapper);
+ return launchImpl(GenericDevice, NumThreads, NumBlocks, DynBlockMemSize,
+ KernelArgs, LaunchParams, AsyncInfoWrapper);
}
KernelLaunchParamsTy GenericKernelTy::prepareArgs(
@@ -2077,6 +2110,17 @@ void GenericPluginTy::print_device_info(int32_t DeviceId) {
toString(std::move(Err)).data());
}
+int64_t GenericPluginTy::query_device_info(int32_t DeviceId,
+ DeviceQueryKind Query) {
+ const GenericDeviceTy &Device = getDevice(DeviceId);
+
+ switch (Query) {
+ case DeviceQueryKind::DEVICE_QUERY_MAX_SHARED_TEAM_MEM:
+ return Device.getMaxBlockSharedMemSize();
+ }
+ return 0;
+}
+
int32_t GenericPluginTy::create_event(int32_t DeviceId, void **EventPtr) {
auto Err = getDevice(DeviceId).createEvent(EventPtr);
if (Err) {
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index b6c022c8e7e8b..b6e087edea876 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -258,6 +258,7 @@ typedef enum CUdevice_attribute_enum {
typedef enum CUfunction_attribute_enum {
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
+ CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8,
} CUfunction_attribute;
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 15193de6ae430..eda7a85f750f0 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -148,13 +148,21 @@ struct CUDAKernelTy : public GenericKernelTy {
// The maximum number of threads cannot exceed the maximum of the kernel.
MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
+ int SharedMemSize;
+ Res = cuFuncGetAttribute(&SharedMemSize,
+ CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, Func);
+ if (auto Err = Plugin::check(Res, "Error in cuFuncGetAttribute: %s"))
+ return Err;
+
+ StaticBlockMemSize = SharedMemSize;
+
return Plugin::success();
}
/// Launch the CUDA kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
- uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
- KernelLaunchParamsTy LaunchParams,
+ uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
+ KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
private:
@@ -162,7 +170,7 @@ struct CUDAKernelTy : public GenericKernelTy {
CUfunction Func;
/// The maximum amount of dynamic shared memory per thread group. By default,
/// this is set to 48 KB.
- mutable uint32_t MaxDynCGroupMemLimit = 49152;
+ mutable uint32_t MaxDynBlockMemSize = 49152;
};
/// Class wrapping a CUDA stream reference. These are the objects handled by the
@@ -358,6 +366,15 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Err;
HardwareParallelism = NumMuliprocessors * (MaxThreadsPerSM / WarpSize);
+ uint32_t MaxSharedMem;
+ if (auto Err = getDeviceAttr(
+ CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, MaxSharedMem))
+ return Err;
+ MaxBlockSharedMemSize = MaxSharedMem;
+
+ // Supports block shared memory natively.
+ HasNativeBlockSharedMem = true;
+
return Plugin::success();
}
@@ -1239,7 +1256,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
KernelArgsTy KernelArgs = {};
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
if (auto Err = CUDAKernel.launchImpl(
- *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
+ *this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs,
KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err;
@@ -1285,6 +1302,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
+ uint32_t DynBlockMemSize,
KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
@@ -1294,9 +1312,6 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
return Err;
- uint32_t MaxDynCGroupMem =
- std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
-
void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
CU_LAUNCH_PARAM_BUFFER_SIZE,
reinterpret_cast<void *>(&LaunchParams.Size),
@@ -1308,18 +1323,18 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
GenericDevice.Plugin.getRPCServer().Thread->notify();
// In case we require more memory than the current limit.
- if (MaxDynCGroupMem >= MaxDynCGroupMemLimit) {
+ if (DynBlockMemSize >= MaxDynBlockMemSize) {
CUresult AttrResult = cuFuncSetAttribute(
- Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, MaxDynCGroupMem);
+ Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, DynBlockMemSize);
Plugin::check(
AttrResult,
"Error in cuLaunchKernel while setting the memory limits: %s");
- MaxDynCGroupMemLimit = MaxDynCGroupMem;
+ MaxDynBlockMemSize = DynBlockMemSize;
}
CUresult Res = cuLaunchKernel(Func, NumBlocks[0], NumBlocks[1], NumBlocks[2],
NumThreads[0], NumThreads[1], NumThreads[2],
- MaxDynCGroupMem, Stream, nullptr, Config);
+ DynBlockMemSize, Stream, nullptr, Config);
// Register a callback to indicate when the kernel is complete.
if (GenericDevice.getRPCServer())
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index d950572265b4c..dc82a2ef16e51 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -92,8 +92,8 @@ struct GenELF64KernelTy : public GenericKernelTy {
/// Launch the kernel using the libffi.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
- uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
- KernelLaunchParamsTy LaunchParams,
+ uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
+ KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override {
// Create a vector of ffi_types, one per argument.
SmallVector<ffi_type *, 16> ArgTypes(KernelArgs.NumArgs, &ffi_type_pointer);
diff --git a/offload/test/offloading/dyn_groupprivate_strict.cpp b/offload/test/offloading/dyn_groupprivate_strict.cpp
new file mode 100644
index 0000000000000..a35f8dd2b0595
--- /dev/null
+++ b/offload/test/offloading/dyn_groupprivate_strict.cpp
@@ -0,0 +1,141 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// REQUIRES: gpu
+
+#include <omp.h>
+#include <stdio.h>
+
+#define N 512
+
+int main() {
+ int Result[N], NumThreads;
+
+#pragma omp target teams num_teams(1) thread_limit(N) \
+ dyn_groupprivate(strict : N * sizeof(Result[0])) \
+ map(from : Result, NumThreads)
+ {
+ int Buffer[N];
+#pragma omp parallel
+ {
+ int *DynBuffer = (int *)omp_get_dyn_groupprivate_ptr();
+ int TId = omp_get_thread_num();
+ if (TId == 0)
+ NumThreads = omp_get_num_threads();
+ Buffer[TId] = 7;
+ DynBuffer[TId] = 3;
+#pragma omp barrier
+ int WrappedTId = (TId + 37) % NumThreads;
+ Result[TId] = Buffer[WrappedTId] + DynBuffer[WrappedTId];
+ }
+ }
+
+ if (NumThreads < N / 2 || NumThreads > N) {
+ printf("Expected number of threads to be in [%i:%i], but got: %i", N / 2, N,
+ NumThreads);
+ return -1;
+ }
+
+ int Failed = 0;
+ for (int i = 0; i < NumThreads; ++i) {
+ if (Result[i] != 7 + 3) {
+ printf("Result[%i] is %i, expected %i\n", i, Result[i], 7 + 3);
+ ++Failed;
+ }
+ }
+
+ // Verify that the routines on the host return NULL and zero.
+ if (omp_get_dyn_groupprivate_ptr())
+ ++Failed;
+ if (omp_get_dyn_groupprivate_size())
+ ++Failed;
+
+ size_t MaxSize = omp_get_groupprivate_limit(0, omp_access_cgroup);
+ size_t ExceededSize = MaxSize + 10;
+
+// Verify that the fallback modifier works.
+#pragma omp target dyn_groupprivate(fallback : ExceededSize) \
+ map(tofrom : Failed)
+ {
+ int IsFallback;
+ if (!omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ ++Failed;
+ if (!omp_get_dyn_groupprivate_size())
+ ++Failed;
+ if (omp_get_dyn_groupprivate_size() != ExceededSize)
+ ++Failed;
+ if (!IsFallback)
+ ++Failed;
+ }
+
+// Verify that the default modifier is fallback.
+#pragma omp target dyn_groupprivate(ExceededSize)
+ {
+ }
+
+// Verify that the strict modifier works.
+#pragma omp target dyn_groupprivate(strict : N) map(tofrom : Failed)
+ {
+ int IsFallback;
+ if (!omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ ++Failed;
+ if (!omp_get_dyn_groupprivate_size())
+ ++Failed;
+ if (omp_get_dyn_groupprivate_size() != N)
+ ++Failed;
+ if (IsFallback)
+ ++Failed;
+ }
+
+// Verify that the fallback does not trigger when not needed.
+#pragma omp target dyn_groupprivate(fallback : N) map(tofrom : Failed)
+ {
+ int IsFallback;
+ if (!omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ ++Failed;
+ if (!omp_get_dyn_groupprivate_size())
+ ++Failed;
+ if (omp_get_dyn_groupprivate_size() != N)
+ ++Failed;
+ if (IsFallback)
+ ++Failed;
+ }
+
+// Verify that the strict modifier works when passing a zero size.
+#pragma omp target dyn_groupprivate(strict : 0) map(tofrom : Failed)
+ {
+ int IsFallback;
+ if (omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ ++Failed;
+ if (omp_get_dyn_groupprivate_size())
+ ++Failed;
+ if (IsFallback)
+ ++Failed;
+ }
+
+// Verify that the fallback modifier works when passing a zero size.
+#pragma omp target dyn_groupprivate(fallback : 0) map(tofrom : Failed)
+ {
+ int IsFallback;
+ if (omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ ++Failed;
+ if (omp_get_dyn_groupprivate_size())
+ ++Failed;
+ if (IsFallback)
+ ++Failed;
+ }
+
+// Verify that omitting the clause is the same as setting zero size.
+#pragma omp target map(tofrom : Failed)
+ {
+ int IsFallback;
+ if (omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ ++Failed;
+ if (omp_get_dyn_groupprivate_size())
+ ++Failed;
+ if (IsFallback)
+ ++Failed;
+ }
+
+ // CHECK: PASS
+ if (!Failed)
+ printf("PASS\n");
+}
diff --git a/openmp/runtime/src/include/omp.h.var b/openmp/runtime/src/include/omp.h.var
index 74f385feb3ea5..26c3df56a9ce3 100644
--- a/openmp/runtime/src/include/omp.h.var
+++ b/openmp/runtime/src/include/omp.h.var
@@ -380,6 +380,10 @@
omp_uintptr_t value;
} omp_alloctrait_t;
+ typedef enum {
+ omp_access_cgroup = 0,
+ } omp_access_t;
+
# if defined(_WIN32)
// On Windows cl and icl do not support 64-bit enum, let's use integer then.
typedef omp_uintptr_t omp_allocator_handle_t;
@@ -463,6 +467,9 @@
omp_allocator_handle_t allocator = omp_null_allocator,
omp_allocator_handle_t free_allocator = omp_null_allocator);
extern void __KAI_KMPC_CONVENTION omp_free(void * ptr, omp_allocator_handle_t a = omp_null_allocator);
+ extern void *__KAI_KMPC_CONVENTION omp_get_dyn_groupprivate_ptr(size_t offset = 0, int *is_fallback = NULL, omp_access_t access_group = omp_access_cgroup);
+ extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_groupprivate_size(omp_access_t access_group = omp_access_cgroup);
+ extern size_t __KAI_KMPC_CONVENTION omp_get_groupprivate_limit(int device_num, omp_access_t access_group = omp_access_cgroup);
# else
extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a);
extern void *__KAI_KMPC_CONVENTION omp_aligned_alloc(size_t align, size_t size,
@@ -473,6 +480,9 @@
extern void *__KAI_KMPC_CONVENTION omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
omp_allocator_handle_t free_allocator);
extern void __KAI_KMPC_CONVENTION omp_free(void *ptr, omp_allocator_handle_t a);
+ extern void *__KAI_KMPC_CONVENTION omp_get_dyn_groupprivate_ptr(size_t offset, int *is_fallback, omp_access_t access_group);
+ extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_groupprivate_size(omp_access_t access_group);
+ extern size_t __KAI_KMPC_CONVENTION omp_get_groupprivate_limit(int device_num, omp_access_t access_group);
# endif
/* OpenMP TR11 routines to get memory spaces and allocators */
diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
index 3ca32ba583fe2..9605bad457e11 100644
--- a/openmp/runtime/src/kmp_csupport.cpp
+++ b/openmp/runtime/src/kmp_csupport.cpp
@@ -4515,6 +4515,15 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
}
/* end of OpenMP 5.1 Memory Management routines */
+void *omp_get_dyn_groupprivate_ptr(size_t offset, int *is_fallback,
+ omp_access_t access_group) {
+ if (is_fallback != NULL)
+ *is_fallback = 0;
+ return NULL;
+}
+
+size_t omp_get_dyn_groupprivate_size(omp_access_t access_group) { return 0; }
+
int __kmpc_get_target_offload(void) {
if (!__kmp_init_serial) {
__kmp_serial_initialize();
diff --git a/openmp/runtime/src/kmp_stub.cpp b/openmp/runtime/src/kmp_stub.cpp
index 06276d1bed1c7..a099f887b6ba4 100644
--- a/openmp/runtime/src/kmp_stub.cpp
+++ b/openmp/runtime/src/kmp_stub.cpp
@@ -454,6 +454,22 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
#endif
}
+void *omp_get_dyn_groupprivate_ptr(size_t offset, int *is_fallback,
+ omp_access_t access_group) {
+ i;
+ return NULL;
+}
+
+size_t omp_get_dyn_groupprivate_size(omp_access_t access_group) {
+ i;
+ return 0;
+}
+
+size_t omp_get_groupprivate_limit(int device_num, omp_access_t access_group) {
+ i;
+ return 0;
+}
+
/* OpenMP 5.0 Affinity Format */
void omp_set_affinity_format(char const *format) { i; }
size_t omp_get_affinity_format(char *buffer, size_t size) {
>From f66e5faa93cf2f40bd2a6bd7a95abddf78fb6076 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Fri, 8 Aug 2025 11:04:06 -0700
Subject: [PATCH 03/38] [OpenMP] Add codegen support for dyn_groupprivate
clause
---
clang/lib/CodeGen/CGOpenMPRuntime.cpp | 40 ++++++++++++-------
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 7 +++-
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 10 ++++-
3 files changed, 39 insertions(+), 18 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index a5f2f0efa2c3b..d9121827a813a 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -9489,18 +9489,30 @@ static llvm::Value *emitDeviceID(
return DeviceID;
}
-static llvm::Value *emitDynCGGroupMem(const OMPExecutableDirective &D,
- CodeGenFunction &CGF) {
- llvm::Value *DynCGroupMem = CGF.Builder.getInt32(0);
-
- if (auto *DynMemClause = D.getSingleClause<OMPXDynCGroupMemClause>()) {
- CodeGenFunction::RunCleanupsScope DynCGroupMemScope(CGF);
- llvm::Value *DynCGroupMemVal = CGF.EmitScalarExpr(
- DynMemClause->getSize(), /*IgnoreResultAssign=*/true);
- DynCGroupMem = CGF.Builder.CreateIntCast(DynCGroupMemVal, CGF.Int32Ty,
- /*isSigned=*/false);
- }
- return DynCGroupMem;
+static std::pair<llvm::Value *, bool>
+emitDynCGroupMem(const OMPExecutableDirective &D, CodeGenFunction &CGF) {
+ llvm::Value *DynGP = CGF.Builder.getInt32(0);
+ bool DynGPFallback = false;
+
+ if (auto *DynGPClause = D.getSingleClause<OMPDynGroupprivateClause>()) {
+ CodeGenFunction::RunCleanupsScope DynGPScope(CGF);
+ llvm::Value *DynGPVal =
+ CGF.EmitScalarExpr(DynGPClause->getSize(), /*IgnoreResultAssign=*/true);
+ DynGP = CGF.Builder.CreateIntCast(DynGPVal, CGF.Int32Ty,
+ /*isSigned=*/false);
+ DynGPFallback = (DynGPClause->getFirstDynGroupprivateModifier() !=
+ OMPC_DYN_GROUPPRIVATE_strict &&
+ DynGPClause->getSecondDynGroupprivateModifier() !=
+ OMPC_DYN_GROUPPRIVATE_strict);
+ } else if (auto *OMPXDynCGClause =
+ D.getSingleClause<OMPXDynCGroupMemClause>()) {
+ CodeGenFunction::RunCleanupsScope DynCGMemScope(CGF);
+ llvm::Value *DynCGMemVal = CGF.EmitScalarExpr(OMPXDynCGClause->getSize(),
+ /*IgnoreResultAssign=*/true);
+ DynGP = CGF.Builder.CreateIntCast(DynCGMemVal, CGF.Int32Ty,
+ /*isSigned=*/false);
+ }
+ return {DynGP, DynGPFallback};
}
static void genMapInfoForCaptures(
MappableExprsHandler &MEHandler, CodeGenFunction &CGF,
@@ -9710,7 +9722,7 @@ static void emitTargetCallKernelLaunch(
llvm::Value *RTLoc = OMPRuntime->emitUpdateLocation(CGF, D.getBeginLoc());
llvm::Value *NumIterations =
OMPRuntime->emitTargetNumIterationsCall(CGF, D, SizeEmitter);
- llvm::Value *DynCGGroupMem = emitDynCGGroupMem(D, CGF);
+ auto [DynCGroupMem, DynCGroupMemFallback] = emitDynCGroupMem(D, CGF);
llvm::OpenMPIRBuilder::InsertPointTy AllocaIP(
CGF.AllocaInsertPt->getParent(), CGF.AllocaInsertPt->getIterator());
@@ -9720,7 +9732,7 @@ static void emitTargetCallKernelLaunch(
llvm::OpenMPIRBuilder::TargetKernelArgs Args(
NumTargetItems, RTArgs, NumIterations, NumTeams, NumThreads,
- DynCGGroupMem, HasNoWait);
+ DynCGroupMem, HasNoWait, DynCGroupMemFallback);
llvm::OpenMPIRBuilder::InsertPointTy AfterIP =
cantFail(OMPRuntime->getOMPBuilder().emitKernelLaunch(
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 19a4058b64382..ebc50eecb551e 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2341,17 +2341,20 @@ class OpenMPIRBuilder {
Value *DynCGGroupMem = nullptr;
/// True if the kernel has 'no wait' clause.
bool HasNoWait = false;
+ /// True if the dynamic shared memory may fall back.
+ bool MayFallbackDynCGroupMem = false;
// Constructors for TargetKernelArgs.
TargetKernelArgs() {}
TargetKernelArgs(unsigned NumTargetItems, TargetDataRTArgs RTArgs,
Value *NumIterations, ArrayRef<Value *> NumTeams,
ArrayRef<Value *> NumThreads, Value *DynCGGroupMem,
- bool HasNoWait)
+ bool HasNoWait, bool MayFallbackDynCGroupMem)
: NumTargetItems(NumTargetItems), RTArgs(RTArgs),
NumIterations(NumIterations), NumTeams(NumTeams),
NumThreads(NumThreads), DynCGGroupMem(DynCGGroupMem),
- HasNoWait(HasNoWait) {}
+ HasNoWait(HasNoWait),
+ MayFallbackDynCGroupMem(MayFallbackDynCGroupMem) {}
};
/// Create the kernel args vector used by emitTargetKernel. This function
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 170224616ac64..e600508d347cb 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -506,7 +506,13 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
auto Int32Ty = Type::getInt32Ty(Builder.getContext());
constexpr const size_t MaxDim = 3;
Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
- Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);
+
+ Value *HasNoWaitFlag = Builder.getInt64(KernelArgs.HasNoWait);
+ Value *MayFallbackDynCGroupMemFlag =
+ Builder.getInt64(KernelArgs.MayFallbackDynCGroupMem);
+ MayFallbackDynCGroupMemFlag =
+ Builder.CreateShl(MayFallbackDynCGroupMemFlag, 2);
+ Value *Flags = Builder.CreateOr(HasNoWaitFlag, MayFallbackDynCGroupMemFlag);
assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());
@@ -7891,7 +7897,7 @@ static void emitTargetCall(
KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount,
NumTeamsC, NumThreadsC,
- DynCGGroupMem, HasNoWait);
+ DynCGGroupMem, HasNoWait, false);
// Assume no error was returned because TaskBodyCB and
// EmitTargetCallFallbackCB don't produce any.
>From f20f4ba2290c0966e86dae733ad025d2fb0995d2 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Sat, 9 Aug 2025 22:50:23 -0700
Subject: [PATCH 04/38] Add fixes
---
offload/DeviceRTL/include/DeviceTypes.h | 5 +++++
offload/DeviceRTL/src/State.cpp | 19 ++++++++++---------
offload/include/omptarget.h | 15 +++++++++++----
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 4 ++++
offload/plugins-nextgen/cuda/src/rtl.cpp | 4 ++++
openmp/runtime/src/kmp_csupport.cpp | 2 +-
6 files changed, 35 insertions(+), 14 deletions(-)
diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h
index a43b506d6879e..042fef45917b0 100644
--- a/offload/DeviceRTL/include/DeviceTypes.h
+++ b/offload/DeviceRTL/include/DeviceTypes.h
@@ -163,8 +163,13 @@ typedef enum omp_allocator_handle_t {
///}
+/// The OpenMP access group type. The criterion for grouping tasks using a
+/// specific grouping property.
enum omp_access_t {
+ /// Groups the tasks based on the contention group to which they belong.
omp_access_cgroup = 0,
+ /// Groups the tasks based on the parallel region to which they bind.
+ omp_access_pteam = 1,
};
#endif
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
index 9e2a9999167b4..c6bc6a140f5f2 100644
--- a/offload/DeviceRTL/src/State.cpp
+++ b/offload/DeviceRTL/src/State.cpp
@@ -163,14 +163,15 @@ struct DynCGroupMemTy {
Size = 0;
Ptr = nullptr;
IsFallback = false;
- if (KLE) {
- Size = KLE->DynCGroupMemSize;
- if (void *Fallback = KLE->DynCGroupMemFallback) {
- Ptr = static_cast<char *>(Fallback) + Size * omp_get_team_num();
- IsFallback = true;
- } else {
- Ptr = static_cast<char *>(NativeDynCGroup);
- }
+ if (!KLE)
+ return;
+
+ Size = KLE->DynCGroupMemSize;
+ if (void *Fallback = KLE->DynCGroupMemFallback) {
+ Ptr = static_cast<char *>(Fallback) + Size * omp_get_team_num();
+ IsFallback = true;
+ } else {
+ Ptr = static_cast<char *>(NativeDynCGroup);
}
}
@@ -466,7 +467,7 @@ int omp_is_initial_device(void) { return 0; }
void *omp_get_dyn_groupprivate_ptr(size_t Offset, int *IsFallback,
omp_access_t) {
- if (IsFallback != NULL)
+ if (IsFallback != nullptr)
*IsFallback = DynCGroupMem.isFallback();
return DynCGroupMem.getPtr(Offset);
}
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 45bb74ec367d6..ddb0f7f88d2e0 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -273,15 +273,22 @@ struct __tgt_target_non_contig {
extern "C" {
#endif
-typedef enum {
- omp_access_cgroup = 0,
-} omp_access_t;
+/// The OpenMP access group type. The criterion for grouping tasks using a
+/// specific grouping property.
+enum omp_access_t {
+ /// Groups the tasks based on the contention group to which they belong.
+ omp_access_cgroup = 0,
+ /// Groups the tasks based on the parallel region to which they bind.
+ omp_access_pteam = 1,
+};
void ompx_dump_mapping_tables(void);
int omp_get_num_devices(void);
int omp_get_device_num(void);
int omp_get_initial_device(void);
-size_t omp_get_groupprivate_limit(int device_num, omp_access_t access_group = omp_access_cgroup);
+size_t
+omp_get_groupprivate_limit(int device_num,
+ omp_access_t access_group = omp_access_cgroup);
void *omp_target_alloc(size_t Size, int DeviceNum);
void omp_target_free(void *DevicePtr, int DeviceNum);
int omp_target_is_present(const void *Ptr, int DeviceNum);
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index fa373c2029f0c..9751169b09c60 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3441,6 +3441,10 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
KernelArgs.DynCGroupMem);
}
+ // Increase to the requested dynamic memory size for the device if needed.
+ DynBlockMemSize =
+ std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());
+
// Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
getStaticBlockMemSize() + DynBlockMemSize,
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index eda7a85f750f0..b052197e2aa6a 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -1322,6 +1322,10 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (GenericDevice.getRPCServer())
GenericDevice.Plugin.getRPCServer().Thread->notify();
+ // Increase to the requested dynamic memory size for the device if needed.
+ DynBlockMemSize =
+ std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());
+
// In case we require more memory than the current limit.
if (DynBlockMemSize >= MaxDynBlockMemSize) {
CUresult AttrResult = cuFuncSetAttribute(
diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
index 9605bad457e11..3ac62e5893f8b 100644
--- a/openmp/runtime/src/kmp_csupport.cpp
+++ b/openmp/runtime/src/kmp_csupport.cpp
@@ -4517,7 +4517,7 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
void *omp_get_dyn_groupprivate_ptr(size_t offset, int *is_fallback,
omp_access_t access_group) {
- if (is_fallback != NULL)
+ if (is_fallback != nullptr)
*is_fallback = 0;
return NULL;
}
>From c34e0627d5909a7f6e3822ddc9e7f6a844604f10 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Sun, 17 Aug 2025 00:26:46 -0700
Subject: [PATCH 05/38] [OpenMP][Flang] Add empty clause support for
dyn_groupprivate in Flang
---
flang/include/flang/Lower/OpenMP/Clauses.h | 1 +
flang/lib/Lower/OpenMP/Clauses.cpp | 2 ++
flang/lib/Parser/openmp-parsers.cpp | 2 ++
flang/lib/Semantics/check-omp-structure.cpp | 1 +
llvm/include/llvm/Frontend/OpenMP/ClauseT.h | 23 ++++++++++++-------
.../Frontend/OpenMPDecompositionTest.cpp | 1 +
6 files changed, 22 insertions(+), 8 deletions(-)
diff --git a/flang/include/flang/Lower/OpenMP/Clauses.h b/flang/include/flang/Lower/OpenMP/Clauses.h
index 7f317f05f67b7..1ab594ffcd209 100644
--- a/flang/include/flang/Lower/OpenMP/Clauses.h
+++ b/flang/include/flang/Lower/OpenMP/Clauses.h
@@ -219,6 +219,7 @@ using DistSchedule = tomp::clause::DistScheduleT<TypeTy, IdTy, ExprTy>;
using Doacross = tomp::clause::DoacrossT<TypeTy, IdTy, ExprTy>;
using DynamicAllocators =
tomp::clause::DynamicAllocatorsT<TypeTy, IdTy, ExprTy>;
+using DynGroupprivate = tomp::clause::DynGroupprivateT<TypeTy, IdTy, ExprTy>;
using Enter = tomp::clause::EnterT<TypeTy, IdTy, ExprTy>;
using Exclusive = tomp::clause::ExclusiveT<TypeTy, IdTy, ExprTy>;
using Fail = tomp::clause::FailT<TypeTy, IdTy, ExprTy>;
diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp
index 22a07219d3a50..1ab1252f33203 100644
--- a/flang/lib/Lower/OpenMP/Clauses.cpp
+++ b/flang/lib/Lower/OpenMP/Clauses.cpp
@@ -220,6 +220,7 @@ MAKE_EMPTY_CLASS(Acquire, Acquire);
MAKE_EMPTY_CLASS(Capture, Capture);
MAKE_EMPTY_CLASS(Compare, Compare);
MAKE_EMPTY_CLASS(DynamicAllocators, DynamicAllocators);
+MAKE_EMPTY_CLASS(DynGroupprivate, DynGroupprivate);
MAKE_EMPTY_CLASS(Full, Full);
MAKE_EMPTY_CLASS(Inbranch, Inbranch);
MAKE_EMPTY_CLASS(Mergeable, Mergeable);
@@ -769,6 +770,7 @@ Doacross make(const parser::OmpClause::Doacross &inp,
}
// DynamicAllocators: empty
+// DynGroupprivate: empty
Enter make(const parser::OmpClause::Enter &inp,
semantics::SemanticsContext &semaCtx) {
diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index d70aaab82cbab..54082c6bc02f6 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -999,6 +999,8 @@ TYPE_PARSER( //
construct<OmpClause>(parenthesized(Parser<OmpDoacrossClause>{})) ||
"DYNAMIC_ALLOCATORS" >>
construct<OmpClause>(construct<OmpClause::DynamicAllocators>()) ||
+ "DYN_GROUPPRIVATE" >>
+ construct<OmpClause>(construct<OmpClause::DynGroupprivate>()) ||
"ENTER" >> construct<OmpClause>(construct<OmpClause::Enter>(
parenthesized(Parser<OmpObjectList>{}))) ||
"EXCLUSIVE" >> construct<OmpClause>(construct<OmpClause::Exclusive>(
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 2425265e196c6..3675bb6d9d38e 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -2545,6 +2545,7 @@ CHECK_SIMPLE_CLAUSE(Default, OMPC_default)
CHECK_SIMPLE_CLAUSE(Depobj, OMPC_depobj)
CHECK_SIMPLE_CLAUSE(DeviceType, OMPC_device_type)
CHECK_SIMPLE_CLAUSE(DistSchedule, OMPC_dist_schedule)
+CHECK_SIMPLE_CLAUSE(DynGroupprivate, OMPC_dyn_groupprivate)
CHECK_SIMPLE_CLAUSE(Exclusive, OMPC_exclusive)
CHECK_SIMPLE_CLAUSE(Final, OMPC_final)
CHECK_SIMPLE_CLAUSE(Flush, OMPC_flush)
diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
index de888ff86fe91..df75bccc2c867 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
@@ -574,6 +574,12 @@ struct DynamicAllocatorsT {
using EmptyTrait = std::true_type;
};
+// V6.1: `dyn_groupprivate` clause
+template <typename T, typename I, typename E> //
+struct DynGroupprivateT {
+ using EmptyTrait = std::true_type;
+};
+
// V5.2: [5.8.4] `enter` clause
template <typename T, typename I, typename E> //
struct EnterT {
@@ -1243,14 +1249,15 @@ using ExtensionClausesT =
template <typename T, typename I, typename E>
using EmptyClausesT = std::variant<
AcqRelT<T, I, E>, AcquireT<T, I, E>, CaptureT<T, I, E>, CompareT<T, I, E>,
- DynamicAllocatorsT<T, I, E>, FullT<T, I, E>, InbranchT<T, I, E>,
- MergeableT<T, I, E>, NogroupT<T, I, E>, NoOpenmpRoutinesT<T, I, E>,
- NoOpenmpT<T, I, E>, NoParallelismT<T, I, E>, NotinbranchT<T, I, E>,
- NowaitT<T, I, E>, ReadT<T, I, E>, RelaxedT<T, I, E>, ReleaseT<T, I, E>,
- ReverseOffloadT<T, I, E>, SeqCstT<T, I, E>, SimdT<T, I, E>,
- ThreadsT<T, I, E>, UnifiedAddressT<T, I, E>, UnifiedSharedMemoryT<T, I, E>,
- UnknownT<T, I, E>, UntiedT<T, I, E>, UseT<T, I, E>, WeakT<T, I, E>,
- WriteT<T, I, E>, NoOpenmpConstructsT<T, I, E>, SelfMapsT<T, I, E>>;
+ DynamicAllocatorsT<T, I, E>, DynGroupprivateT<T, I, E>, FullT<T, I, E>,
+ InbranchT<T, I, E>, MergeableT<T, I, E>, NogroupT<T, I, E>,
+ NoOpenmpRoutinesT<T, I, E>, NoOpenmpT<T, I, E>, NoParallelismT<T, I, E>,
+ NotinbranchT<T, I, E>, NowaitT<T, I, E>, ReadT<T, I, E>, RelaxedT<T, I, E>,
+ ReleaseT<T, I, E>, ReverseOffloadT<T, I, E>, SeqCstT<T, I, E>,
+ SimdT<T, I, E>, ThreadsT<T, I, E>, UnifiedAddressT<T, I, E>,
+ UnifiedSharedMemoryT<T, I, E>, UnknownT<T, I, E>, UntiedT<T, I, E>,
+ UseT<T, I, E>, WeakT<T, I, E>, WriteT<T, I, E>,
+ NoOpenmpConstructsT<T, I, E>, SelfMapsT<T, I, E>>;
template <typename T, typename I, typename E>
using IncompleteClausesT =
diff --git a/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp b/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp
index 6189d0954891b..79d640968fe86 100644
--- a/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp
@@ -88,6 +88,7 @@ using DistSchedule = tomp::clause::DistScheduleT<TypeTy, IdTy, ExprTy>;
using Doacross = tomp::clause::DoacrossT<TypeTy, IdTy, ExprTy>;
using DynamicAllocators =
tomp::clause::DynamicAllocatorsT<TypeTy, IdTy, ExprTy>;
+using DynGroupprivate = tomp::clause::DynGroupprivateT<TypeTy, IdTy, ExprTy>;
using Enter = tomp::clause::EnterT<TypeTy, IdTy, ExprTy>;
using Exclusive = tomp::clause::ExclusiveT<TypeTy, IdTy, ExprTy>;
using Fail = tomp::clause::FailT<TypeTy, IdTy, ExprTy>;
>From 0a7a96d0c47a7e707fecb7b01d2a8ac262b70990 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek at amd.com>
Date: Mon, 18 Aug 2025 16:01:33 -0500
Subject: [PATCH 06/38] Fix merge in ClauseT.h
---
llvm/include/llvm/Frontend/OpenMP/ClauseT.h | 17 ++++++++---------
1 file changed, 8 insertions(+), 9 deletions(-)
diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
index b30c30e590062..8ea50e7e8d416 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
@@ -1254,15 +1254,14 @@ using ExtensionClausesT =
template <typename T, typename I, typename E>
using EmptyClausesT = std::variant<
AcqRelT<T, I, E>, AcquireT<T, I, E>, CaptureT<T, I, E>, CompareT<T, I, E>,
- DynamicAllocatorsT<T, I, E>, DynGroupprivateT<T, I, E>, FullT<T, I, E>,
- InbranchT<T, I, E>, MergeableT<T, I, E>, NogroupT<T, I, E>,
- NoOpenmpRoutinesT<T, I, E>, NoOpenmpT<T, I, E>, NoParallelismT<T, I, E>,
- NotinbranchT<T, I, E>, NowaitT<T, I, E>, ReadT<T, I, E>, RelaxedT<T, I, E>,
- ReleaseT<T, I, E>, ReverseOffloadT<T, I, E>, SeqCstT<T, I, E>,
- SimdT<T, I, E>, ThreadsT<T, I, E>, UnifiedAddressT<T, I, E>,
- UnifiedSharedMemoryT<T, I, E>, UnknownT<T, I, E>, UntiedT<T, I, E>,
- UseT<T, I, E>, WeakT<T, I, E>, WriteT<T, I, E>,
- NoOpenmpConstructsT<T, I, E>, SelfMapsT<T, I, E>>;
+ DynamicAllocatorsT<T, I, E>, FullT<T, I, E>, InbranchT<T, I, E>,
+ MergeableT<T, I, E>, NogroupT<T, I, E>, NoOpenmpRoutinesT<T, I, E>,
+ NoOpenmpT<T, I, E>, NoParallelismT<T, I, E>, NotinbranchT<T, I, E>,
+ NowaitT<T, I, E>, ReadT<T, I, E>, RelaxedT<T, I, E>, ReleaseT<T, I, E>,
+ ReverseOffloadT<T, I, E>, SeqCstT<T, I, E>, SimdT<T, I, E>,
+ ThreadsT<T, I, E>, UnifiedAddressT<T, I, E>, UnifiedSharedMemoryT<T, I, E>,
+ UnknownT<T, I, E>, UntiedT<T, I, E>, UseT<T, I, E>, WeakT<T, I, E>,
+ WriteT<T, I, E>, NoOpenmpConstructsT<T, I, E>, SelfMapsT<T, I, E>>;
template <typename T, typename I, typename E>
using IncompleteClausesT =
>From b5b843914647679b0d58b619c652d60b3c1e83e8 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek at amd.com>
Date: Mon, 18 Aug 2025 16:03:46 -0500
Subject: [PATCH 07/38] more merge fixes
---
flang/lib/Lower/OpenMP/Clauses.cpp | 2 --
llvm/include/llvm/Frontend/OpenMP/OMP.td | 3 ++-
2 files changed, 2 insertions(+), 3 deletions(-)
diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp
index 729dba42a0777..1a16e1c87e250 100644
--- a/flang/lib/Lower/OpenMP/Clauses.cpp
+++ b/flang/lib/Lower/OpenMP/Clauses.cpp
@@ -220,7 +220,6 @@ MAKE_EMPTY_CLASS(Acquire, Acquire);
MAKE_EMPTY_CLASS(Capture, Capture);
MAKE_EMPTY_CLASS(Compare, Compare);
MAKE_EMPTY_CLASS(DynamicAllocators, DynamicAllocators);
-MAKE_EMPTY_CLASS(DynGroupprivate, DynGroupprivate);
MAKE_EMPTY_CLASS(Full, Full);
MAKE_EMPTY_CLASS(Inbranch, Inbranch);
MAKE_EMPTY_CLASS(Mergeable, Mergeable);
@@ -772,7 +771,6 @@ Doacross make(const parser::OmpClause::Doacross &inp,
}
// DynamicAllocators: empty
-// DynGroupprivate: empty
DynGroupprivate make(const parser::OmpClause::DynGroupprivate &inp,
semantics::SemanticsContext &semaCtx) {
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index b107e7f6f185a..56dd92fc2c023 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -2738,4 +2738,5 @@ def OMP_teams_loop : Directive<[Spelling<"teams loop">]> {
];
let leafConstructs = [OMP_Teams, OMP_loop];
let category = CA_Executable;
-}
\ No newline at end of file
+}
+
>From 6e4c54736e28b34ada5484952c794da54caa34c9 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek at amd.com>
Date: Mon, 18 Aug 2025 16:05:27 -0500
Subject: [PATCH 08/38] more merge fixes
---
llvm/include/llvm/Frontend/OpenMP/OMP.td | 2 --
1 file changed, 2 deletions(-)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index 56dd92fc2c023..043fc591c9f76 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -2662,7 +2662,6 @@ def OMP_TeamsDistributeParallelForSimd
VersionedClause<OMPC_Collapse>,
VersionedClause<OMPC_Default>,
VersionedClause<OMPC_DistSchedule>,
- VersionedClause<OMPC_DynGroupprivate>,
VersionedClause<OMPC_FirstPrivate>,
VersionedClause<OMPC_If>,
VersionedClause<OMPC_LastPrivate>,
@@ -2739,4 +2738,3 @@ def OMP_teams_loop : Directive<[Spelling<"teams loop">]> {
let leafConstructs = [OMP_Teams, OMP_loop];
let category = CA_Executable;
}
-
>From 84fc963023b8638cf44c68efef21a5f8d0d8128b Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Sun, 24 Aug 2025 19:47:24 -0700
Subject: [PATCH 09/38] Add fixes and improvements after merge
---
clang/lib/AST/StmtProfile.cpp | 2 +-
.../target_dyn_groupprivate_messages.cpp | 6 +-
...target_teams_dyn_groupprivate_messages.cpp | 6 +-
.../teams_dyn_groupprivate_messages.cpp | 85 +++++++++++++++++++
4 files changed, 90 insertions(+), 9 deletions(-)
create mode 100644 clang/test/OpenMP/teams_dyn_groupprivate_messages.cpp
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index 1c8fec1e7328e..714c740c8fe13 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -958,7 +958,7 @@ void OMPClauseProfiler::VisitOMPXDynCGroupMemClause(
}
void OMPClauseProfiler::VisitOMPDynGroupprivateClause(
const OMPDynGroupprivateClause *C) {
- VistOMPClauseWithPreInit(C);
+ VisitOMPClauseWithPreInit(C);
if (auto *Size = C->getSize())
Profiler->VisitStmt(Size);
}
diff --git a/clang/test/OpenMP/target_dyn_groupprivate_messages.cpp b/clang/test/OpenMP/target_dyn_groupprivate_messages.cpp
index d5d855ee33e1f..f924d2bb45eaa 100644
--- a/clang/test/OpenMP/target_dyn_groupprivate_messages.cpp
+++ b/clang/test/OpenMP/target_dyn_groupprivate_messages.cpp
@@ -1,7 +1,5 @@
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 %s -Wuninitialized
-// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 %s -Wuninitialized
-// RUN: %clang_cc1 -verify -fopenmp %s -Wuninitialized
-// RUN: %clang_cc1 -verify -fopenmp-simd %s -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 %s -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 %s -Wuninitialized
void foo() {
}
diff --git a/clang/test/OpenMP/target_teams_dyn_groupprivate_messages.cpp b/clang/test/OpenMP/target_teams_dyn_groupprivate_messages.cpp
index 422dff547355c..d05bb433eab1c 100644
--- a/clang/test/OpenMP/target_teams_dyn_groupprivate_messages.cpp
+++ b/clang/test/OpenMP/target_teams_dyn_groupprivate_messages.cpp
@@ -1,7 +1,5 @@
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 %s -Wuninitialized
-// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 %s -Wuninitialized
-// RUN: %clang_cc1 -verify -fopenmp %s -Wuninitialized
-// RUN: %clang_cc1 -verify -fopenmp-simd %s -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 %s -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 %s -Wuninitialized
void foo() {
}
diff --git a/clang/test/OpenMP/teams_dyn_groupprivate_messages.cpp b/clang/test/OpenMP/teams_dyn_groupprivate_messages.cpp
new file mode 100644
index 0000000000000..a55a2a570a1b8
--- /dev/null
+++ b/clang/test/OpenMP/teams_dyn_groupprivate_messages.cpp
@@ -0,0 +1,85 @@
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 %s -Wuninitialized
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 %s -Wuninitialized
+
+void foo() {
+}
+
+bool foobool(int argc) {
+ return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, class S> // expected-note {{declared here}}
+int tmain(T argc, S **argv) {
+ T z;
+ #pragma omp teams dyn_groupprivate // expected-error {{expected '(' after 'dyn_groupprivate'}}
+ foo();
+ #pragma omp teams dyn_groupprivate ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp teams dyn_groupprivate () // expected-error {{expected expression}}
+ foo();
+ #pragma omp teams dyn_groupprivate (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp teams dyn_groupprivate (argc)) // expected-warning {{extra tokens at the end of '#pragma omp teams' are ignored}}
+ foo();
+ #pragma omp teams dyn_groupprivate (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+ foo();
+ #pragma omp teams dyn_groupprivate (foobool(argc)), dyn_groupprivate (true) // expected-error {{directive '#pragma omp teams' cannot contain more than one 'dyn_groupprivate' clause}}
+ foo();
+ #pragma omp teams dyn_groupprivate (S) // expected-error {{'S' does not refer to a value}}
+ foo();
+ #pragma omp teams dyn_groupprivate (argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp teams dyn_groupprivate (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp teams dyn_groupprivate(argc+z)
+ foo();
+ return 0;
+}
+
+int main(int argc, char **argv) {
+constexpr int n = -1;
+int z;
+ #pragma omp teams dyn_groupprivate // expected-error {{expected '(' after 'dyn_groupprivate'}}
+ foo();
+ #pragma omp teams dyn_groupprivate ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp teams dyn_groupprivate () // expected-error {{expected expression}}
+ foo();
+ #pragma omp teams dyn_groupprivate (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp teams dyn_groupprivate (argc)) // expected-warning {{extra tokens at the end of '#pragma omp teams' are ignored}}
+ foo();
+ #pragma omp teams dyn_groupprivate (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+ foo();
+ #pragma omp teams dyn_groupprivate (foobool(argc)), dyn_groupprivate (true) // expected-error {{directive '#pragma omp teams' cannot contain more than one 'dyn_groupprivate' clause}}
+ foo();
+ #pragma omp teams dyn_groupprivate (S1) // expected-error {{'S1' does not refer to a value}}
+ foo();
+ #pragma omp teams dyn_groupprivate (argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp teams dyn_groupprivate (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp teams dyn_groupprivate (1 0) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp teams dyn_groupprivate(dyn_groupprivate(tmain(argc, argv) // expected-error2 {{expected ')'}} expected-note2 {{to match this '('}} expected-note {{in instantiation of function template specialization 'tmain<int, char>' requested here}}
+ foo();
+ #pragma omp teams dyn_groupprivate(-1) // expected-error {{argument to 'dyn_groupprivate' clause must be a non-negative integer value}}
+ foo();
+ #pragma omp teams dyn_groupprivate(cgrou) // expected-error {{use of undeclared identifier 'cgrou'}}
+ foo();
+ #pragma omp teams dyn_groupprivate(cgrou: argc) // expected-error {{use of undeclared identifier 'cgrou'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp teams dyn_groupprivate(cgroup,cgroup: argc) // expected-error {{modifier 'cgroup' cannot be used along with modifier 'cgroup' in dyn_groupprivate}}
+ foo();
+ #pragma omp teams dyn_groupprivate(fallback,strict: argc) // expected-error {{modifier 'strict' cannot be used along with modifier 'fallback' in dyn_groupprivate}}
+ foo();
+ #pragma omp teams dyn_groupprivate(strict,fallback: argc) // expected-error {{modifier 'fallback' cannot be used along with modifier 'strict' in dyn_groupprivate}}
+ foo();
+ #pragma omp teams dyn_groupprivate(: argc) // expected-error {{expected ')'}} expected-error {{expected expression}} expected-note {{to match this '('}}
+ foo();
+
+ return tmain(argc, argv);
+}
+
>From 86f0cf0a5fcbea3dfbb07dfcaeb610380596113b Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Wed, 22 Oct 2025 22:14:16 -0700
Subject: [PATCH 10/38] Update syntax for fallback complex modifier
---
clang/include/clang/AST/OpenMPClause.h | 65 +++++++--------
.../clang/Basic/DiagnosticSemaKinds.td | 2 +-
clang/include/clang/Basic/OpenMPKinds.def | 11 ++-
clang/include/clang/Basic/OpenMPKinds.h | 10 ++-
clang/include/clang/Sema/SemaOpenMP.h | 2 +-
clang/lib/AST/OpenMPClause.cpp | 13 ++-
clang/lib/Basic/OpenMPKinds.cpp | 7 ++
clang/lib/Parse/ParseOpenMP.cpp | 79 ++++++++++++++-----
clang/lib/Sema/SemaOpenMP.cpp | 23 ++----
clang/lib/Sema/TreeTransform.h | 10 +--
clang/lib/Serialization/ASTReader.cpp | 11 +--
clang/lib/Serialization/ASTWriter.cpp | 8 +-
.../target_dyn_groupprivate_messages.cpp | 8 +-
...target_teams_dyn_groupprivate_messages.cpp | 8 +-
.../teams_dyn_groupprivate_messages.cpp | 8 +-
15 files changed, 158 insertions(+), 107 deletions(-)
diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h
index a3983120df069..f671884280134 100644
--- a/clang/include/clang/AST/OpenMPClause.h
+++ b/clang/include/clang/AST/OpenMPClause.h
@@ -9781,8 +9781,8 @@ class OMPDynGroupprivateClause : public OMPClause, public OMPClauseWithPreInit {
SourceLocation LParenLoc;
/// Modifiers for 'dyn_groupprivate' clause.
- enum { FIRST, SECOND, NUM_MODIFIERS };
- OpenMPDynGroupprivateClauseModifier Modifiers[NUM_MODIFIERS];
+ enum { SIMPLE, FALLBACK, NUM_MODIFIERS };
+ unsigned Modifiers[NUM_MODIFIERS];
/// Locations of modifiers.
SourceLocation ModifiersLoc[NUM_MODIFIERS];
@@ -9793,37 +9793,26 @@ class OMPDynGroupprivateClause : public OMPClause, public OMPClauseWithPreInit {
/// Set the first dyn_groupprivate modifier.
///
/// \param M The modifier.
- void setFirstDynGroupprivateModifier(OpenMPDynGroupprivateClauseModifier M) {
- Modifiers[FIRST] = M;
+ void setDynGroupprivateModifier(OpenMPDynGroupprivateClauseModifier M) {
+ Modifiers[SIMPLE] = M;
}
/// Set the second dyn_groupprivate modifier.
///
/// \param M The modifier.
- void setSecondDynGroupprivateModifier(OpenMPDynGroupprivateClauseModifier M) {
- Modifiers[SECOND] = M;
+ void setDynGroupprivateFallbackModifier(
+ OpenMPDynGroupprivateClauseFallbackModifier M) {
+ Modifiers[FALLBACK] = M;
}
/// Set location of the first dyn_groupprivate modifier.
- void setFirstDynGroupprivateModifierLoc(SourceLocation Loc) {
- ModifiersLoc[FIRST] = Loc;
+ void setDynGroupprivateModifierLoc(SourceLocation Loc) {
+ ModifiersLoc[SIMPLE] = Loc;
}
/// Set location of the second dyn_groupprivate modifier.
- void setSecondDynGroupprivateModifierLoc(SourceLocation Loc) {
- ModifiersLoc[SECOND] = Loc;
- }
-
- /// Set dyn_groupprivate modifier location.
- ///
- /// \param M The modifier location.
- void setDynGroupprivateModifer(OpenMPDynGroupprivateClauseModifier M) {
- if (Modifiers[FIRST] == OMPC_DYN_GROUPPRIVATE_unknown)
- Modifiers[FIRST] = M;
- else {
- assert(Modifiers[SECOND] == OMPC_DYN_GROUPPRIVATE_unknown);
- Modifiers[SECOND] = M;
- }
+ void setDynGroupprivateFallbackModifierLoc(SourceLocation Loc) {
+ ModifiersLoc[FALLBACK] = Loc;
}
/// Sets the location of '('.
@@ -9852,15 +9841,15 @@ class OMPDynGroupprivateClause : public OMPClause, public OMPClauseWithPreInit {
OpenMPDirectiveKind CaptureRegion,
OpenMPDynGroupprivateClauseModifier M1,
SourceLocation M1Loc,
- OpenMPDynGroupprivateClauseModifier M2,
+ OpenMPDynGroupprivateClauseFallbackModifier M2,
SourceLocation M2Loc)
: OMPClause(llvm::omp::OMPC_dyn_groupprivate, StartLoc, EndLoc),
OMPClauseWithPreInit(this), LParenLoc(LParenLoc), Size(Size) {
setPreInitStmt(HelperSize, CaptureRegion);
- Modifiers[FIRST] = M1;
- Modifiers[SECOND] = M2;
- ModifiersLoc[FIRST] = M1Loc;
- ModifiersLoc[SECOND] = M2Loc;
+ Modifiers[SIMPLE] = M1;
+ Modifiers[FALLBACK] = M2;
+ ModifiersLoc[SIMPLE] = M1Loc;
+ ModifiersLoc[FALLBACK] = M2Loc;
}
/// Build an empty clause.
@@ -9868,31 +9857,33 @@ class OMPDynGroupprivateClause : public OMPClause, public OMPClauseWithPreInit {
: OMPClause(llvm::omp::OMPC_dyn_groupprivate, SourceLocation(),
SourceLocation()),
OMPClauseWithPreInit(this) {
- Modifiers[FIRST] = OMPC_DYN_GROUPPRIVATE_unknown;
- Modifiers[SECOND] = OMPC_DYN_GROUPPRIVATE_unknown;
+ Modifiers[SIMPLE] = OMPC_DYN_GROUPPRIVATE_unknown;
+ Modifiers[FALLBACK] = OMPC_DYN_GROUPPRIVATE_FALLBACK_unknown;
}
/// Get the first modifier of the clause.
- OpenMPDynGroupprivateClauseModifier getFirstDynGroupprivateModifier() const {
- return Modifiers[FIRST];
+ OpenMPDynGroupprivateClauseModifier getDynGroupprivateModifier() const {
+ return static_cast<OpenMPDynGroupprivateClauseModifier>(Modifiers[SIMPLE]);
}
/// Get the second modifier of the clause.
- OpenMPDynGroupprivateClauseModifier getSecondDynGroupprivateModifier() const {
- return Modifiers[SECOND];
+ OpenMPDynGroupprivateClauseFallbackModifier
+ getDynGroupprivateFallbackModifier() const {
+ return static_cast<OpenMPDynGroupprivateClauseFallbackModifier>(
+ Modifiers[FALLBACK]);
}
/// Get location of '('.
SourceLocation getLParenLoc() { return LParenLoc; }
/// Get the first modifier location.
- SourceLocation getFirstDynGroupprivateModifierLoc() const {
- return ModifiersLoc[FIRST];
+ SourceLocation getDynGroupprivateModifierLoc() const {
+ return ModifiersLoc[SIMPLE];
}
/// Get the second modifier location.
- SourceLocation getSecondDynGroupprivateModifierLoc() const {
- return ModifiersLoc[SECOND];
+ SourceLocation getDynGroupprivateFallbackModifierLoc() const {
+ return ModifiersLoc[FALLBACK];
}
/// Get size.
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 47f1e43814fbb..0a66ab7cc3e53 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -12058,7 +12058,7 @@ def err_omp_unexpected_schedule_modifier : Error<
"modifier '%0' cannot be used along with modifier '%1'">;
def err_omp_schedule_nonmonotonic_static : Error<
"'nonmonotonic' modifier can only be specified with 'dynamic' or 'guided' schedule kind">;
-def err_omp_unexpected_dyn_groupprivate_modifier
+def err_omp_incompatible_dyn_groupprivate_modifier
: Error<"modifier '%0' cannot be used along with modifier '%1' in "
"dyn_groupprivate">;
def err_omp_simple_clause_incompatible_with_ordered : Error<
diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def
index 3321e19cae9b1..3dd919cf9d2d8 100644
--- a/clang/include/clang/Basic/OpenMPKinds.def
+++ b/clang/include/clang/Basic/OpenMPKinds.def
@@ -86,6 +86,9 @@
#ifndef OPENMP_DYN_GROUPPRIVATE_MODIFIER
#define OPENMP_DYN_GROUPPRIVATE_MODIFIER(Name)
#endif
+#ifndef OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER
+#define OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER(Name)
+#endif
#ifndef OPENMP_NUMTASKS_MODIFIER
#define OPENMP_NUMTASKS_MODIFIER(Name)
#endif
@@ -232,8 +235,11 @@ OPENMP_GRAINSIZE_MODIFIER(strict)
// Modifiers for the 'dyn_groupprivate' clause.
OPENMP_DYN_GROUPPRIVATE_MODIFIER(cgroup)
-OPENMP_DYN_GROUPPRIVATE_MODIFIER(strict)
-OPENMP_DYN_GROUPPRIVATE_MODIFIER(fallback)
+
+// Fallback modifiers for the 'dyn_groupprivate' clause.
+OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER(abort)
+OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER(null)
+OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER(default_mem)
// Modifiers for the 'num_tasks' clause.
OPENMP_NUMTASKS_MODIFIER(strict)
@@ -254,6 +260,7 @@ OPENMP_DOACROSS_MODIFIER(source_omp_cur_iteration)
#undef OPENMP_NUMTASKS_MODIFIER
#undef OPENMP_NUMTHREADS_MODIFIER
#undef OPENMP_DYN_GROUPPRIVATE_MODIFIER
+#undef OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER
#undef OPENMP_GRAINSIZE_MODIFIER
#undef OPENMP_BIND_KIND
#undef OPENMP_ADJUST_ARGS_KIND
diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h
index 3e164bf1adf22..6f9f30aef4311 100644
--- a/clang/include/clang/Basic/OpenMPKinds.h
+++ b/clang/include/clang/Basic/OpenMPKinds.h
@@ -223,9 +223,13 @@ enum OpenMPDynGroupprivateClauseModifier {
OMPC_DYN_GROUPPRIVATE_unknown
};
-/// Number of allowed dyn_groupprivate-modifiers.
-static constexpr unsigned NumberOfOMPDynGroupprivateClauseModifiers =
- OMPC_DYN_GROUPPRIVATE_unknown;
+enum OpenMPDynGroupprivateClauseFallbackModifier {
+ OMPC_DYN_GROUPPRIVATE_FALLBACK_unknown = OMPC_DYN_GROUPPRIVATE_unknown,
+#define OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER(Name) \
+ OMPC_DYN_GROUPPRIVATE_FALLBACK_##Name,
+#include "clang/Basic/OpenMPKinds.def"
+ OMPC_DYN_GROUPPRIVATE_FALLBACK_last
+};
enum OpenMPNumTasksClauseModifier {
#define OPENMP_NUMTASKS_MODIFIER(Name) OMPC_NUMTASKS_##Name,
diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h
index 3b161ff3c7d45..b44bc5ff8d71a 100644
--- a/clang/include/clang/Sema/SemaOpenMP.h
+++ b/clang/include/clang/Sema/SemaOpenMP.h
@@ -1388,7 +1388,7 @@ class SemaOpenMP : public SemaBase {
/// Called on a well-formed 'dyn_groupprivate' clause.
OMPClause *ActOnOpenMPDynGroupprivateClause(
OpenMPDynGroupprivateClauseModifier M1,
- OpenMPDynGroupprivateClauseModifier M2, Expr *Size,
+ OpenMPDynGroupprivateClauseFallbackModifier M2, Expr *Size,
SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation M1Loc,
SourceLocation M2Loc, SourceLocation EndLoc);
diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp
index 8f694bd857b27..749e9d2a7cc34 100644
--- a/clang/lib/AST/OpenMPClause.cpp
+++ b/clang/lib/AST/OpenMPClause.cpp
@@ -2726,15 +2726,14 @@ void OMPClausePrinter::VisitOMPXDynCGroupMemClause(
void OMPClausePrinter::VisitOMPDynGroupprivateClause(
OMPDynGroupprivateClause *Node) {
OS << "dyn_groupprivate(";
- if (Node->getFirstDynGroupprivateModifier() !=
- OMPC_DYN_GROUPPRIVATE_unknown) {
- OS << getOpenMPSimpleClauseTypeName(
- OMPC_dyn_groupprivate, Node->getFirstDynGroupprivateModifier());
- if (Node->getSecondDynGroupprivateModifier() !=
- OMPC_DYN_GROUPPRIVATE_unknown) {
+ if (Node->getDynGroupprivateModifier() != OMPC_DYN_GROUPPRIVATE_unknown) {
+ OS << getOpenMPSimpleClauseTypeName(OMPC_dyn_groupprivate,
+ Node->getDynGroupprivateModifier());
+ if (Node->getDynGroupprivateFallbackModifier() !=
+ OMPC_DYN_GROUPPRIVATE_FALLBACK_unknown) {
OS << ", ";
OS << getOpenMPSimpleClauseTypeName(
- OMPC_dyn_groupprivate, Node->getSecondDynGroupprivateModifier());
+ OMPC_dyn_groupprivate, Node->getDynGroupprivateFallbackModifier());
}
OS << ": ";
}
diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp
index 59baab0da1655..eb2888f448036 100644
--- a/clang/lib/Basic/OpenMPKinds.cpp
+++ b/clang/lib/Basic/OpenMPKinds.cpp
@@ -175,6 +175,9 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind, StringRef Str,
return llvm::StringSwitch<unsigned>(Str)
#define OPENMP_DYN_GROUPPRIVATE_MODIFIER(Name) \
.Case(#Name, OMPC_DYN_GROUPPRIVATE_##Name)
+#define OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER(Name) \
+ .Case(#Name, OMPC_DYN_GROUPPRIVATE_FALLBACK_##Name) \
+ .Case("fallback(" #Name ")", OMPC_DYN_GROUPPRIVATE_FALLBACK_##Name)
#include "clang/Basic/OpenMPKinds.def"
.Default(OMPC_DYN_GROUPPRIVATE_unknown);
}
@@ -518,10 +521,14 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind,
case OMPC_dyn_groupprivate:
switch (Type) {
case OMPC_DYN_GROUPPRIVATE_unknown:
+ case OMPC_DYN_GROUPPRIVATE_FALLBACK_last:
return "unknown";
#define OPENMP_DYN_GROUPPRIVATE_MODIFIER(Name) \
case OMPC_DYN_GROUPPRIVATE_##Name: \
return #Name;
+#define OPENMP_DYN_GROUPPRIVATE_FALLBACK_MODIFIER(Name) \
+ case OMPC_DYN_GROUPPRIVATE_FALLBACK_##Name: \
+ return "fallback(" #Name ")";
#include "clang/Basic/OpenMPKinds.def"
}
llvm_unreachable("Invalid OpenMP 'dyn_groupprivate' clause modifier");
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index c7e6a6267b453..115ed74227175 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -3837,31 +3837,72 @@ OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPDirectiveKind DKind,
KLoc.emplace_back();
}
} else if (Kind == OMPC_dyn_groupprivate) {
- enum { Modifier1, Modifier2, NumberOfElements };
- Arg.resize(NumberOfElements);
- KLoc.resize(NumberOfElements);
- Arg[Modifier1] = OMPC_DYN_GROUPPRIVATE_unknown;
- Arg[Modifier2] = OMPC_DYN_GROUPPRIVATE_unknown;
- unsigned Modifier = getOpenMPSimpleClauseType(
- Kind, Tok.isAnnotation() ? "" : PP.getSpelling(Tok), getLangOpts());
+ enum { SimpleModifier, ComplexModifier, NumberOfModifiers };
+ Arg.resize(NumberOfModifiers);
+ KLoc.resize(NumberOfModifiers);
+ Arg[SimpleModifier] = OMPC_DYN_GROUPPRIVATE_unknown;
+ Arg[ComplexModifier] = OMPC_DYN_GROUPPRIVATE_FALLBACK_unknown;
+
+ auto consumeModifier = [&]() {
+ unsigned Type = NumberOfModifiers;
+ unsigned Modifier;
+ SourceLocation Loc;
+ if (PP.getSpelling(Tok) == "fallback" && NextToken().is(tok::l_paren)) {
+ ConsumeToken();
+ BalancedDelimiterTracker ParenT(*this, tok::l_paren, tok::r_paren);
+ ParenT.consumeOpen();
- if (Modifier < OMPC_DYN_GROUPPRIVATE_unknown) {
- // Parse 'modifier'
- Arg[Modifier1] = Modifier;
- KLoc[Modifier1] = Tok.getLocation();
- if (Tok.isNot(tok::r_paren) && Tok.isNot(tok::comma) &&
- Tok.isNot(tok::annot_pragma_openmp_end))
- ConsumeAnyToken();
- if (Tok.is(tok::comma)) {
- // Parse ',' 'modifier'
- ConsumeAnyToken();
Modifier = getOpenMPSimpleClauseType(
Kind, Tok.isAnnotation() ? "" : PP.getSpelling(Tok), getLangOpts());
- Arg[Modifier2] = Modifier;
- KLoc[Modifier2] = Tok.getLocation();
+ if (Modifier <= OMPC_DYN_GROUPPRIVATE_FALLBACK_unknown ||
+ Modifier >= OMPC_DYN_GROUPPRIVATE_FALLBACK_last) {
+ Diag(Tok.getLocation(), diag::err_expected)
+ << "'abort', 'null' or 'default_mem' in fallback modifier";
+ SkipUntil(tok::r_paren);
+ return std::make_tuple(Type, Modifier, Loc);
+ }
+ Type = ComplexModifier;
+ Loc = Tok.getLocation();
if (Tok.isNot(tok::r_paren) && Tok.isNot(tok::comma) &&
Tok.isNot(tok::annot_pragma_openmp_end))
ConsumeAnyToken();
+ ParenT.consumeClose();
+ } else {
+ Modifier = getOpenMPSimpleClauseType(
+ Kind, Tok.isAnnotation() ? "" : PP.getSpelling(Tok), getLangOpts());
+ if (Modifier < OMPC_DYN_GROUPPRIVATE_unknown) {
+ Type = SimpleModifier;
+ Loc = Tok.getLocation();
+ if (Tok.isNot(tok::r_paren) && Tok.isNot(tok::comma) &&
+ Tok.isNot(tok::annot_pragma_openmp_end))
+ ConsumeAnyToken();
+ }
+ }
+ return std::make_tuple(Type, Modifier, Loc);
+ };
+
+ auto saveModifier = [&](unsigned Type, unsigned Modifier,
+ SourceLocation Loc) {
+ assert(Type < NumberOfModifiers);
+ if (!KLoc[Type].isValid()) {
+ Arg[Type] = Modifier;
+ KLoc[Type] = Loc;
+ } else
+ Diag(Loc, diag::err_omp_incompatible_dyn_groupprivate_modifier)
+ << getOpenMPSimpleClauseTypeName(OMPC_dyn_groupprivate, Modifier)
+ << getOpenMPSimpleClauseTypeName(OMPC_dyn_groupprivate, Arg[Type]);
+ };
+
+ // Parse 'modifier'
+ auto [Type1, Mod1, Loc1] = consumeModifier();
+ if (Type1 < NumberOfModifiers) {
+ saveModifier(Type1, Mod1, Loc1);
+ if (Tok.is(tok::comma)) {
+ // Parse ',' 'modifier'
+ ConsumeAnyToken();
+ auto [Type2, Mod2, Loc2] = consumeModifier();
+ if (Type2 < NumberOfModifiers)
+ saveModifier(Type2, Mod2, Loc2);
}
// Parse ':'
if (Tok.is(tok::colon))
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 67d55acca94df..2a5bab5e22355 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -16731,7 +16731,8 @@ OMPClause *SemaOpenMP::ActOnOpenMPSingleExprWithArgClause(
ArgumentLoc.size() == NumberOfElements);
Res = ActOnOpenMPDynGroupprivateClause(
static_cast<OpenMPDynGroupprivateClauseModifier>(Argument[Modifier1]),
- static_cast<OpenMPDynGroupprivateClauseModifier>(Argument[Modifier2]),
+ static_cast<OpenMPDynGroupprivateClauseFallbackModifier>(
+ Argument[Modifier2]),
Expr, StartLoc, LParenLoc, ArgumentLoc[Modifier1],
ArgumentLoc[Modifier2], EndLoc);
} break;
@@ -24179,12 +24180,12 @@ OMPClause *SemaOpenMP::ActOnOpenMPXDynCGroupMemClause(Expr *Size,
OMPClause *SemaOpenMP::ActOnOpenMPDynGroupprivateClause(
OpenMPDynGroupprivateClauseModifier M1,
- OpenMPDynGroupprivateClauseModifier M2, Expr *Size, SourceLocation StartLoc,
- SourceLocation LParenLoc, SourceLocation M1Loc, SourceLocation M2Loc,
- SourceLocation EndLoc) {
+ OpenMPDynGroupprivateClauseFallbackModifier M2, Expr *Size,
+ SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation M1Loc,
+ SourceLocation M2Loc, SourceLocation EndLoc) {
if ((M1Loc.isValid() && M1 == OMPC_DYN_GROUPPRIVATE_unknown) ||
- (M2Loc.isValid() && M2 == OMPC_DYN_GROUPPRIVATE_unknown)) {
+ (M2Loc.isValid() && M2 == OMPC_DYN_GROUPPRIVATE_FALLBACK_unknown)) {
std::string Values = getListOfPossibleValues(
OMPC_dyn_groupprivate, /*First=*/0, OMPC_DYN_GROUPPRIVATE_unknown);
Diag((M1Loc.isValid() && M1 == OMPC_DYN_GROUPPRIVATE_unknown) ? M1Loc
@@ -24194,18 +24195,6 @@ OMPClause *SemaOpenMP::ActOnOpenMPDynGroupprivateClause(
return nullptr;
}
- if ((M1Loc.isValid() && M2Loc.isValid() && M1 == M2) ||
- (M1 == OMPC_DYN_GROUPPRIVATE_strict &&
- M2 == OMPC_DYN_GROUPPRIVATE_fallback) ||
- (M1 == OMPC_DYN_GROUPPRIVATE_fallback &&
- M2 == OMPC_DYN_GROUPPRIVATE_strict)) {
-
- Diag(M2Loc, diag::err_omp_unexpected_dyn_groupprivate_modifier)
- << getOpenMPSimpleClauseTypeName(OMPC_dyn_groupprivate, M2)
- << getOpenMPSimpleClauseTypeName(OMPC_dyn_groupprivate, M1);
- return nullptr;
- }
-
Expr *ValExpr = Size;
Stmt *HelperValStmt = nullptr;
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 3da81923890ae..e24653069e313 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -2465,7 +2465,7 @@ class TreeTransform {
/// Subclasses may override this routine to provide different behavior.
OMPClause *RebuildOMPDynGroupprivateClause(
OpenMPDynGroupprivateClauseModifier M1,
- OpenMPDynGroupprivateClauseModifier M2, Expr *Size,
+ OpenMPDynGroupprivateClauseFallbackModifier M2, Expr *Size,
SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation M1Loc,
SourceLocation M2Loc, SourceLocation EndLoc) {
return getSema().OpenMP().ActOnOpenMPDynGroupprivateClause(
@@ -11727,10 +11727,10 @@ OMPClause *TreeTransform<Derived>::TransformOMPDynGroupprivateClause(
if (Size.isInvalid())
return nullptr;
return getDerived().RebuildOMPDynGroupprivateClause(
- C->getFirstDynGroupprivateModifier(),
- C->getSecondDynGroupprivateModifier(), Size.get(), C->getBeginLoc(),
- C->getLParenLoc(), C->getFirstDynGroupprivateModifierLoc(),
- C->getSecondDynGroupprivateModifierLoc(), C->getEndLoc());
+ C->getDynGroupprivateModifier(), C->getDynGroupprivateFallbackModifier(),
+ Size.get(), C->getBeginLoc(), C->getLParenLoc(),
+ C->getDynGroupprivateModifierLoc(),
+ C->getDynGroupprivateFallbackModifierLoc(), C->getEndLoc());
}
template <typename Derived>
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 61fd0db41c395..f82899beb3194 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -12714,14 +12714,15 @@ void OMPClauseReader::VisitOMPXDynCGroupMemClause(OMPXDynCGroupMemClause *C) {
void OMPClauseReader::VisitOMPDynGroupprivateClause(
OMPDynGroupprivateClause *C) {
VisitOMPClauseWithPreInit(C);
- C->setFirstDynGroupprivateModifier(
- static_cast<OpenMPDynGroupprivateClauseModifier>(Record.readInt()));
- C->setSecondDynGroupprivateModifier(
+ C->setDynGroupprivateModifier(
static_cast<OpenMPDynGroupprivateClauseModifier>(Record.readInt()));
+ C->setDynGroupprivateFallbackModifier(
+ static_cast<OpenMPDynGroupprivateClauseFallbackModifier>(
+ Record.readInt()));
C->setSize(Record.readSubExpr());
C->setLParenLoc(Record.readSourceLocation());
- C->setFirstDynGroupprivateModifierLoc(Record.readSourceLocation());
- C->setSecondDynGroupprivateModifierLoc(Record.readSourceLocation());
+ C->setDynGroupprivateModifierLoc(Record.readSourceLocation());
+ C->setDynGroupprivateFallbackModifierLoc(Record.readSourceLocation());
}
void OMPClauseReader::VisitOMPDoacrossClause(OMPDoacrossClause *C) {
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 24fc0ceebb055..f298f03683e66 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -8618,12 +8618,12 @@ void OMPClauseWriter::VisitOMPXDynCGroupMemClause(OMPXDynCGroupMemClause *C) {
void OMPClauseWriter::VisitOMPDynGroupprivateClause(
OMPDynGroupprivateClause *C) {
VisitOMPClauseWithPreInit(C);
- Record.push_back(C->getFirstDynGroupprivateModifier());
- Record.push_back(C->getSecondDynGroupprivateModifier());
+ Record.push_back(C->getDynGroupprivateModifier());
+ Record.push_back(C->getDynGroupprivateFallbackModifier());
Record.AddStmt(C->getSize());
Record.AddSourceLocation(C->getLParenLoc());
- Record.AddSourceLocation(C->getFirstDynGroupprivateModifierLoc());
- Record.AddSourceLocation(C->getSecondDynGroupprivateModifierLoc());
+ Record.AddSourceLocation(C->getDynGroupprivateModifierLoc());
+ Record.AddSourceLocation(C->getDynGroupprivateFallbackModifierLoc());
}
void OMPClauseWriter::VisitOMPDoacrossClause(OMPDoacrossClause *C) {
diff --git a/clang/test/OpenMP/target_dyn_groupprivate_messages.cpp b/clang/test/OpenMP/target_dyn_groupprivate_messages.cpp
index f924d2bb45eaa..385bd5e89829d 100644
--- a/clang/test/OpenMP/target_dyn_groupprivate_messages.cpp
+++ b/clang/test/OpenMP/target_dyn_groupprivate_messages.cpp
@@ -73,9 +73,13 @@ int z;
foo();
#pragma omp target dyn_groupprivate(cgroup,cgroup: argc) // expected-error {{modifier 'cgroup' cannot be used along with modifier 'cgroup' in dyn_groupprivate}}
foo();
- #pragma omp target dyn_groupprivate(fallback,strict: argc) // expected-error {{modifier 'strict' cannot be used along with modifier 'fallback' in dyn_groupprivate}}
+ #pragma omp target dyn_groupprivate(fallback(default_mem),fallback(abort): argc) // expected-error {{modifier 'fallback(abort)' cannot be used along with modifier 'fallback(default_mem)' in dyn_groupprivate}}
foo();
- #pragma omp target dyn_groupprivate(strict,fallback: argc) // expected-error {{modifier 'fallback' cannot be used along with modifier 'strict' in dyn_groupprivate}}
+ #pragma omp target dyn_groupprivate(fallback(abort),fallback(null): argc) // expected-error {{modifier 'fallback(null)' cannot be used along with modifier 'fallback(abort)' in dyn_groupprivate}}
+ foo();
+ #pragma omp target dyn_groupprivate(fallback(cgroup): argc) // expected-error {{expected 'abort', 'null' or 'default_mem' in fallback modifier}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp target dyn_groupprivate(fallback(): argc) // expected-error {{expected 'abort', 'null' or 'default_mem' in fallback modifier}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
foo();
#pragma omp target dyn_groupprivate(: argc) // expected-error {{expected ')'}} expected-error {{expected expression}} expected-note {{to match this '('}}
foo();
diff --git a/clang/test/OpenMP/target_teams_dyn_groupprivate_messages.cpp b/clang/test/OpenMP/target_teams_dyn_groupprivate_messages.cpp
index d05bb433eab1c..ac2cc0dde5073 100644
--- a/clang/test/OpenMP/target_teams_dyn_groupprivate_messages.cpp
+++ b/clang/test/OpenMP/target_teams_dyn_groupprivate_messages.cpp
@@ -73,9 +73,13 @@ int z;
foo();
#pragma omp target teams dyn_groupprivate(cgroup,cgroup: argc) // expected-error {{modifier 'cgroup' cannot be used along with modifier 'cgroup' in dyn_groupprivate}}
foo();
- #pragma omp target teams dyn_groupprivate(fallback,strict: argc) // expected-error {{modifier 'strict' cannot be used along with modifier 'fallback' in dyn_groupprivate}}
+ #pragma omp target teams dyn_groupprivate(fallback(default_mem),fallback(abort): argc) // expected-error {{modifier 'fallback(abort)' cannot be used along with modifier 'fallback(default_mem)' in dyn_groupprivate}}
foo();
- #pragma omp target teams dyn_groupprivate(strict,fallback: argc) // expected-error {{modifier 'fallback' cannot be used along with modifier 'strict' in dyn_groupprivate}}
+ #pragma omp target teams dyn_groupprivate(fallback(abort),fallback(null): argc) // expected-error {{modifier 'fallback(null)' cannot be used along with modifier 'fallback(abort)' in dyn_groupprivate}}
+ foo();
+ #pragma omp target teams dyn_groupprivate(fallback(cgroup): argc) // expected-error {{expected 'abort', 'null' or 'default_mem' in fallback modifier}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp target teams dyn_groupprivate(fallback(): argc) // expected-error {{expected 'abort', 'null' or 'default_mem' in fallback modifier}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
foo();
#pragma omp target teams dyn_groupprivate(: argc) // expected-error {{expected ')'}} expected-error {{expected expression}} expected-note {{to match this '('}}
foo();
diff --git a/clang/test/OpenMP/teams_dyn_groupprivate_messages.cpp b/clang/test/OpenMP/teams_dyn_groupprivate_messages.cpp
index a55a2a570a1b8..701ebfb43eec6 100644
--- a/clang/test/OpenMP/teams_dyn_groupprivate_messages.cpp
+++ b/clang/test/OpenMP/teams_dyn_groupprivate_messages.cpp
@@ -73,9 +73,13 @@ int z;
foo();
#pragma omp teams dyn_groupprivate(cgroup,cgroup: argc) // expected-error {{modifier 'cgroup' cannot be used along with modifier 'cgroup' in dyn_groupprivate}}
foo();
- #pragma omp teams dyn_groupprivate(fallback,strict: argc) // expected-error {{modifier 'strict' cannot be used along with modifier 'fallback' in dyn_groupprivate}}
+ #pragma omp teams dyn_groupprivate(fallback(default_mem),fallback(abort): argc) // expected-error {{modifier 'fallback(abort)' cannot be used along with modifier 'fallback(default_mem)' in dyn_groupprivate}}
foo();
- #pragma omp teams dyn_groupprivate(strict,fallback: argc) // expected-error {{modifier 'fallback' cannot be used along with modifier 'strict' in dyn_groupprivate}}
+ #pragma omp teams dyn_groupprivate(fallback(abort),fallback(null): argc) // expected-error {{modifier 'fallback(null)' cannot be used along with modifier 'fallback(abort)' in dyn_groupprivate}}
+ foo();
+ #pragma omp teams dyn_groupprivate(fallback(cgroup): argc) // expected-error {{expected 'abort', 'null' or 'default_mem' in fallback modifier}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+ foo();
+ #pragma omp teams dyn_groupprivate(fallback(): argc) // expected-error {{expected 'abort', 'null' or 'default_mem' in fallback modifier}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
foo();
#pragma omp teams dyn_groupprivate(: argc) // expected-error {{expected ')'}} expected-error {{expected expression}} expected-note {{to match this '('}}
foo();
>From 3a2fe705a6c11e484ed232a166fd332634fd8a70 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Fri, 24 Oct 2025 18:57:14 -0700
Subject: [PATCH 11/38] Update for fallback complex modifier
---
clang/lib/CodeGen/CGOpenMPRuntime.cpp | 26 ++++++++++++++-----
.../llvm/Frontend/OpenMP/OMPConstants.h | 10 +++++++
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 11 ++++----
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 16 ++++++------
4 files changed, 44 insertions(+), 19 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 48168193778b8..2d08e1a04ff03 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -10000,10 +10000,10 @@ static llvm::Value *emitDeviceID(
return DeviceID;
}
-static std::pair<llvm::Value *, bool>
+static std::pair<llvm::Value *, OMPDynGroupprivateFallbackType>
emitDynCGroupMem(const OMPExecutableDirective &D, CodeGenFunction &CGF) {
llvm::Value *DynGP = CGF.Builder.getInt32(0);
- bool DynGPFallback = false;
+ OMPDynGroupprivateFallbackType DynGPFallback = OMPDynGroupprivateFallbackType::Abort;
if (auto *DynGPClause = D.getSingleClause<OMPDynGroupprivateClause>()) {
CodeGenFunction::RunCleanupsScope DynGPScope(CGF);
@@ -10011,10 +10011,22 @@ emitDynCGroupMem(const OMPExecutableDirective &D, CodeGenFunction &CGF) {
CGF.EmitScalarExpr(DynGPClause->getSize(), /*IgnoreResultAssign=*/true);
DynGP = CGF.Builder.CreateIntCast(DynGPVal, CGF.Int32Ty,
/*isSigned=*/false);
- DynGPFallback = (DynGPClause->getFirstDynGroupprivateModifier() !=
- OMPC_DYN_GROUPPRIVATE_strict &&
- DynGPClause->getSecondDynGroupprivateModifier() !=
- OMPC_DYN_GROUPPRIVATE_strict);
+ auto FallbackModifier = DynGPClause->getDynGroupprivateFallbackModifier();
+ switch (FallbackModifier) {
+ case OMPC_DYN_GROUPPRIVATE_FALLBACK_abort:
+ DynGPFallback = OMPDynGroupprivateFallbackType::Abort;
+ break;
+ case OMPC_DYN_GROUPPRIVATE_FALLBACK_null:
+ DynGPFallback = OMPDynGroupprivateFallbackType::Null;
+ break;
+ case OMPC_DYN_GROUPPRIVATE_FALLBACK_default_mem:
+ case OMPC_DYN_GROUPPRIVATE_FALLBACK_unknown:
+ // This is the default for dyn_groupprivate.
+ DynGPFallback = OMPDynGroupprivateFallbackType::DefaultMem;
+ break;
+ default:
+ llvm_unreachable("Unknown fallback modifier for OpenMP dyn_groupprivate");
+ }
} else if (auto *OMPXDynCGClause =
D.getSingleClause<OMPXDynCGroupMemClause>()) {
CodeGenFunction::RunCleanupsScope DynCGMemScope(CGF);
@@ -10022,9 +10034,11 @@ emitDynCGroupMem(const OMPExecutableDirective &D, CodeGenFunction &CGF) {
/*IgnoreResultAssign=*/true);
DynGP = CGF.Builder.CreateIntCast(DynCGMemVal, CGF.Int32Ty,
/*isSigned=*/false);
+ DynGPFallback = OMPDynGroupprivateFallbackType::Abort;
}
return {DynGP, DynGPFallback};
}
+
static void genMapInfoForCaptures(
MappableExprsHandler &MEHandler, CodeGenFunction &CGF,
const CapturedStmt &CS, llvm::SmallVectorImpl<llvm::Value *> &CapturedVars,
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
index 7bec7e0c6736d..1ac9ac040468c 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
@@ -190,6 +190,16 @@ enum class OMPScheduleType {
LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ ModifierMask)
};
+/// The fallback types for the dyn_groupprivate clause.
+enum class OMPDynGroupprivateFallbackType : uint64_t {
+ /// Abort the execution.
+ Abort = 0,
+ /// Return a null pointer.
+ Null = 1,
+ /// Allocate from an implementation-defined memory space.
+ DefaultMem = 2
+};
+
// Default OpenMP mapper name suffix.
inline constexpr const char *OmpDefaultMapperName = ".omp.default.mapper";
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index f20b58409302f..aa370606c6539 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2449,20 +2449,21 @@ class OpenMPIRBuilder {
Value *DynCGGroupMem = nullptr;
/// True if the kernel has 'no wait' clause.
bool HasNoWait = false;
- /// True if the dynamic shared memory may fallback.
- bool MayFallbackDynCGroupMem = false;
+ /// The fallback mechanism for the shared memory.
+ omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback =
+ omp::OMPDynGroupprivateFallbackType::Abort;
// Constructors for TargetKernelArgs.
TargetKernelArgs() {}
TargetKernelArgs(unsigned NumTargetItems, TargetDataRTArgs RTArgs,
Value *NumIterations, ArrayRef<Value *> NumTeams,
ArrayRef<Value *> NumThreads, Value *DynCGGroupMem,
- bool HasNoWait, bool MayFallbackDynCGroupMem)
+ bool HasNoWait,
+ omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback)
: NumTargetItems(NumTargetItems), RTArgs(RTArgs),
NumIterations(NumIterations), NumTeams(NumTeams),
NumThreads(NumThreads), DynCGGroupMem(DynCGGroupMem),
- HasNoWait(HasNoWait),
- MayFallbackDynCGroupMem(MayFallbackDynCGroupMem) {}
+ HasNoWait(HasNoWait), DynCGroupMemFallback(DynCGroupMemFallback) {}
};
/// Create the kernel args vector used by emitTargetKernel. This function
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 4cfe038c36eee..9f80e3eb61bfc 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -532,11 +532,11 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
Value *HasNoWaitFlag = Builder.getInt64(KernelArgs.HasNoWait);
- Value *MayFallbackDynCGroupMemFlag =
- Builder.getInt64(KernelArgs.MayFallbackDynCGroupMem);
- MayFallbackDynCGroupMemFlag =
- Builder.CreateShl(MayFallbackDynCGroupMemFlag, 2);
- Value *Flags = Builder.CreateOr(HasNoWaitFlag, MayFallbackDynCGroupMemFlag);
+
+ Value *DynCGroupMemFallbackFlag =
+ Builder.getInt64(static_cast<uint64_t>(KernelArgs.DynCGroupMemFallback));
+ DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2);
+ Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag);
assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());
@@ -8368,9 +8368,9 @@ static void emitTargetCall(
// TODO: Use correct DynCGGroupMem
Value *DynCGGroupMem = Builder.getInt32(0);
- KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount,
- NumTeamsC, NumThreadsC,
- DynCGGroupMem, HasNoWait, false);
+ KArgs = OpenMPIRBuilder::TargetKernelArgs(
+ NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC,
+ DynCGGroupMem, HasNoWait, OMPDynGroupprivateFallbackType::Abort);
// Assume no error was returned because TaskBodyCB and
// EmitTargetCallFallbackCB don't produce any.
>From 5f68ea0d103c89f0770dc833399b060b7cb5e0b4 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Sun, 26 Oct 2025 13:24:22 -0700
Subject: [PATCH 12/38] Add support for null fallback
---
offload/include/Shared/APITypes.h | 5 +-
offload/include/Shared/Environment.h | 15 +++++-
.../common/include/PluginInterface.h | 3 +-
.../common/src/PluginInterface.cpp | 47 +++++++++++------
...rivate_strict.cpp => dyn_groupprivate.cpp} | 50 +++++++++++++++----
openmp/device/src/State.cpp | 16 +++---
6 files changed, 97 insertions(+), 39 deletions(-)
rename offload/test/offloading/{dyn_groupprivate_strict.cpp => dyn_groupprivate.cpp} (64%)
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 66fba27fb0497..6183686290bd4 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -102,9 +102,8 @@ struct KernelArgsTy {
struct {
uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
- uint64_t AllowDynCGroupMemFallback : 1; // Allow fallback for dynamic cgroup
- // mem fallback.
- uint64_t Unused : 61;
+ uint64_t DynCGroupMemFallback : 2; // The fallback for dynamic cgroup mem.
+ uint64_t Unused : 60;
} Flags = {0, 0, 0, 0};
// The number of teams (for x,y,z dimension).
uint32_t NumTeams[3] = {0, 0, 0};
diff --git a/offload/include/Shared/Environment.h b/offload/include/Shared/Environment.h
index 0670ac1090da4..87b4d5e0f0952 100644
--- a/offload/include/Shared/Environment.h
+++ b/offload/include/Shared/Environment.h
@@ -92,12 +92,25 @@ struct KernelEnvironmentTy {
DynamicEnvironmentTy *DynamicEnv = nullptr;
};
+/// The fallback types for the dynamic cgroup memory.
+enum class DynCGroupMemFallbackType : unsigned char {
+ /// None. Indicates that no fallback was triggered.
+ None = 0,
+ /// Abort the execution. Shares the value of None.
+ Abort = None,
+ /// Return a null pointer.
+ Null = 1,
+ /// Allocate from an implementation-defined memory space.
+ DefaultMem = 2
+};
+
struct KernelLaunchEnvironmentTy {
void *ReductionBuffer = nullptr;
- void *DynCGroupMemFallback = nullptr;
+ void *DynCGroupMemFbPtr = nullptr;
uint32_t ReductionCnt = 0;
uint32_t ReductionIterCnt = 0;
uint32_t DynCGroupMemSize = 0;
+ DynCGroupMemFallbackType DynCGroupMemFb = DynCGroupMemFallbackType::None;
};
#endif // OMPTARGET_SHARED_ENVIRONMENT_H
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 2b215bae1bc7d..64e358d7dc1fd 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -392,7 +392,8 @@ struct GenericKernelTy {
/// Return a device pointer to a new kernel launch environment.
Expected<KernelLaunchEnvironmentTy *> getKernelLaunchEnvironment(
GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs,
- void *FallbackBlockMem, AsyncInfoWrapperTy &AsyncInfo) const;
+ uint32_t BlockMemSize, DynCGroupMemFallbackType DynBlockMemFb,
+ void *DynBlockMemFbPtr, AsyncInfoWrapperTy &AsyncInfoWrapper) const;
/// Indicate whether an execution mode is valid.
static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 5973193f1a7fb..99a7939077729 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -437,7 +437,8 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
Expected<KernelLaunchEnvironmentTy *>
GenericKernelTy::getKernelLaunchEnvironment(
GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs,
- void *FallbackBlockMem, AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+ uint32_t BlockMemSize, DynCGroupMemFallbackType DynBlockMemFb,
+ void *DynBlockMemFbPtr, AsyncInfoWrapperTy &AsyncInfoWrapper) const {
// Ctor/Dtor have no arguments, replaying uses the original kernel launch
// environment. Older versions of the compiler do not generate a kernel
// launch environment.
@@ -479,8 +480,9 @@ GenericKernelTy::getKernelLaunchEnvironment(
LocalKLE.ReductionBuffer = nullptr;
}
- LocalKLE.DynCGroupMemSize = KernelArgs.DynCGroupMem;
- LocalKLE.DynCGroupMemFallback = FallbackBlockMem;
+ LocalKLE.DynCGroupMemSize = BlockMemSize;
+ LocalKLE.DynCGroupMemFbPtr = DynBlockMemFbPtr;
+ LocalKLE.DynCGroupMemFb = DynBlockMemFb;
INFO(OMP_INFOTYPE_DATA_TRANSFER, GenericDevice.getDeviceId(),
"Copying data from host to device, HstPtr=" DPxMOD ", TgtPtr=" DPxMOD
@@ -539,28 +541,43 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
if (StaticBlockMemSize > MaxBlockMemSize)
return Plugin::error(ErrorCode::INVALID_ARGUMENT,
"Static block memory size exceeds maximum");
- else if (!KernelArgs.Flags.AllowDynCGroupMemFallback &&
+ else if (static_cast<DynCGroupMemFallbackType>(
+ KernelArgs.Flags.DynCGroupMemFallback) ==
+ DynCGroupMemFallbackType::Abort &&
TotalBlockMemSize > MaxBlockMemSize)
return Plugin::error(
ErrorCode::INVALID_ARGUMENT,
"Static and dynamic block memory size exceeds maximum");
- void *FallbackBlockMem = nullptr;
+ void *DynBlockMemFbPtr = nullptr;
+ uint32_t DynBlockMemLaunchSize = DynBlockMemSize;
+
+ DynCGroupMemFallbackType DynBlockMemFb = DynCGroupMemFallbackType::None;
if (DynBlockMemSize && (!GenericDevice.hasNativeBlockSharedMem() ||
TotalBlockMemSize > MaxBlockMemSize)) {
- auto AllocOrErr = GenericDevice.dataAlloc(
- NumBlocks[0] * DynBlockMemSize,
- /*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE);
- if (!AllocOrErr)
- return AllocOrErr.takeError();
+ // Launch without native dynamic block memory.
+ DynBlockMemLaunchSize = 0;
+ DynBlockMemFb = static_cast<DynCGroupMemFallbackType>(
+ KernelArgs.Flags.DynCGroupMemFallback);
+ if (DynBlockMemFb == DynCGroupMemFallbackType::DefaultMem) {
+ // Get global memory as fallback.
+ auto AllocOrErr = GenericDevice.dataAlloc(
+ NumBlocks[0] * DynBlockMemSize,
+ /*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE);
+ if (!AllocOrErr)
+ return AllocOrErr.takeError();
- FallbackBlockMem = *AllocOrErr;
- AsyncInfoWrapper.freeAllocationAfterSynchronization(FallbackBlockMem);
- DynBlockMemSize = 0;
+ DynBlockMemFbPtr = *AllocOrErr;
+ AsyncInfoWrapper.freeAllocationAfterSynchronization(DynBlockMemFbPtr);
+ } else {
+ // Do not provide any memory as fallback.
+ DynBlockMemSize = 0;
+ }
}
auto KernelLaunchEnvOrErr = getKernelLaunchEnvironment(
- GenericDevice, KernelArgs, FallbackBlockMem, AsyncInfoWrapper);
+ GenericDevice, KernelArgs, DynBlockMemSize, DynBlockMemFb,
+ DynBlockMemFbPtr, AsyncInfoWrapper);
if (!KernelLaunchEnvOrErr)
return KernelLaunchEnvOrErr.takeError();
@@ -591,7 +608,7 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks))
return Err;
- return launchImpl(GenericDevice, NumThreads, NumBlocks, DynBlockMemSize,
+ return launchImpl(GenericDevice, NumThreads, NumBlocks, DynBlockMemLaunchSize,
KernelArgs, LaunchParams, AsyncInfoWrapper);
}
diff --git a/offload/test/offloading/dyn_groupprivate_strict.cpp b/offload/test/offloading/dyn_groupprivate.cpp
similarity index 64%
rename from offload/test/offloading/dyn_groupprivate_strict.cpp
rename to offload/test/offloading/dyn_groupprivate.cpp
index a35f8dd2b0595..45f7f16f2ebf2 100644
--- a/offload/test/offloading/dyn_groupprivate_strict.cpp
+++ b/offload/test/offloading/dyn_groupprivate.cpp
@@ -1,4 +1,7 @@
-// RUN: %libomptarget-compilexx-run-and-check-generic
+// RUN: %libomptarget-compilexx-generic -fopenmp-version=61
+// RUN: %libomptarget-run-generic | %fcheck-generic
+// RUN: %libomptarget-compileoptxx-generic -fopenmp-version=61
+// RUN: %libomptarget-run-generic | %fcheck-generic
// REQUIRES: gpu
#include <omp.h>
@@ -9,8 +12,9 @@
int main() {
int Result[N], NumThreads;
+// Verify the groupprivate buffer works as expected.
#pragma omp target teams num_teams(1) thread_limit(N) \
- dyn_groupprivate(strict : N * sizeof(Result[0])) \
+ dyn_groupprivate(fallback(abort) : N * sizeof(Result[0])) \
map(from : Result, NumThreads)
{
int Buffer[N];
@@ -51,8 +55,8 @@ int main() {
size_t MaxSize = omp_get_groupprivate_limit(0, omp_access_cgroup);
size_t ExceededSize = MaxSize + 10;
-// Verify that the fallback modifier works.
-#pragma omp target dyn_groupprivate(fallback : ExceededSize) \
+// Verify that the fallback(default_mem) modifier works.
+#pragma omp target dyn_groupprivate(fallback(default_mem) : ExceededSize) \
map(tofrom : Failed)
{
int IsFallback;
@@ -66,13 +70,35 @@ int main() {
++Failed;
}
-// Verify that the default modifier is fallback.
+// Verify that the fallback(null) modifier works.
+#pragma omp target dyn_groupprivate(fallback(null) : ExceededSize) \
+ map(tofrom : Failed)
+ {
+ int IsFallback;
+ if ((TmpPtr = omp_get_dyn_groupprivate_ptr(0, &IsFallback)))
+ ++Failed;
+ if ((TmpSize = omp_get_dyn_groupprivate_size()))
+ ++Failed;
+ if (!IsFallback)
+ ++Failed;
+ }
+
+// Verify that the default modifier is fallback(default_mem).
#pragma omp target dyn_groupprivate(ExceededSize)
{
+ int IsFallback;
+ if (!omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ ++Failed;
+ if (!omp_get_dyn_groupprivate_size())
+ ++Failed;
+ if (omp_get_dyn_groupprivate_size() != ExceededSize)
+ ++Failed;
+ if (!IsFallback)
+ ++Failed;
}
-// Verify that the strict modifier works.
-#pragma omp target dyn_groupprivate(strict : N) map(tofrom : Failed)
+// Verify that the fallback(abort) modifier works.
+#pragma omp target dyn_groupprivate(fallback(abort) : N) map(tofrom : Failed)
{
int IsFallback;
if (!omp_get_dyn_groupprivate_ptr(0, &IsFallback))
@@ -85,8 +111,9 @@ int main() {
++Failed;
}
-// Verify that the fallback does not trigger when not needed.
-#pragma omp target dyn_groupprivate(fallback : N) map(tofrom : Failed)
+// Verify that the fallback(default_mem) does not trigger when not needed.
+#pragma omp target dyn_groupprivate(fallback(default_mem) : N) \
+ map(tofrom : Failed)
{
int IsFallback;
if (!omp_get_dyn_groupprivate_ptr(0, &IsFallback))
@@ -100,7 +127,7 @@ int main() {
}
// Verify that the clause works when passing a zero size.
-#pragma omp target dyn_groupprivate(strict : 0) map(tofrom : Failed)
+#pragma omp target dyn_groupprivate(fallback(abort) : 0) map(tofrom : Failed)
{
int IsFallback;
if (omp_get_dyn_groupprivate_ptr(0, &IsFallback))
@@ -112,7 +139,8 @@ int main() {
}
// Verify that the clause works when passing a zero size.
-#pragma omp target dyn_groupprivate(fallback : 0) map(tofrom : Failed)
+#pragma omp target dyn_groupprivate(fallback(default_mem) : 0) \
+ map(tofrom : Failed)
{
int IsFallback;
if (omp_get_dyn_groupprivate_ptr(0, &IsFallback))
diff --git a/openmp/device/src/State.cpp b/openmp/device/src/State.cpp
index 268afa1f841d3..a34ecf6f54681 100644
--- a/openmp/device/src/State.cpp
+++ b/openmp/device/src/State.cpp
@@ -162,27 +162,27 @@ struct DynCGroupMemTy {
void init(KernelLaunchEnvironmentTy *KLE, void *NativeDynCGroup) {
Size = 0;
Ptr = nullptr;
- IsFallback = false;
+ Fallback = DynCGroupMemFallbackType::None;
if (!KLE)
return;
Size = KLE->DynCGroupMemSize;
- if (void *Fallback = KLE->DynCGroupMemFallback) {
- Ptr = static_cast<char *>(Fallback) + Size * omp_get_team_num();
- IsFallback = true;
- } else {
+ Fallback = KLE->DynCGroupMemFb;
+ if (Fallback == DynCGroupMemFallbackType::None)
Ptr = static_cast<char *>(NativeDynCGroup);
- }
+ else if (Fallback == DynCGroupMemFallbackType::DefaultMem)
+ Ptr = static_cast<char *>(KLE->DynCGroupMemFbPtr) +
+ Size * omp_get_team_num();
}
char *getPtr(size_t Offset) const { return Ptr + Offset; }
- bool isFallback() const { return IsFallback; }
+ bool isFallback() const { return Fallback != DynCGroupMemFallbackType::None; }
size_t getSize() const { return Size; }
private:
char *Ptr;
size_t Size;
- bool IsFallback;
+ DynCGroupMemFallbackType Fallback;
};
[[clang::loader_uninitialized]] static Local<DynCGroupMemTy> DynCGroupMem;
>From 9fb6e274d622f14dd79ef5003ab3da122dd6518b Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Sun, 26 Oct 2025 20:04:27 -0700
Subject: [PATCH 13/38] Fix test
---
offload/test/offloading/dyn_groupprivate.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/offload/test/offloading/dyn_groupprivate.cpp b/offload/test/offloading/dyn_groupprivate.cpp
index 45f7f16f2ebf2..f36c35e611eb8 100644
--- a/offload/test/offloading/dyn_groupprivate.cpp
+++ b/offload/test/offloading/dyn_groupprivate.cpp
@@ -75,9 +75,9 @@ int main() {
map(tofrom : Failed)
{
int IsFallback;
- if ((TmpPtr = omp_get_dyn_groupprivate_ptr(0, &IsFallback)))
+ if (omp_get_dyn_groupprivate_ptr(0, &IsFallback))
++Failed;
- if ((TmpSize = omp_get_dyn_groupprivate_size()))
+ if (omp_get_dyn_groupprivate_size())
++Failed;
if (!IsFallback)
++Failed;
>From 4662a4ffde75d5cef92f94cceb3c83195d883da7 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Sun, 26 Oct 2025 20:05:11 -0700
Subject: [PATCH 14/38] Fix parsing
---
clang/lib/Parse/ParseOpenMP.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 71a5cc9950368..0df6a5ae663a6 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -4020,7 +4020,7 @@ OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPDirectiveKind DKind,
unsigned Type = NumberOfModifiers;
unsigned Modifier;
SourceLocation Loc;
- if (PP.getSpelling(Tok) == "fallback" && NextToken().is(tok::l_paren)) {
+ if (!Tok.isAnnotation() && PP.getSpelling(Tok) == "fallback" && NextToken().is(tok::l_paren)) {
ConsumeToken();
BalancedDelimiterTracker ParenT(*this, tok::l_paren, tok::r_paren);
ParenT.consumeOpen();
>From 64e7abc6d6483be586d089544f1d816c4b514feb Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Sun, 26 Oct 2025 20:10:03 -0700
Subject: [PATCH 15/38] Fix format
---
clang/lib/Parse/ParseOpenMP.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 0df6a5ae663a6..60d332b5d6532 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -4020,7 +4020,8 @@ OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPDirectiveKind DKind,
unsigned Type = NumberOfModifiers;
unsigned Modifier;
SourceLocation Loc;
- if (!Tok.isAnnotation() && PP.getSpelling(Tok) == "fallback" && NextToken().is(tok::l_paren)) {
+ if (!Tok.isAnnotation() && PP.getSpelling(Tok) == "fallback" &&
+ NextToken().is(tok::l_paren)) {
ConsumeToken();
BalancedDelimiterTracker ParenT(*this, tok::l_paren, tok::r_paren);
ParenT.consumeOpen();
>From 7b53c9a678f6d3bed3befd6c524815717b53d268 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Mon, 27 Oct 2025 09:02:55 -0700
Subject: [PATCH 16/38] Fix review comments
---
clang/lib/Parse/ParseOpenMP.cpp | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 60d332b5d6532..2c7d3abfc1f7a 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -4016,7 +4016,7 @@ OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPDirectiveKind DKind,
Arg[SimpleModifier] = OMPC_DYN_GROUPPRIVATE_unknown;
Arg[ComplexModifier] = OMPC_DYN_GROUPPRIVATE_FALLBACK_unknown;
- auto consumeModifier = [&]() {
+ auto ConsumeModifier = [&]() {
unsigned Type = NumberOfModifiers;
unsigned Modifier;
SourceLocation Loc;
@@ -4055,28 +4055,29 @@ OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPDirectiveKind DKind,
return std::make_tuple(Type, Modifier, Loc);
};
- auto saveModifier = [&](unsigned Type, unsigned Modifier,
+ auto SaveModifier = [&](unsigned Type, unsigned Modifier,
SourceLocation Loc) {
assert(Type < NumberOfModifiers);
if (!KLoc[Type].isValid()) {
Arg[Type] = Modifier;
KLoc[Type] = Loc;
- } else
+ } else {
Diag(Loc, diag::err_omp_incompatible_dyn_groupprivate_modifier)
<< getOpenMPSimpleClauseTypeName(OMPC_dyn_groupprivate, Modifier)
<< getOpenMPSimpleClauseTypeName(OMPC_dyn_groupprivate, Arg[Type]);
+ }
};
// Parse 'modifier'
- auto [Type1, Mod1, Loc1] = consumeModifier();
+ auto [Type1, Mod1, Loc1] = ConsumeModifier();
if (Type1 < NumberOfModifiers) {
- saveModifier(Type1, Mod1, Loc1);
+ SaveModifier(Type1, Mod1, Loc1);
if (Tok.is(tok::comma)) {
// Parse ',' 'modifier'
ConsumeAnyToken();
- auto [Type2, Mod2, Loc2] = consumeModifier();
+ auto [Type2, Mod2, Loc2] = ConsumeModifier();
if (Type2 < NumberOfModifiers)
- saveModifier(Type2, Mod2, Loc2);
+ SaveModifier(Type2, Mod2, Loc2);
}
// Parse ':'
if (Tok.is(tok::colon))
>From 79b34f1994c4f19d2c226b09a352370bbadba756 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Mon, 27 Oct 2025 09:06:33 -0700
Subject: [PATCH 17/38] Fix more review comments
---
clang/lib/Parse/ParseOpenMP.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 2c7d3abfc1f7a..a44281f370caa 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -4057,7 +4057,7 @@ OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPDirectiveKind DKind,
auto SaveModifier = [&](unsigned Type, unsigned Modifier,
SourceLocation Loc) {
- assert(Type < NumberOfModifiers);
+ assert(Type < NumberOfModifiers && "Unexpected modifier type");
if (!KLoc[Type].isValid()) {
Arg[Type] = Modifier;
KLoc[Type] = Loc;
>From c4905e056722d56d2b096c6e7e30c9580b2a4461 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Thu, 30 Oct 2025 14:30:00 -0700
Subject: [PATCH 18/38] Fix initialization of a fallback variable
---
clang/lib/CodeGen/CGOpenMPRuntime.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 2d08e1a04ff03..74076eb5a9733 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -10003,7 +10003,7 @@ static llvm::Value *emitDeviceID(
static std::pair<llvm::Value *, OMPDynGroupprivateFallbackType>
emitDynCGroupMem(const OMPExecutableDirective &D, CodeGenFunction &CGF) {
llvm::Value *DynGP = CGF.Builder.getInt32(0);
- OMPDynGroupprivateFallbackType DynGPFallback;
+ auto DynGPFallback = OMPDynGroupprivateFallbackType::Abort;
if (auto *DynGPClause = D.getSingleClause<OMPDynGroupprivateClause>()) {
CodeGenFunction::RunCleanupsScope DynGPScope(CGF);
@@ -10034,7 +10034,6 @@ emitDynCGroupMem(const OMPExecutableDirective &D, CodeGenFunction &CGF) {
/*IgnoreResultAssign=*/true);
DynGP = CGF.Builder.CreateIntCast(DynCGMemVal, CGF.Int32Ty,
/*isSigned=*/false);
- DynGPFallback = OMPDynGroupprivateFallbackType::Abort;
}
return {DynGP, DynGPFallback};
}
>From c439e4412e7ad983b8f923ec202c46d07f3b683c Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Thu, 30 Oct 2025 15:13:32 -0700
Subject: [PATCH 19/38] Add codegen test for dyn_groupprivate
---
.../target_dyn_groupprivate_codegen.cpp | 2633 +++++++++++++++++
1 file changed, 2633 insertions(+)
create mode 100644 clang/test/OpenMP/target_dyn_groupprivate_codegen.cpp
diff --git a/clang/test/OpenMP/target_dyn_groupprivate_codegen.cpp b/clang/test/OpenMP/target_dyn_groupprivate_codegen.cpp
new file mode 100644
index 0000000000000..758f35d629ace
--- /dev/null
+++ b/clang/test/OpenMP/target_dyn_groupprivate_codegen.cpp
@@ -0,0 +1,2633 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// Test host codegen.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK3
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK3
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK9
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK9
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK11
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK11
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}"
+
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+
+
+
+// We have 6 target regions
+
+
+
+// Check that target registration is emitted as a Ctor.
+
+
+template<typename tx>
+tx ftemplate(int n) { // Template case: two target teams regions; `n` is unused; bar() instantiates it with tx = int.
+ tx a = 0;
+ // The size operand is a functional cast to the template parameter type.
+ #pragma omp target teams dyn_groupprivate(tx(20))
+ {
+ }
+ // `b` feeds num_teams below; that region uses a constant size of 1024 and reads both `a` and `b`.
+ short b = 1;
+ #pragma omp target teams num_teams(b) dyn_groupprivate(1024)
+ {
+ a += b;
+ }
+
+ return a;
+}
+
+static
+int fstatic(int n) { // Two regions whose dyn_groupprivate sizes depend on the run-time value of n.
+ // Loop directive with num_teams(n); CHECK1 expects this kernel's name to end in _l71 (this pragma's line).
+ #pragma omp target teams distribute parallel for simd num_teams(n) dyn_groupprivate(n*32)
+ for (int i = 0; i < n ; ++i) {
+ }
+ // Same clause with the fallback(default_mem) modifier, combined with nowait.
+ #pragma omp target teams dyn_groupprivate(fallback(default_mem): 32+n) nowait
+ {
+ }
+
+ return n+1;
+}
+
+struct S1 { // Member-function case: both target regions in r1 write the member `a` through this.
+ double a;
+ // CHECK1 matches the two kernels of r1 by names ending in _ZN2S12r1Ei_l88 and _ZN2S12r1Ei_l93.
+ int r1(int n){
+ int b = 1;
+ // fallback(null) modifier; CHECK1 expects n-b to be computed into a capture before the launch.
+ #pragma omp target teams dyn_groupprivate(fallback(null): n-b)
+ {
+ this->a = (double)b + 1.5;
+ }
+ // fallback(abort) modifier on a plain target directive; CHECK1 expects the constant 1024 in the kernel arguments.
+ #pragma omp target dyn_groupprivate(fallback(abort): 1024)
+ {
+ this->a = 2.5;
+ }
+
+ return (int)a;
+ }
+};
+
+int bar(int n){ // Driver: calls each function above that contains target regions and sums the results.
+ int a = 0;
+ // NOTE(review): presumably these calls exist so the inline, static and template definitions are emitted.
+ S1 S;
+ a += S.r1(n);
+
+ a += fstatic(n);
+
+ a += ftemplate<int>(n);
+
+ return a;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+// Check that the offloading functions are emitted and that the parallel function
+// is appropriately guarded.
+
+
+
+
+
+
+#endif
+
+// CHECK1-LABEL: define {{[^@]+}}@_Z3bari
+// CHECK1-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 8
+// CHECK1-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[A]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: [[CALL:%.*]] = call noundef signext i32 @_ZN2S12r1Ei(ptr noundef nonnull align 8 dereferenceable(8) [[S]], i32 noundef signext [[TMP0]])
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CALL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: [[CALL1:%.*]] = call noundef signext i32 @_ZL7fstatici(i32 noundef signext [[TMP2]])
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// CHECK1-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP3]], [[CALL1]]
+// CHECK1-NEXT: store i32 [[ADD2]], ptr [[A]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: [[CALL3:%.*]] = call noundef signext i32 @_Z9ftemplateIiET_i(i32 noundef signext [[TMP4]])
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[A]], align 4
+// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP5]], [[CALL3]]
+// CHECK1-NEXT: store i32 [[ADD4]], ptr [[A]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[A]], align 4
+// CHECK1-NEXT: ret i32 [[TMP6]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZN2S12r1Ei
+// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[B:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[B_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8
+// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS3:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_PTRS4:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS5:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: store i32 1, ptr [[B]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP0]], [[TMP1]]
+// CHECK1-NEXT: store i32 [[SUB]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[B]], align 4
+// CHECK1-NEXT: store i32 [[TMP2]], ptr [[B_CASTED]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i64, ptr [[B_CASTED]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8
+// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT: store ptr [[THIS1]], ptr [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT: store ptr [[A]], ptr [[TMP7]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK1-NEXT: store ptr null, ptr [[TMP8]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK1-NEXT: store i64 [[TMP3]], ptr [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK1-NEXT: store i64 [[TMP3]], ptr [[TMP10]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
+// CHECK1-NEXT: store ptr null, ptr [[TMP11]], align 8
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK1-NEXT: store i64 [[TMP5]], ptr [[TMP12]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK1-NEXT: store i64 [[TMP5]], ptr [[TMP13]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
+// CHECK1-NEXT: store ptr null, ptr [[TMP14]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK1-NEXT: store i32 3, ptr [[TMP18]], align 4
+// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK1-NEXT: store i32 3, ptr [[TMP19]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK1-NEXT: store ptr [[TMP15]], ptr [[TMP20]], align 8
+// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK1-NEXT: store ptr [[TMP16]], ptr [[TMP21]], align 8
+// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK1-NEXT: store ptr @.offload_sizes, ptr [[TMP22]], align 8
+// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK1-NEXT: store ptr @.offload_maptypes, ptr [[TMP23]], align 8
+// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP24]], align 8
+// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP25]], align 8
+// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK1-NEXT: store i64 0, ptr [[TMP26]], align 8
+// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK1-NEXT: store i64 4, ptr [[TMP27]], align 8
+// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP28]], align 4
+// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP29]], align 4
+// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK1-NEXT: store i32 [[TMP17]], ptr [[TMP30]], align 4
+// CHECK1-NEXT: [[TMP31:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.region_id, ptr [[KERNEL_ARGS]])
+// CHECK1-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
+// CHECK1-NEXT: br i1 [[TMP32]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK1: omp_offload.failed:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88(ptr [[THIS1]], i64 [[TMP3]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]]
+// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK1: omp_offload.cont:
+// CHECK1-NEXT: [[A2:%.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
+// CHECK1-NEXT: store ptr [[THIS1]], ptr [[TMP33]], align 8
+// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
+// CHECK1-NEXT: store ptr [[A2]], ptr [[TMP34]], align 8
+// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS5]], i64 0, i64 0
+// CHECK1-NEXT: store ptr null, ptr [[TMP35]], align 8
+// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
+// CHECK1-NEXT: store i32 3, ptr [[TMP38]], align 4
+// CHECK1-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
+// CHECK1-NEXT: store i32 1, ptr [[TMP39]], align 4
+// CHECK1-NEXT: [[TMP40:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
+// CHECK1-NEXT: store ptr [[TMP36]], ptr [[TMP40]], align 8
+// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3
+// CHECK1-NEXT: store ptr [[TMP37]], ptr [[TMP41]], align 8
+// CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4
+// CHECK1-NEXT: store ptr @.offload_sizes.1, ptr [[TMP42]], align 8
+// CHECK1-NEXT: [[TMP43:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5
+// CHECK1-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP43]], align 8
+// CHECK1-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP44]], align 8
+// CHECK1-NEXT: [[TMP45:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP45]], align 8
+// CHECK1-NEXT: [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8
+// CHECK1-NEXT: store i64 0, ptr [[TMP46]], align 8
+// CHECK1-NEXT: [[TMP47:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9
+// CHECK1-NEXT: store i64 0, ptr [[TMP47]], align 8
+// CHECK1-NEXT: [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP48]], align 4
+// CHECK1-NEXT: [[TMP49:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP49]], align 4
+// CHECK1-NEXT: [[TMP50:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12
+// CHECK1-NEXT: store i32 1024, ptr [[TMP50]], align 4
+// CHECK1-NEXT: [[TMP51:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93.region_id, ptr [[KERNEL_ARGS6]])
+// CHECK1-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0
+// CHECK1-NEXT: br i1 [[TMP52]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]]
+// CHECK1: omp_offload.failed7:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93(ptr [[THIS1]]) #[[ATTR2]]
+// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT8]]
+// CHECK1: omp_offload.cont8:
+// CHECK1-NEXT: [[A9:%.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP53:%.*]] = load double, ptr [[A9]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = fptosi double [[TMP53]] to i32
+// CHECK1-NEXT: ret i32 [[CONV]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_ZL7fstatici
+// CHECK1-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED8:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS9:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_PTRS10:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS11:%.*]] = alloca [1 x ptr], align 8
+// CHECK1-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK1-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], 32
+// CHECK1-NEXT: store i32 [[MUL]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 [[TMP7]], ptr [[DOTCAPTURE_EXPR__CASTED2]], align 4
+// CHECK1-NEXT: [[TMP8:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED2]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT: store i64 [[TMP4]], ptr [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT: store i64 [[TMP4]], ptr [[TMP10]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK1-NEXT: store ptr null, ptr [[TMP11]], align 8
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK1-NEXT: store i64 [[TMP6]], ptr [[TMP12]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK1-NEXT: store i64 [[TMP6]], ptr [[TMP13]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
+// CHECK1-NEXT: store ptr null, ptr [[TMP14]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK1-NEXT: store i64 [[TMP8]], ptr [[TMP15]], align 8
+// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK1-NEXT: store i64 [[TMP8]], ptr [[TMP16]], align 8
+// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
+// CHECK1-NEXT: store ptr null, ptr [[TMP17]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP21]], ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP22]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB5:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], 1
+// CHECK1-NEXT: [[TMP24:%.*]] = zext i32 [[ADD]] to i64
+// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP20]], 0
+// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK1-NEXT: store i32 3, ptr [[TMP27]], align 4
+// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK1-NEXT: store i32 3, ptr [[TMP28]], align 4
+// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK1-NEXT: store ptr [[TMP18]], ptr [[TMP29]], align 8
+// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK1-NEXT: store ptr [[TMP19]], ptr [[TMP30]], align 8
+// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK1-NEXT: store ptr @.offload_sizes.3, ptr [[TMP31]], align 8
+// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK1-NEXT: store ptr @.offload_maptypes.4, ptr [[TMP32]], align 8
+// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP33]], align 8
+// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP34]], align 8
+// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK1-NEXT: store i64 [[TMP24]], ptr [[TMP35]], align 8
+// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK1-NEXT: store i64 8, ptr [[TMP36]], align 8
+// CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] [[TMP26]], ptr [[TMP37]], align 4
+// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP38]], align 4
+// CHECK1-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK1-NEXT: store i32 [[TMP25]], ptr [[TMP39]], align 4
+// CHECK1-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP20]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.region_id, ptr [[KERNEL_ARGS]])
+// CHECK1-NEXT: [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0
+// CHECK1-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK1: omp_offload.failed:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71(i64 [[TMP4]], i64 [[TMP6]], i64 [[TMP8]]) #[[ATTR2]]
+// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK1: omp_offload.cont:
+// CHECK1-NEXT: [[TMP42:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 32, [[TMP42]]
+// CHECK1-NEXT: store i32 [[ADD7]], ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT: store i32 [[TMP43]], ptr [[DOTCAPTURE_EXPR__CASTED8]], align 4
+// CHECK1-NEXT: [[TMP44:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED8]], align 8
+// CHECK1-NEXT: [[TMP45:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0
+// CHECK1-NEXT: store i64 [[TMP44]], ptr [[TMP45]], align 8
+// CHECK1-NEXT: [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0
+// CHECK1-NEXT: store i64 [[TMP44]], ptr [[TMP46]], align 8
+// CHECK1-NEXT: [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS11]], i64 0, i64 0
+// CHECK1-NEXT: store ptr null, ptr [[TMP47]], align 8
+// CHECK1-NEXT: [[TMP48:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP49:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP50:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[AGG_CAPTURED]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP51:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK1-NEXT: store i32 [[TMP51]], ptr [[TMP50]], align 4
+// CHECK1-NEXT: [[TMP52:%.*]] = call ptr @__kmpc_omp_target_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i64 64, i64 4, ptr @.omp_task_entry., i64 -1)
+// CHECK1-NEXT: [[TMP53:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP52]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP54:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP53]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP55:%.*]] = load ptr, ptr [[TMP54]], align 8
+// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP55]], ptr align 4 [[AGG_CAPTURED]], i64 4, i1 false)
+// CHECK1-NEXT: [[TMP56:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP52]], i32 0, i32 1
+// CHECK1-NEXT: [[TMP57:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP56]], i32 0, i32 0
+// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP57]], ptr align 8 [[TMP48]], i64 8, i1 false)
+// CHECK1-NEXT: [[TMP58:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP56]], i32 0, i32 1
+// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP58]], ptr align 8 [[TMP49]], i64 8, i1 false)
+// CHECK1-NEXT: [[TMP59:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP56]], i32 0, i32 2
+// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP59]], ptr align 8 @.offload_sizes.5, i64 8, i1 false)
+// CHECK1-NEXT: [[TMP60:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP52]])
+// CHECK1-NEXT: [[TMP61:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP61]], 1
+// CHECK1-NEXT: ret i32 [[ADD12]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i
+// CHECK1-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK1-NEXT: [[B:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i16, align 2
+// CHECK1-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[B_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8
+// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8
+// CHECK1-NEXT: [[KERNEL_ARGS1:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
+// CHECK1-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[A]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK1-NEXT: store i32 3, ptr [[TMP0]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK1-NEXT: store i32 0, ptr [[TMP1]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK1-NEXT: store ptr null, ptr [[TMP2]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK1-NEXT: store ptr null, ptr [[TMP3]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK1-NEXT: store ptr null, ptr [[TMP4]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK1-NEXT: store ptr null, ptr [[TMP5]], align 8
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP7]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK1-NEXT: store i64 0, ptr [[TMP8]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK1-NEXT: store i64 8, ptr [[TMP9]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP10]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP11]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK1-NEXT: store i32 20, ptr [[TMP12]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.region_id, ptr [[KERNEL_ARGS]])
+// CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+// CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK1: omp_offload.failed:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55() #[[ATTR2]]
+// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK1: omp_offload.cont:
+// CHECK1-NEXT: store i16 1, ptr [[B]], align 2
+// CHECK1-NEXT: [[TMP15:%.*]] = load i16, ptr [[B]], align 2
+// CHECK1-NEXT: store i16 [[TMP15]], ptr [[DOTCAPTURE_EXPR_]], align 2
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[A]], align 4
+// CHECK1-NEXT: store i32 [[TMP16]], ptr [[A_CASTED]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = load i16, ptr [[B]], align 2
+// CHECK1-NEXT: store i16 [[TMP18]], ptr [[B_CASTED]], align 2
+// CHECK1-NEXT: [[TMP19:%.*]] = load i64, ptr [[B_CASTED]], align 8
+// CHECK1-NEXT: [[TMP20:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR_]], align 2
+// CHECK1-NEXT: store i16 [[TMP20]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 2
+// CHECK1-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8
+// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT: store i64 [[TMP17]], ptr [[TMP22]], align 8
+// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT: store i64 [[TMP17]], ptr [[TMP23]], align 8
+// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK1-NEXT: store ptr null, ptr [[TMP24]], align 8
+// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK1-NEXT: store i64 [[TMP19]], ptr [[TMP25]], align 8
+// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK1-NEXT: store i64 [[TMP19]], ptr [[TMP26]], align 8
+// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1
+// CHECK1-NEXT: store ptr null, ptr [[TMP27]], align 8
+// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK1-NEXT: store i64 [[TMP21]], ptr [[TMP28]], align 8
+// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK1-NEXT: store i64 [[TMP21]], ptr [[TMP29]], align 8
+// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2
+// CHECK1-NEXT: store ptr null, ptr [[TMP30]], align 8
+// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP33:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR_]], align 2
+// CHECK1-NEXT: [[TMP34:%.*]] = sext i16 [[TMP33]] to i32
+// CHECK1-NEXT: [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0
+// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0
+// CHECK1-NEXT: store i32 3, ptr [[TMP36]], align 4
+// CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1
+// CHECK1-NEXT: store i32 3, ptr [[TMP37]], align 4
+// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2
+// CHECK1-NEXT: store ptr [[TMP31]], ptr [[TMP38]], align 8
+// CHECK1-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 3
+// CHECK1-NEXT: store ptr [[TMP32]], ptr [[TMP39]], align 8
+// CHECK1-NEXT: [[TMP40:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 4
+// CHECK1-NEXT: store ptr @.offload_sizes.7, ptr [[TMP40]], align 8
+// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 5
+// CHECK1-NEXT: store ptr @.offload_maptypes.8, ptr [[TMP41]], align 8
+// CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP42]], align 8
+// CHECK1-NEXT: [[TMP43:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP43]], align 8
+// CHECK1-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 8
+// CHECK1-NEXT: store i64 0, ptr [[TMP44]], align 8
+// CHECK1-NEXT: [[TMP45:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 9
+// CHECK1-NEXT: store i64 8, ptr [[TMP45]], align 8
+// CHECK1-NEXT: [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] [[TMP35]], ptr [[TMP46]], align 4
+// CHECK1-NEXT: [[TMP47:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP47]], align 4
+// CHECK1-NEXT: [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 12
+// CHECK1-NEXT: store i32 1024, ptr [[TMP48]], align 4
+// CHECK1-NEXT: [[TMP49:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP34]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.region_id, ptr [[KERNEL_ARGS1]])
+// CHECK1-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0
+// CHECK1-NEXT: br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED2:%.*]], label [[OMP_OFFLOAD_CONT3:%.*]]
+// CHECK1: omp_offload.failed2:
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60(i64 [[TMP17]], i64 [[TMP19]], i64 [[TMP21]]) #[[ATTR2]]
+// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT3]]
+// CHECK1: omp_offload.cont3:
+// CHECK1-NEXT: [[TMP51:%.*]] = load i32, ptr [[A]], align 4
+// CHECK1-NEXT: ret i32 [[TMP51]]
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88
+// CHECK1-SAME: (ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[B_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[B_CASTED]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i64, ptr [[B_CASTED]], align 8
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined, ptr [[TMP0]], i64 [[TMP2]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to double
+// CHECK1-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
+// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK1-NEXT: store double [[ADD]], ptr [[A]], align 8
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93
+// CHECK1-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK1-NEXT: store double 2.500000e+00, ptr [[A]], align 8
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71
+// CHECK1-SAME: (i64 noundef [[N:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK1-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB1]], i32 [[TMP0]], i32 [[TMP1]], i32 0)
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP2]], ptr [[N_CASTED]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined, i64 [[TMP3]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP5]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
+// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17:![0-9]+]]
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]]
+// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i64
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[N_ADDR]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT: store i32 [[TMP17]], ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT: [[TMP18:%.*]] = load i64, ptr [[N_CASTED]], align 8, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined, i64 [[TMP14]], i64 [[TMP16]], i64 [[TMP18]]), !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP22]])
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
+// CHECK1-NEXT: br i1 [[TMP24]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK1: .omp.final.then:
+// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB6:%.*]] = sub nsw i32 [[TMP25]], 0
+// CHECK1-NEXT: [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV7]], 1
+// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD8]], ptr [[I3]], align 4
+// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK1: .omp.final.done:
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP4]] to i32
+// CHECK1-NEXT: [[TMP5:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP7]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+// CHECK1-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21:![0-9]+]]
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP21]]
+// CHECK1-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
+// CHECK1-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP21]]
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
+// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP16]], 1
+// CHECK1-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP18]])
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
+// CHECK1-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK1: .omp.final.then:
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0
+// CHECK1-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1
+// CHECK1-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1
+// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]]
+// CHECK1-NEXT: store i32 [[ADD11]], ptr [[I4]], align 4
+// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK1: .omp.final.done:
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75
+// CHECK1-SAME: (i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@.omp_task_privates_map.
+// CHECK1-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]], ptr noalias noundef [[TMP2:%.*]], ptr noalias noundef [[TMP3:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 8
+// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTADDR3]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP4]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT: store ptr [[TMP5]], ptr [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP4]], i32 0, i32 1
+// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[DOTADDR2]], align 8
+// CHECK1-NEXT: store ptr [[TMP7]], ptr [[TMP8]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP4]], i32 0, i32 2
+// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTADDR3]], align 8
+// CHECK1-NEXT: store ptr [[TMP9]], ptr [[TMP10]], align 8
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@.omp_task_entry.
+// CHECK1-SAME: (i32 noundef signext [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTFIRSTPRIV_PTR_ADDR2_I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[KERNEL_ARGS_I:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED_I:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4
+// CHECK1-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTADDR]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP3]], i32 0, i32 1
+// CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META24:![0-9]+]])
+// CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META27:![0-9]+]])
+// CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META29:![0-9]+]])
+// CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META31:![0-9]+]])
+// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META33:![0-9]+]]
+// CHECK1-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]]) #[[ATTR2]]
+// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP9]], align 4
+// CHECK1-NEXT: store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
+// CHECK1-NEXT: store i32 1, ptr [[TMP16]], align 4, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
+// CHECK1-NEXT: store ptr [[TMP12]], ptr [[TMP17]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3
+// CHECK1-NEXT: store ptr [[TMP13]], ptr [[TMP18]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4
+// CHECK1-NEXT: store ptr [[TMP14]], ptr [[TMP19]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5
+// CHECK1-NEXT: store ptr @.offload_maptypes.6, ptr [[TMP20]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP21]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP22]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8
+// CHECK1-NEXT: store i64 0, ptr [[TMP23]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9
+// CHECK1-NEXT: store i64 9, ptr [[TMP24]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP25]], align 4, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12
+// CHECK1-NEXT: store i32 [[TMP15]], ptr [[TMP27]], align 4, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.region_id, ptr [[KERNEL_ARGS_I]])
+// CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
+// CHECK1-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]]
+// CHECK1: omp_offload.failed.i:
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP9]], align 4
+// CHECK1-NEXT: store i32 [[TMP30]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP31:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75(i64 [[TMP31]]) #[[ATTR2]]
+// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__EXIT]]
+// CHECK1: .omp_outlined..exit:
+// CHECK1-NEXT: ret i32 0
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55
+// CHECK1-SAME: () #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60
+// CHECK1-SAME: (i64 noundef [[A:%.*]], i64 noundef [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[B_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR__ADDR]], align 2
+// CHECK1-NEXT: [[TMP2:%.*]] = sext i16 [[TMP1]] to i32
+// CHECK1-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB1]], i32 [[TMP0]], i32 [[TMP2]], i32 0)
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP3]], ptr [[A_CASTED]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK1-NEXT: store i16 [[TMP5]], ptr [[B_CASTED]], align 2
+// CHECK1-NEXT: [[TMP6:%.*]] = load i64, ptr [[B_CASTED]], align 8
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined, i64 [[TMP4]], i64 [[TMP6]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CONV]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_Z3bari
+// CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 4
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[A]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[CALL:%.*]] = call noundef i32 @_ZN2S12r1Ei(ptr noundef nonnull align 4 dereferenceable(8) [[S]], i32 noundef [[TMP0]])
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CALL]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[CALL1:%.*]] = call noundef i32 @_ZL7fstatici(i32 noundef [[TMP2]])
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP3]], [[CALL1]]
+// CHECK3-NEXT: store i32 [[ADD2]], ptr [[A]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[CALL3:%.*]] = call noundef i32 @_Z9ftemplateIiET_i(i32 noundef [[TMP4]])
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[A]], align 4
+// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP5]], [[CALL3]]
+// CHECK3-NEXT: store i32 [[ADD4]], ptr [[A]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[A]], align 4
+// CHECK3-NEXT: ret i32 [[TMP6]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN2S12r1Ei
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[B:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[B_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS3:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_PTRS4:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS5:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[B]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP0]], [[TMP1]]
+// CHECK3-NEXT: store i32 [[SUB]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[B]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[B_CASTED]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_CASTED]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: store ptr [[THIS1]], ptr [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: store ptr [[A]], ptr [[TMP7]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK3-NEXT: store ptr null, ptr [[TMP8]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 [[TMP3]], ptr [[TMP9]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 [[TMP3]], ptr [[TMP10]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
+// CHECK3-NEXT: store ptr null, ptr [[TMP11]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK3-NEXT: store i32 [[TMP5]], ptr [[TMP12]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK3-NEXT: store i32 [[TMP5]], ptr [[TMP13]], align 4
+// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr null, ptr [[TMP14]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 3, ptr [[TMP18]], align 4
+// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 3, ptr [[TMP19]], align 4
+// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr [[TMP15]], ptr [[TMP20]], align 4
+// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK3-NEXT: store ptr [[TMP16]], ptr [[TMP21]], align 4
+// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK3-NEXT: store ptr @.offload_sizes, ptr [[TMP22]], align 4
+// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK3-NEXT: store ptr @.offload_maptypes, ptr [[TMP23]], align 4
+// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK3-NEXT: store ptr null, ptr [[TMP24]], align 4
+// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK3-NEXT: store ptr null, ptr [[TMP25]], align 4
+// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK3-NEXT: store i64 0, ptr [[TMP26]], align 8
+// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK3-NEXT: store i64 4, ptr [[TMP27]], align 8
+// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP28]], align 4
+// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP29]], align 4
+// CHECK3-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK3-NEXT: store i32 [[TMP17]], ptr [[TMP30]], align 4
+// CHECK3-NEXT: [[TMP31:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.region_id, ptr [[KERNEL_ARGS]])
+// CHECK3-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
+// CHECK3-NEXT: br i1 [[TMP32]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK3: omp_offload.failed:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88(ptr [[THIS1]], i32 [[TMP3]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]]
+// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK3: omp_offload.cont:
+// CHECK3-NEXT: [[A2:%.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
+// CHECK3-NEXT: store ptr [[THIS1]], ptr [[TMP33]], align 4
+// CHECK3-NEXT: [[TMP34:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
+// CHECK3-NEXT: store ptr [[A2]], ptr [[TMP34]], align 4
+// CHECK3-NEXT: [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS5]], i32 0, i32 0
+// CHECK3-NEXT: store ptr null, ptr [[TMP35]], align 4
+// CHECK3-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
+// CHECK3-NEXT: store i32 3, ptr [[TMP38]], align 4
+// CHECK3-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
+// CHECK3-NEXT: store i32 1, ptr [[TMP39]], align 4
+// CHECK3-NEXT: [[TMP40:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
+// CHECK3-NEXT: store ptr [[TMP36]], ptr [[TMP40]], align 4
+// CHECK3-NEXT: [[TMP41:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3
+// CHECK3-NEXT: store ptr [[TMP37]], ptr [[TMP41]], align 4
+// CHECK3-NEXT: [[TMP42:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4
+// CHECK3-NEXT: store ptr @.offload_sizes.1, ptr [[TMP42]], align 4
+// CHECK3-NEXT: [[TMP43:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5
+// CHECK3-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP43]], align 4
+// CHECK3-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6
+// CHECK3-NEXT: store ptr null, ptr [[TMP44]], align 4
+// CHECK3-NEXT: [[TMP45:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7
+// CHECK3-NEXT: store ptr null, ptr [[TMP45]], align 4
+// CHECK3-NEXT: [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8
+// CHECK3-NEXT: store i64 0, ptr [[TMP46]], align 8
+// CHECK3-NEXT: [[TMP47:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9
+// CHECK3-NEXT: store i64 0, ptr [[TMP47]], align 8
+// CHECK3-NEXT: [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10
+// CHECK3-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP48]], align 4
+// CHECK3-NEXT: [[TMP49:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP49]], align 4
+// CHECK3-NEXT: [[TMP50:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12
+// CHECK3-NEXT: store i32 1024, ptr [[TMP50]], align 4
+// CHECK3-NEXT: [[TMP51:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93.region_id, ptr [[KERNEL_ARGS6]])
+// CHECK3-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0
+// CHECK3-NEXT: br i1 [[TMP52]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]]
+// CHECK3: omp_offload.failed7:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93(ptr [[THIS1]]) #[[ATTR2]]
+// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT8]]
+// CHECK3: omp_offload.cont8:
+// CHECK3-NEXT: [[A9:%.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP53:%.*]] = load double, ptr [[A9]], align 4
+// CHECK3-NEXT: [[CONV:%.*]] = fptosi double [[TMP53]] to i32
+// CHECK3-NEXT: ret i32 [[CONV]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZL7fstatici
+// CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED2:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED8:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS9:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_PTRS10:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS11:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], 32
+// CHECK3-NEXT: store i32 [[MUL]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 [[TMP7]], ptr [[DOTCAPTURE_EXPR__CASTED2]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED2]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 [[TMP4]], ptr [[TMP9]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 [[TMP4]], ptr [[TMP10]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK3-NEXT: store ptr null, ptr [[TMP11]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 [[TMP6]], ptr [[TMP12]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 [[TMP6]], ptr [[TMP13]], align 4
+// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
+// CHECK3-NEXT: store ptr null, ptr [[TMP14]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK3-NEXT: store i32 [[TMP8]], ptr [[TMP15]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK3-NEXT: store i32 [[TMP8]], ptr [[TMP16]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr null, ptr [[TMP17]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP21]], ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK3-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP22]], 0
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT: [[SUB5:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK3-NEXT: store i32 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK3-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], 1
+// CHECK3-NEXT: [[TMP24:%.*]] = zext i32 [[ADD]] to i64
+// CHECK3-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP20]], 0
+// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 3, ptr [[TMP27]], align 4
+// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 3, ptr [[TMP28]], align 4
+// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr [[TMP18]], ptr [[TMP29]], align 4
+// CHECK3-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK3-NEXT: store ptr [[TMP19]], ptr [[TMP30]], align 4
+// CHECK3-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK3-NEXT: store ptr @.offload_sizes.3, ptr [[TMP31]], align 4
+// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK3-NEXT: store ptr @.offload_maptypes.4, ptr [[TMP32]], align 4
+// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK3-NEXT: store ptr null, ptr [[TMP33]], align 4
+// CHECK3-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK3-NEXT: store ptr null, ptr [[TMP34]], align 4
+// CHECK3-NEXT: [[TMP35:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK3-NEXT: store i64 [[TMP24]], ptr [[TMP35]], align 8
+// CHECK3-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK3-NEXT: store i64 8, ptr [[TMP36]], align 8
+// CHECK3-NEXT: [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK3-NEXT: store [3 x i32] [[TMP26]], ptr [[TMP37]], align 4
+// CHECK3-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP38]], align 4
+// CHECK3-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK3-NEXT: store i32 [[TMP25]], ptr [[TMP39]], align 4
+// CHECK3-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP20]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.region_id, ptr [[KERNEL_ARGS]])
+// CHECK3-NEXT: [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0
+// CHECK3-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK3: omp_offload.failed:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71(i32 [[TMP4]], i32 [[TMP6]], i32 [[TMP8]]) #[[ATTR2]]
+// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK3: omp_offload.cont:
+// CHECK3-NEXT: [[TMP42:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 32, [[TMP42]]
+// CHECK3-NEXT: store i32 [[ADD7]], ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK3-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK3-NEXT: store i32 [[TMP43]], ptr [[DOTCAPTURE_EXPR__CASTED8]], align 4
+// CHECK3-NEXT: [[TMP44:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED8]], align 4
+// CHECK3-NEXT: [[TMP45:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0
+// CHECK3-NEXT: store i32 [[TMP44]], ptr [[TMP45]], align 4
+// CHECK3-NEXT: [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0
+// CHECK3-NEXT: store i32 [[TMP44]], ptr [[TMP46]], align 4
+// CHECK3-NEXT: [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS11]], i32 0, i32 0
+// CHECK3-NEXT: store ptr null, ptr [[TMP47]], align 4
+// CHECK3-NEXT: [[TMP48:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP49:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP50:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[AGG_CAPTURED]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP51:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK3-NEXT: store i32 [[TMP51]], ptr [[TMP50]], align 4
+// CHECK3-NEXT: [[TMP52:%.*]] = call ptr @__kmpc_omp_target_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 36, i32 4, ptr @.omp_task_entry., i64 -1)
+// CHECK3-NEXT: [[TMP53:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP52]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP54:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP53]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP55:%.*]] = load ptr, ptr [[TMP54]], align 4
+// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP55]], ptr align 4 [[AGG_CAPTURED]], i32 4, i1 false)
+// CHECK3-NEXT: [[TMP56:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP52]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP57:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP56]], i32 0, i32 0
+// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP57]], ptr align 4 @.offload_sizes.5, i32 8, i1 false)
+// CHECK3-NEXT: [[TMP58:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP56]], i32 0, i32 1
+// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP58]], ptr align 4 [[TMP48]], i32 4, i1 false)
+// CHECK3-NEXT: [[TMP59:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP56]], i32 0, i32 2
+// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP59]], ptr align 4 [[TMP49]], i32 4, i1 false)
+// CHECK3-NEXT: [[TMP60:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP52]])
+// CHECK3-NEXT: [[TMP61:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP61]], 1
+// CHECK3-NEXT: ret i32 [[ADD12]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i
+// CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] comdat {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK3-NEXT: [[B:%.*]] = alloca i16, align 2
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i16, align 2
+// CHECK3-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[B_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT: [[KERNEL_ARGS1:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[A]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 3, ptr [[TMP0]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 0, ptr [[TMP1]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr null, ptr [[TMP2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK3-NEXT: store ptr null, ptr [[TMP3]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK3-NEXT: store ptr null, ptr [[TMP4]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK3-NEXT: store ptr null, ptr [[TMP5]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK3-NEXT: store ptr null, ptr [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK3-NEXT: store ptr null, ptr [[TMP7]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK3-NEXT: store i64 0, ptr [[TMP8]], align 8
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK3-NEXT: store i64 8, ptr [[TMP9]], align 8
+// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP10]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP11]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK3-NEXT: store i32 20, ptr [[TMP12]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.region_id, ptr [[KERNEL_ARGS]])
+// CHECK3-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+// CHECK3-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK3: omp_offload.failed:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55() #[[ATTR2]]
+// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK3: omp_offload.cont:
+// CHECK3-NEXT: store i16 1, ptr [[B]], align 2
+// CHECK3-NEXT: [[TMP15:%.*]] = load i16, ptr [[B]], align 2
+// CHECK3-NEXT: store i16 [[TMP15]], ptr [[DOTCAPTURE_EXPR_]], align 2
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[A]], align 4
+// CHECK3-NEXT: store i32 [[TMP16]], ptr [[A_CASTED]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = load i16, ptr [[B]], align 2
+// CHECK3-NEXT: store i16 [[TMP18]], ptr [[B_CASTED]], align 2
+// CHECK3-NEXT: [[TMP19:%.*]] = load i32, ptr [[B_CASTED]], align 4
+// CHECK3-NEXT: [[TMP20:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR_]], align 2
+// CHECK3-NEXT: store i16 [[TMP20]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 2
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 [[TMP17]], ptr [[TMP22]], align 4
+// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 [[TMP17]], ptr [[TMP23]], align 4
+// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK3-NEXT: store ptr null, ptr [[TMP24]], align 4
+// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 [[TMP19]], ptr [[TMP25]], align 4
+// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 [[TMP19]], ptr [[TMP26]], align 4
+// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
+// CHECK3-NEXT: store ptr null, ptr [[TMP27]], align 4
+// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK3-NEXT: store i32 [[TMP21]], ptr [[TMP28]], align 4
+// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK3-NEXT: store i32 [[TMP21]], ptr [[TMP29]], align 4
+// CHECK3-NEXT: [[TMP30:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr null, ptr [[TMP30]], align 4
+// CHECK3-NEXT: [[TMP31:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP33:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR_]], align 2
+// CHECK3-NEXT: [[TMP34:%.*]] = sext i16 [[TMP33]] to i32
+// CHECK3-NEXT: [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0
+// CHECK3-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0
+// CHECK3-NEXT: store i32 3, ptr [[TMP36]], align 4
+// CHECK3-NEXT: [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1
+// CHECK3-NEXT: store i32 3, ptr [[TMP37]], align 4
+// CHECK3-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2
+// CHECK3-NEXT: store ptr [[TMP31]], ptr [[TMP38]], align 4
+// CHECK3-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 3
+// CHECK3-NEXT: store ptr [[TMP32]], ptr [[TMP39]], align 4
+// CHECK3-NEXT: [[TMP40:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 4
+// CHECK3-NEXT: store ptr @.offload_sizes.7, ptr [[TMP40]], align 4
+// CHECK3-NEXT: [[TMP41:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 5
+// CHECK3-NEXT: store ptr @.offload_maptypes.8, ptr [[TMP41]], align 4
+// CHECK3-NEXT: [[TMP42:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 6
+// CHECK3-NEXT: store ptr null, ptr [[TMP42]], align 4
+// CHECK3-NEXT: [[TMP43:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 7
+// CHECK3-NEXT: store ptr null, ptr [[TMP43]], align 4
+// CHECK3-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 8
+// CHECK3-NEXT: store i64 0, ptr [[TMP44]], align 8
+// CHECK3-NEXT: [[TMP45:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 9
+// CHECK3-NEXT: store i64 8, ptr [[TMP45]], align 8
+// CHECK3-NEXT: [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 10
+// CHECK3-NEXT: store [3 x i32] [[TMP35]], ptr [[TMP46]], align 4
+// CHECK3-NEXT: [[TMP47:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 11
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP47]], align 4
+// CHECK3-NEXT: [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 12
+// CHECK3-NEXT: store i32 1024, ptr [[TMP48]], align 4
+// CHECK3-NEXT: [[TMP49:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP34]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.region_id, ptr [[KERNEL_ARGS1]])
+// CHECK3-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0
+// CHECK3-NEXT: br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED2:%.*]], label [[OMP_OFFLOAD_CONT3:%.*]]
+// CHECK3: omp_offload.failed2:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60(i32 [[TMP17]], i32 [[TMP19]], i32 [[TMP21]]) #[[ATTR2]]
+// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT3]]
+// CHECK3: omp_offload.cont3:
+// CHECK3-NEXT: [[TMP51:%.*]] = load i32, ptr [[A]], align 4
+// CHECK3-NEXT: ret i32 [[TMP51]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88
+// CHECK3-SAME: (ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[B_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[B_CASTED]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[B_CASTED]], align 4
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined, ptr [[TMP0]], i32 [[TMP2]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK3-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to double
+// CHECK3-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
+// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK3-NEXT: store double [[ADD]], ptr [[A]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93
+// CHECK3-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK3-NEXT: store double 2.500000e+00, ptr [[A]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71
+// CHECK3-SAME: (i32 noundef [[N:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]], i32 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK3-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB1]], i32 [[TMP0]], i32 [[TMP1]], i32 0)
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[N_CASTED]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined, i32 [[TMP3]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[N:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK3-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK3: omp.precond.then:
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP5]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
+// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP18:![0-9]+]]
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]]
+// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[N_ADDR]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT: store i32 [[TMP15]], ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined, i32 [[TMP13]], i32 [[TMP14]], i32 [[TMP16]]), !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP20]])
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
+// CHECK3-NEXT: br i1 [[TMP22]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK3: .omp.final.then:
+// CHECK3-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[SUB6:%.*]] = sub nsw i32 [[TMP23]], 0
+// CHECK3-NEXT: [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV7]], 1
+// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD8]], ptr [[I3]], align 4
+// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK3: .omp.final.done:
+// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.end:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[N:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK3-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK3: omp.precond.then:
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP7]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22:![0-9]+]]
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK3-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
+// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK3: omp.body.continue:
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP16]], 1
+// CHECK3-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]]
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP18]])
+// CHECK3-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
+// CHECK3-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK3: .omp.final.then:
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[SUB7:%.*]] = sub nsw i32 [[TMP21]], 0
+// CHECK3-NEXT: [[DIV8:%.*]] = sdiv i32 [[SUB7]], 1
+// CHECK3-NEXT: [[MUL9:%.*]] = mul nsw i32 [[DIV8]], 1
+// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
+// CHECK3-NEXT: store i32 [[ADD10]], ptr [[I3]], align 4
+// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK3: .omp.final.done:
+// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.end:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75
+// CHECK3-SAME: (i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined)
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@.omp_task_privates_map.
+// CHECK3-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]], ptr noalias noundef [[TMP2:%.*]], ptr noalias noundef [[TMP3:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4
+// CHECK3-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 4
+// CHECK3-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
+// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTADDR3]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR3]], align 4
+// CHECK3-NEXT: store ptr [[TMP5]], ptr [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP4]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr [[DOTADDR1]], align 4
+// CHECK3-NEXT: store ptr [[TMP7]], ptr [[TMP8]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP4]], i32 0, i32 2
+// CHECK3-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
+// CHECK3-NEXT: store ptr [[TMP9]], ptr [[TMP10]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@.omp_task_entry.
+// CHECK3-SAME: (i32 noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTFIRSTPRIV_PTR_ADDR2_I:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[KERNEL_ARGS_I:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED_I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4
+// CHECK3-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTADDR]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP3]], i32 0, i32 1
+// CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META25:![0-9]+]])
+// CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META28:![0-9]+]])
+// CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META30:![0-9]+]])
+// CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META32:![0-9]+]])
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META34:![0-9]+]]
+// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT: call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]]) #[[ATTR2]]
+// CHECK3-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP9]], align 4
+// CHECK3-NEXT: store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
+// CHECK3-NEXT: store i32 1, ptr [[TMP16]], align 4, !noalias [[META34]]
+// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
+// CHECK3-NEXT: store ptr [[TMP12]], ptr [[TMP17]], align 4, !noalias [[META34]]
+// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3
+// CHECK3-NEXT: store ptr [[TMP13]], ptr [[TMP18]], align 4, !noalias [[META34]]
+// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4
+// CHECK3-NEXT: store ptr [[TMP14]], ptr [[TMP19]], align 4, !noalias [[META34]]
+// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5
+// CHECK3-NEXT: store ptr @.offload_maptypes.6, ptr [[TMP20]], align 4, !noalias [[META34]]
+// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6
+// CHECK3-NEXT: store ptr null, ptr [[TMP21]], align 4, !noalias [[META34]]
+// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7
+// CHECK3-NEXT: store ptr null, ptr [[TMP22]], align 4, !noalias [[META34]]
+// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8
+// CHECK3-NEXT: store i64 0, ptr [[TMP23]], align 8, !noalias [[META34]]
+// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9
+// CHECK3-NEXT: store i64 9, ptr [[TMP24]], align 8, !noalias [[META34]]
+// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP25]], align 4, !noalias [[META34]]
+// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4, !noalias [[META34]]
+// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12
+// CHECK3-NEXT: store i32 [[TMP15]], ptr [[TMP27]], align 4, !noalias [[META34]]
+// CHECK3-NEXT: [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.region_id, ptr [[KERNEL_ARGS_I]])
+// CHECK3-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
+// CHECK3-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]]
+// CHECK3: omp_offload.failed.i:
+// CHECK3-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP9]], align 4
+// CHECK3-NEXT: store i32 [[TMP30]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META34]]
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75(i32 [[TMP31]]) #[[ATTR2]]
+// CHECK3-NEXT: br label [[DOTOMP_OUTLINED__EXIT]]
+// CHECK3: .omp_outlined..exit:
+// CHECK3-NEXT: ret i32 0
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55
+// CHECK3-SAME: () #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined)
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60
+// CHECK3-SAME: (i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[B_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK3-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR__ADDR]], align 2
+// CHECK3-NEXT: [[TMP2:%.*]] = sext i16 [[TMP1]] to i32
+// CHECK3-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB1]], i32 [[TMP0]], i32 [[TMP2]], i32 0)
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP3]], ptr [[A_CASTED]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK3-NEXT: store i16 [[TMP5]], ptr [[B_CASTED]], align 2
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[B_CASTED]], align 4
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined, i32 [[TMP4]], i32 [[TMP6]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CONV]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[A_ADDR]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71
+// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[N:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]])
+// CHECK9-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK9-NEXT: store i64 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 8
+// CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK9-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP1]], i32 0)
+// CHECK9-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK9-NEXT: store i32 [[TMP2]], ptr [[N_CASTED]], align 4
+// CHECK9-NEXT: [[TMP3:%.*]] = load i64, ptr [[N_CASTED]], align 8
+// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined, i64 [[TMP3]])
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK9-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
+// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK9-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK9-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK9-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
+// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK9: omp.precond.then:
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP5]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK9-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
+// CHECK9-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK9: cond.true:
+// CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT: br label [[COND_END:%.*]]
+// CHECK9: cond.false:
+// CHECK9-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: br label [[COND_END]]
+// CHECK9: cond.end:
+// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ]
+// CHECK9-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK9-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK9-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9: omp.inner.for.cond:
+// CHECK9-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16:![0-9]+]]
+// CHECK9-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK9-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]]
+// CHECK9-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9: omp.inner.for.body:
+// CHECK9-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK9-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i64
+// CHECK9-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK9-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64
+// CHECK9-NEXT: [[TMP17:%.*]] = load i32, ptr [[N_ADDR]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK9-NEXT: store i32 [[TMP17]], ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK9-NEXT: [[TMP18:%.*]] = load i64, ptr [[N_CASTED]], align 8, !llvm.access.group [[ACC_GRP16]]
+// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined, i64 [[TMP14]], i64 [[TMP16]], i64 [[TMP18]]), !llvm.access.group [[ACC_GRP16]]
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9: omp.inner.for.inc:
+// CHECK9-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK9-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK9-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]]
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+// CHECK9: omp.inner.for.end:
+// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9: omp.loop.exit:
+// CHECK9-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP22]])
+// CHECK9-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
+// CHECK9-NEXT: br i1 [[TMP24]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK9: .omp.final.then:
+// CHECK9-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[SUB6:%.*]] = sub nsw i32 [[TMP25]], 0
+// CHECK9-NEXT: [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
+// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV7]], 1
+// CHECK9-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK9-NEXT: store i32 [[ADD8]], ptr [[I3]], align 4
+// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK9: .omp.final.done:
+// CHECK9-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK9: omp.precond.end:
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK9-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
+// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK9-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK9-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK9-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
+// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK9: omp.precond.then:
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: [[TMP4:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK9-NEXT: [[CONV:%.*]] = trunc i64 [[TMP4]] to i32
+// CHECK9-NEXT: [[TMP5:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK9-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK9-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK9-NEXT: store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP7]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+// CHECK9-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK9: cond.true:
+// CHECK9-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK9-NEXT: br label [[COND_END:%.*]]
+// CHECK9: cond.false:
+// CHECK9-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: br label [[COND_END]]
+// CHECK9: cond.end:
+// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
+// CHECK9-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK9-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK9-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK9: omp.inner.for.cond:
+// CHECK9-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP20:![0-9]+]]
+// CHECK9-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP20]]
+// CHECK9-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
+// CHECK9-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK9: omp.inner.for.body:
+// CHECK9-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP20]]
+// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK9-NEXT: store i32 [[ADD]], ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP20]]
+// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK9: omp.body.continue:
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK9: omp.inner.for.inc:
+// CHECK9-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP20]]
+// CHECK9-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP16]], 1
+// CHECK9-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP20]]
+// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP21:![0-9]+]]
+// CHECK9: omp.inner.for.end:
+// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK9: omp.loop.exit:
+// CHECK9-NEXT: [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
+// CHECK9-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP18]])
+// CHECK9-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK9-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
+// CHECK9-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK9: .omp.final.then:
+// CHECK9-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK9-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0
+// CHECK9-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1
+// CHECK9-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1
+// CHECK9-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]]
+// CHECK9-NEXT: store i32 [[ADD11]], ptr [[I4]], align 4
+// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK9: .omp.final.done:
+// CHECK9-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK9: omp.precond.end:
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75
+// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined)
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88
+// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[B_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[B]], ptr [[B_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK9-NEXT: store i32 [[TMP1]], ptr [[B_CASTED]], align 4
+// CHECK9-NEXT: [[TMP2:%.*]] = load i64, ptr [[B_CASTED]], align 8
+// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined, ptr [[TMP0]], i64 [[TMP2]])
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[B]], ptr [[B_ADDR]], align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK9-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to double
+// CHECK9-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
+// CHECK9-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK9-NEXT: store double [[ADD]], ptr [[A]], align 8
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93
+// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK9-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK9-NEXT: store double 2.500000e+00, ptr [[A]], align 8
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55
+// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined)
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60
+// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[A:%.*]], i64 noundef [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[B_CASTED:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
+// CHECK9-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[B]], ptr [[B_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK9-NEXT: [[TMP1:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR__ADDR]], align 2
+// CHECK9-NEXT: [[TMP2:%.*]] = sext i16 [[TMP1]] to i32
+// CHECK9-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP2]], i32 0)
+// CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK9-NEXT: store i32 [[TMP3]], ptr [[A_CASTED]], align 4
+// CHECK9-NEXT: [[TMP4:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK9-NEXT: [[TMP5:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK9-NEXT: store i16 [[TMP5]], ptr [[B_CASTED]], align 2
+// CHECK9-NEXT: [[TMP6:%.*]] = load i64, ptr [[B_CASTED]], align 8
+// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined, i64 [[TMP4]], i64 [[TMP6]])
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined
+// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK9-NEXT: entry:
+// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK9-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK9-NEXT: store i64 [[B]], ptr [[B_ADDR]], align 8
+// CHECK9-NEXT: [[TMP0:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK9-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+// CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CONV]]
+// CHECK9-NEXT: store i32 [[ADD]], ptr [[A_ADDR]], align 4
+// CHECK9-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71
+// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i32 noundef [[N:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]], i32 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]])
+// CHECK11-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK11-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP1]], i32 0)
+// CHECK11-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[TMP2]], ptr [[N_CASTED]], align 4
+// CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined, i32 [[TMP3]])
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined
+// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
+// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK11-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
+// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK11: omp.precond.then:
+// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP5]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK11-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
+// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK11: cond.true:
+// CHECK11-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT: br label [[COND_END:%.*]]
+// CHECK11: cond.false:
+// CHECK11-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: br label [[COND_END]]
+// CHECK11: cond.end:
+// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ]
+// CHECK11-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK11-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK11-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK11: omp.inner.for.cond:
+// CHECK11-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17:![0-9]+]]
+// CHECK11-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK11-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]]
+// CHECK11-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK11: omp.inner.for.body:
+// CHECK11-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK11-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK11-NEXT: [[TMP15:%.*]] = load i32, ptr [[N_ADDR]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK11-NEXT: store i32 [[TMP15]], ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK11-NEXT: [[TMP16:%.*]] = load i32, ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined, i32 [[TMP13]], i32 [[TMP14]], i32 [[TMP16]]), !llvm.access.group [[ACC_GRP17]]
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK11: omp.inner.for.inc:
+// CHECK11-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK11-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK11-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]]
+// CHECK11: omp.inner.for.end:
+// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK11: omp.loop.exit:
+// CHECK11-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP20]])
+// CHECK11-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
+// CHECK11-NEXT: br i1 [[TMP22]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK11: .omp.final.then:
+// CHECK11-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[SUB6:%.*]] = sub nsw i32 [[TMP23]], 0
+// CHECK11-NEXT: [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
+// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV7]], 1
+// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK11-NEXT: store i32 [[ADD8]], ptr [[I3]], align 4
+// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK11: .omp.final.done:
+// CHECK11-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK11: omp.precond.end:
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined
+// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
+// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK11-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
+// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK11: omp.precond.then:
+// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_LB]], align 4
+// CHECK11-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP7]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK11-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK11: cond.true:
+// CHECK11-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK11-NEXT: br label [[COND_END:%.*]]
+// CHECK11: cond.false:
+// CHECK11-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: br label [[COND_END]]
+// CHECK11: cond.end:
+// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
+// CHECK11-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK11-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK11-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK11: omp.inner.for.cond:
+// CHECK11-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21:![0-9]+]]
+// CHECK11-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP21]]
+// CHECK11-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
+// CHECK11-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK11: omp.inner.for.body:
+// CHECK11-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
+// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
+// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK11-NEXT: store i32 [[ADD]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP21]]
+// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK11: omp.body.continue:
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK11: omp.inner.for.inc:
+// CHECK11-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
+// CHECK11-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP16]], 1
+// CHECK11-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
+// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
+// CHECK11: omp.inner.for.end:
+// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK11: omp.loop.exit:
+// CHECK11-NEXT: [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
+// CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP18]])
+// CHECK11-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK11-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
+// CHECK11-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK11: .omp.final.then:
+// CHECK11-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK11-NEXT: [[SUB7:%.*]] = sub nsw i32 [[TMP21]], 0
+// CHECK11-NEXT: [[DIV8:%.*]] = sdiv i32 [[SUB7]], 1
+// CHECK11-NEXT: [[MUL9:%.*]] = mul nsw i32 [[DIV8]], 1
+// CHECK11-NEXT: [[ADD10:%.*]] = add nsw i32 0, [[MUL9]]
+// CHECK11-NEXT: store i32 [[ADD10]], ptr [[I3]], align 4
+// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK11: .omp.final.done:
+// CHECK11-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK11: omp.precond.end:
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75
+// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined)
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined
+// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88
+// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[B_CASTED:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
+// CHECK11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[TMP1]], ptr [[B_CASTED]], align 4
+// CHECK11-NEXT: [[TMP2:%.*]] = load i32, ptr [[B_CASTED]], align 4
+// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined, ptr [[TMP0]], i32 [[TMP2]])
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined
+// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK11-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to double
+// CHECK11-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
+// CHECK11-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK11-NEXT: store double [[ADD]], ptr [[A]], align 4
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93
+// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
+// CHECK11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK11-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK11-NEXT: store double 2.500000e+00, ptr [[A]], align 4
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55
+// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
+// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined)
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined
+// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60
+// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[B_CASTED:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
+// CHECK11-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK11-NEXT: [[TMP1:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR__ADDR]], align 2
+// CHECK11-NEXT: [[TMP2:%.*]] = sext i16 [[TMP1]] to i32
+// CHECK11-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP2]], i32 0)
+// CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[TMP3]], ptr [[A_CASTED]], align 4
+// CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// CHECK11-NEXT: [[TMP5:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK11-NEXT: store i16 [[TMP5]], ptr [[B_CASTED]], align 2
+// CHECK11-NEXT: [[TMP6:%.*]] = load i32, ptr [[B_CASTED]], align 4
+// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined, i32 [[TMP4]], i32 [[TMP6]])
+// CHECK11-NEXT: ret void
+//
+//
+// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined
+// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK11-NEXT: entry:
+// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK11-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK11-NEXT: [[TMP0:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK11-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CONV]]
+// CHECK11-NEXT: store i32 [[ADD]], ptr [[A_ADDR]], align 4
+// CHECK11-NEXT: ret void
+//
>From b76e32c4b08fcd3285cb74271c7dae9495207263 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Sat, 1 Nov 2025 20:51:11 -0700
Subject: [PATCH 20/38] Fix review comments
---
clang/lib/Sema/SemaOpenMP.cpp | 10 +++++++---
clang/lib/Serialization/ASTReader.cpp | 5 ++---
2 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index e8855103b061f..15d9f711a46e8 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -17697,7 +17697,8 @@ OMPClause *SemaOpenMP::ActOnOpenMPSingleExprWithArgClause(
static_cast<OpenMPScheduleClauseKind>(Argument[ScheduleKind]), Expr,
StartLoc, LParenLoc, ArgumentLoc[Modifier1], ArgumentLoc[Modifier2],
ArgumentLoc[ScheduleKind], DelimLoc, EndLoc);
- } break;
+ break;
+ }
case OMPC_if:
assert(Argument.size() == 1 && ArgumentLoc.size() == 1);
Res = ActOnOpenMPIfClause(static_cast<OpenMPDirectiveKind>(Argument.back()),
@@ -17756,14 +17757,17 @@ OMPClause *SemaOpenMP::ActOnOpenMPSingleExprWithArgClause(
case OMPC_dyn_groupprivate: {
enum { Modifier1, Modifier2, NumberOfElements };
assert(Argument.size() == NumberOfElements &&
- ArgumentLoc.size() == NumberOfElements);
+ ArgumentLoc.size() == NumberOfElements &&
+ "Modifiers for dyn_groupprivate clause and their locations are "
+ "expected.");
Res = ActOnOpenMPDynGroupprivateClause(
static_cast<OpenMPDynGroupprivateClauseModifier>(Argument[Modifier1]),
static_cast<OpenMPDynGroupprivateClauseFallbackModifier>(
Argument[Modifier2]),
Expr, StartLoc, LParenLoc, ArgumentLoc[Modifier1],
ArgumentLoc[Modifier2], EndLoc);
- } break;
+ break;
+ }
case OMPC_num_threads:
assert(Argument.size() == 1 && ArgumentLoc.size() == 1 &&
"Modifier for num_threads clause and its location are expected.");
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 46f10998ca76b..ac72cbb8ae459 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -12729,10 +12729,9 @@ void OMPClauseReader::VisitOMPDynGroupprivateClause(
OMPDynGroupprivateClause *C) {
VisitOMPClauseWithPreInit(C);
C->setDynGroupprivateModifier(
- static_cast<OpenMPDynGroupprivateClauseModifier>(Record.readInt()));
+ Record.readEnum<OpenMPDynGroupprivateClauseModifier>());
C->setDynGroupprivateFallbackModifier(
- static_cast<OpenMPDynGroupprivateClauseFallbackModifier>(
- Record.readInt()));
+ Record.readEnum<OpenMPDynGroupprivateClauseFallbackModifier>());
C->setSize(Record.readSubExpr());
C->setLParenLoc(Record.readSourceLocation());
C->setDynGroupprivateModifierLoc(Record.readSourceLocation());
>From a3cd7ef4260b992ea5026a372c7484e2b28a34ba Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Tue, 4 Nov 2025 12:20:02 -0800
Subject: [PATCH 21/38] Add cgroup mem parameters in createTarget
---
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 15 ++++++++++----
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 20 +++++++++++--------
2 files changed, 23 insertions(+), 12 deletions(-)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index aa370606c6539..daa0c53157dec 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2446,7 +2446,7 @@ class OpenMPIRBuilder {
/// The number of threads.
ArrayRef<Value *> NumThreads;
/// The size of the dynamic shared memory.
- Value *DynCGGroupMem = nullptr;
+ Value *DynCGroupMem = nullptr;
/// True if the kernel has 'no wait' clause.
bool HasNoWait = false;
/// The fallback mechanism for the shared memory.
@@ -2457,12 +2457,12 @@ class OpenMPIRBuilder {
TargetKernelArgs() {}
TargetKernelArgs(unsigned NumTargetItems, TargetDataRTArgs RTArgs,
Value *NumIterations, ArrayRef<Value *> NumTeams,
- ArrayRef<Value *> NumThreads, Value *DynCGGroupMem,
+ ArrayRef<Value *> NumThreads, Value *DynCGroupMem,
bool HasNoWait,
omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback)
: NumTargetItems(NumTargetItems), RTArgs(RTArgs),
NumIterations(NumIterations), NumTeams(NumTeams),
- NumThreads(NumThreads), DynCGGroupMem(DynCGGroupMem),
+ NumThreads(NumThreads), DynCGroupMem(DynCGroupMem),
HasNoWait(HasNoWait), DynCGroupMemFallback(DynCGroupMemFallback) {}
};
@@ -3248,6 +3248,10 @@ class OpenMPIRBuilder {
/// dependency information as passed in the depend clause
/// \param HasNowait Whether the target construct has a `nowait` clause or
/// not.
+ /// \param DynCGroupMem The size of the dynamic groupprivate memory for each
+ /// cgroup.
+ /// \param DynCGroupMemFallback The fallback mechanism to execute if the
+ /// requested cgroup memory cannot be provided.
LLVM_ABI InsertPointOrErrorTy createTarget(
const LocationDescription &Loc, bool IsOffloadEntry,
OpenMPIRBuilder::InsertPointTy AllocaIP,
@@ -3259,7 +3263,10 @@ class OpenMPIRBuilder {
TargetBodyGenCallbackTy BodyGenCB,
TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
CustomMapperCallbackTy CustomMapperCB,
- const SmallVector<DependData> &Dependencies, bool HasNowait = false);
+ const SmallVector<DependData> &Dependencies, bool HasNowait = false,
+ Value *DynCGroupMem = nullptr,
+ omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback =
+ omp::OMPDynGroupprivateFallbackType::Abort);
/// Returns __kmpc_for_static_init_* runtime function for the specified
/// size \a IVSize and sign \a IVSigned. Will create a distribute call
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 9f80e3eb61bfc..5db9dbf8097e0 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -565,7 +565,7 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
Flags,
NumTeams3D,
NumThreads3D,
- KernelArgs.DynCGGroupMem};
+ KernelArgs.DynCGroupMem};
}
void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
@@ -8229,7 +8229,8 @@ static void emitTargetCall(
OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB,
const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
- bool HasNoWait) {
+ bool HasNoWait, Value *DynCGroupMem,
+ OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
// Generate a function call to the host fallback implementation of the target
// region. This is called by the host when no offload entry was generated for
// the target region and when the offloading call fails at runtime.
@@ -8365,12 +8366,13 @@ static void emitTargetCall(
/*isSigned=*/false)
: Builder.getInt64(0);
- // TODO: Use correct DynCGGroupMem
- Value *DynCGGroupMem = Builder.getInt32(0);
+ // Request zero groupprivate bytes by default.
+ if (!DynCGroupMem)
+ DynCGroupMem = Builder.getInt32(0);
KArgs = OpenMPIRBuilder::TargetKernelArgs(
- NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC,
- DynCGGroupMem, HasNoWait, OMPDynGroupprivateFallbackType::Abort);
+ NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, DynCGroupMem,
+ HasNoWait, DynCGroupMemFallback);
// Assume no error was returned because TaskBodyCB and
// EmitTargetCallFallbackCB don't produce any.
@@ -8419,7 +8421,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
CustomMapperCallbackTy CustomMapperCB,
- const SmallVector<DependData> &Dependencies, bool HasNowait) {
+ const SmallVector<DependData> &Dependencies, bool HasNowait,
+ Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
if (!updateToLocation(Loc))
return InsertPointTy();
@@ -8442,7 +8445,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
if (!Config.isTargetDevice())
emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs,
IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB,
- CustomMapperCB, Dependencies, HasNowait);
+ CustomMapperCB, Dependencies, HasNowait, DynCGroupMem,
+ DynCGroupMemFallback);
return Builder.saveIP();
}
>From 42eaac1e9536fa2bb05b7db014788aa54439cebf Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Fri, 7 Nov 2025 17:02:59 -0800
Subject: [PATCH 22/38] Fix format
---
offload/include/omptarget.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 6665637426ca1..f0079e7e76bd1 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -104,7 +104,7 @@ enum TargetAllocTy : int32_t {
};
inline KernelArgsTy CTorDTorKernelArgs = {
- 1, 0, nullptr, nullptr, nullptr, nullptr, nullptr,
+ 1, 0, nullptr, nullptr, nullptr, nullptr, nullptr,
nullptr, 0, {0, 0, 0, 0}, {1, 0, 0}, {1, 0, 0}, 0};
struct DeviceTy;
>From 1c4fd2e4f29bb0de34a0dd2b8f3df19a7b4c4b1e Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Wed, 12 Nov 2025 17:40:18 -0800
Subject: [PATCH 23/38] Fix review comments
---
offload/include/omptarget.h | 4 ++--
offload/liboffload/API/Device.td | 1 +
offload/liboffload/src/OffloadImpl.cpp | 9 +++++++++
offload/libomptarget/OpenMP/API.cpp | 3 +++
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 3 ---
offload/plugins-nextgen/common/include/PluginInterface.h | 9 +++------
offload/plugins-nextgen/cuda/src/rtl.cpp | 3 ---
offload/tools/deviceinfo/llvm-offload-device-info.cpp | 2 ++
offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp | 4 ++++
.../unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp | 2 ++
openmp/runtime/src/include/omp.h.var | 1 +
openmp/runtime/src/kmp_stub.cpp | 2 ++
12 files changed, 29 insertions(+), 14 deletions(-)
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 48263a5a3f8dd..adecebe442d4c 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -281,8 +281,8 @@ int omp_get_num_devices(void);
int omp_get_device_num(void);
int omp_get_initial_device(void);
size_t
-omp_get_groupprivate_limit(int device_num,
- omp_access_t access_group = omp_access_cgroup);
+omp_get_groupprivate_limit(int DeviceNum,
+ omp_access_t AccessGroup = omp_access_cgroup);
void *omp_target_alloc(size_t Size, int DeviceNum);
void omp_target_free(void *DevicePtr, int DeviceNum);
int omp_target_is_present(const void *Ptr, int DeviceNum);
diff --git a/offload/liboffload/API/Device.td b/offload/liboffload/API/Device.td
index e9c154818c4a1..a918cff6de26e 100644
--- a/offload/liboffload/API/Device.td
+++ b/offload/liboffload/API/Device.td
@@ -43,6 +43,7 @@ def ol_device_info_t : Enum {
TaggedEtor<"ADDRESS_BITS", "uint32_t", "Number of bits used to represent an address in device memory">,
TaggedEtor<"MAX_MEM_ALLOC_SIZE", "uint64_t", "The maximum size of memory object allocation in bytes">,
TaggedEtor<"GLOBAL_MEM_SIZE", "uint64_t", "The size of global device memory in bytes">,
+ TaggedEtor<"WORK_GROUP_SHARED_MEM_SIZE", "uint64_t", "The maximum size of shared memory per work group in bytes">,
];
list<TaggedEtor> fp_configs = !foreach(type, ["Single", "Double", "Half"], TaggedEtor<type # "_FP_CONFIG", "ol_device_fp_capability_flags_t", type # " precision floating point capability">);
list<TaggedEtor> native_vec_widths = !foreach(type, ["char","short","int","long","float","double","half"], TaggedEtor<"NATIVE_VECTOR_WIDTH_" # type, "uint32_t", "Native vector width for " # type>);
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 84bc414396811..844ba18e3080c 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -495,6 +495,14 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
return Info.write(static_cast<uint32_t>(Value));
}
+ case OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE: {
+ // Uint64 values
+ if (!std::holds_alternative<uint64_t>(Entry->Value))
+ return makeError(ErrorCode::BACKEND_FAILURE,
+ "plugin returned incorrect type");
+ return Info.write(std::get<uint64_t>(Entry->Value));
+ }
+
case OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION:
case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION: {
// {x, y, z} triples
@@ -590,6 +598,7 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device,
return Info.write<uint32_t>(std::numeric_limits<uintptr_t>::digits);
case OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE:
case OL_DEVICE_INFO_GLOBAL_MEM_SIZE:
+ case OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE:
return Info.write<uint64_t>(0);
default:
return createOffloadError(ErrorCode::INVALID_ENUMERATION,
diff --git a/offload/libomptarget/OpenMP/API.cpp b/offload/libomptarget/OpenMP/API.cpp
index b03dd61424ccd..db2b45fb0b1b3 100644
--- a/offload/libomptarget/OpenMP/API.cpp
+++ b/offload/libomptarget/OpenMP/API.cpp
@@ -83,6 +83,9 @@ EXTERN size_t omp_get_groupprivate_limit(int DeviceNum,
if (DeviceNum == omp_get_initial_device())
return 0;
+ if (AccessGroup != omp_access_cgroup)
+ return 0;
+
auto DeviceOrErr = PM->getDevice(DeviceNum);
if (!DeviceOrErr)
FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 2575eb46c255b..ecc7fa8a306e6 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2198,9 +2198,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
}
}
- // Supports block shared memory natively.
- HasNativeBlockSharedMem = true;
-
return Plugin::success();
}
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index bddf0bdfafa9b..9f2b3b3f2ba63 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -808,8 +808,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// Get the total shared memory per block that can be used in any kernel.
uint32_t getMaxBlockSharedMemSize() const { return MaxBlockSharedMemSize; }
- /// Indicate whether the device has native block shared memory.
- bool hasNativeBlockSharedMem() const { return HasNativeBlockSharedMem; }
+ /// Indicate whether the device supports block shared memory natively.
+ bool hasNativeBlockSharedMem() const { return MaxBlockSharedMemSize > 0; }
/// Set the context of the device if needed, before calling device-specific
/// functions. Plugins may implement this function as a no-op if not needed.
@@ -1269,11 +1269,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
std::atomic<bool> OmptInitialized;
#endif
- /// The total per-block shared memory that a kernel may use.
+ /// The total per-block native shared memory that a kernel may use.
uint32_t MaxBlockSharedMemSize = 0;
-
- /// Whether the device has native block shared memory.
- bool HasNativeBlockSharedMem = false;
};
/// Class implementing common functionalities of offload plugins. Each plugin
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index e8c9986576f3c..af1f3ca1c6b5d 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -393,9 +393,6 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Err;
MaxBlockSharedMemSize = MaxSharedMem;
- // Supports block shared memory natively.
- HasNativeBlockSharedMem = true;
-
return Plugin::success();
}
diff --git a/offload/tools/deviceinfo/llvm-offload-device-info.cpp b/offload/tools/deviceinfo/llvm-offload-device-info.cpp
index 42ffb97d6d77c..190c2b64f9979 100644
--- a/offload/tools/deviceinfo/llvm-offload-device-info.cpp
+++ b/offload/tools/deviceinfo/llvm-offload-device-info.cpp
@@ -205,6 +205,8 @@ ol_result_t printDevice(std::ostream &S, ol_device_handle_t D) {
S, D, OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, "Max Mem Allocation Size", "B"));
OFFLOAD_ERR(printDeviceValue<uint64_t>(S, D, OL_DEVICE_INFO_GLOBAL_MEM_SIZE,
"Global Mem Size", "B"));
+ OFFLOAD_ERR(printDeviceValue<uint64_t>(S, D, OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE,
+ "Work Group Shared Mem Size", "B"));
OFFLOAD_ERR(
(printDeviceValue<ol_device_fp_capability_flags_t, PrintKind::FP_FLAGS>(
S, D, OL_DEVICE_INFO_SINGLE_FP_CONFIG,
diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp
index 30eafee026316..23b5c356055e5 100644
--- a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp
+++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp
@@ -217,6 +217,10 @@ OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(GlobalMemSize, uint64_t,
OL_DEVICE_INFO_GLOBAL_MEM_SIZE, 0);
OL_DEVICE_INFO_TEST_HOST_SUCCESS(GlobalMemSize, uint64_t,
OL_DEVICE_INFO_GLOBAL_MEM_SIZE);
+OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(SharedMemSize, uint64_t,
+ OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE, 0);
+OL_DEVICE_INFO_TEST_HOST_SUCCESS(SharedMemSize, uint64_t,
+ OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE);
TEST_P(olGetDeviceInfoTest, InvalidNullHandleDevice) {
ol_device_type_t DeviceType;
diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp
index 79a18c1d133dc..11d20004e91fb 100644
--- a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp
+++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp
@@ -71,6 +71,8 @@ OL_DEVICE_INFO_SIZE_TEST_EQ(MaxMemAllocSize, uint64_t,
OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE);
OL_DEVICE_INFO_SIZE_TEST_EQ(GlobalMemSize, uint64_t,
OL_DEVICE_INFO_GLOBAL_MEM_SIZE);
+OL_DEVICE_INFO_SIZE_TEST_EQ(SharedMemSize, uint64_t,
+ OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE);
TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkGroupSizePerDimension) {
size_t Size = 0;
diff --git a/openmp/runtime/src/include/omp.h.var b/openmp/runtime/src/include/omp.h.var
index 26c3df56a9ce3..70103a7f7e2cb 100644
--- a/openmp/runtime/src/include/omp.h.var
+++ b/openmp/runtime/src/include/omp.h.var
@@ -382,6 +382,7 @@
typedef enum {
omp_access_cgroup = 0,
+ omp_access_pteam = 1
} omp_access_t;
# if defined(_WIN32)
diff --git a/openmp/runtime/src/kmp_stub.cpp b/openmp/runtime/src/kmp_stub.cpp
index a099f887b6ba4..c930c8305ab3b 100644
--- a/openmp/runtime/src/kmp_stub.cpp
+++ b/openmp/runtime/src/kmp_stub.cpp
@@ -457,6 +457,8 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
void *omp_get_dyn_groupprivate_ptr(size_t offset, int *is_fallback,
omp_access_t access_group) {
i;
+ if (is_fallback)
+ *is_fallback = 0; // store through the out-parameter, not into the local pointer
return NULL;
}
>From 45d116d72b07dbf080b381744aefd8e224ebebe5 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Wed, 12 Nov 2025 21:43:59 -0800
Subject: [PATCH 24/38] Improve kernel launch
---
.../common/include/PluginInterface.h | 14 +++
.../common/src/PluginInterface.cpp | 89 +++++++++++--------
2 files changed, 64 insertions(+), 39 deletions(-)
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 9f2b3b3f2ba63..3275849c8ae4d 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -439,6 +439,20 @@ struct GenericKernelTy {
uint32_t NumBlocks[3]) const;
private:
+ /// Information about the dynamic block memory needed for launching a kernel.
+ struct DynBlockMemInfoTy {
+ /// The size of the dynamic block memory buffer.
+ uint32_t Size = 0;
+ /// The size of dynamic shared memory natively provided by the device.
+ uint32_t NativeSize = 0;
+ /// The fallback that was triggered (if any).
+ DynCGroupMemFallbackType DynBlockMemFb = DynCGroupMemFallbackType::None;
+ /// The fallback pointer if global memory was used as alternative.
+ void *FallbackPtr = nullptr;
+ };
+
+ Expected<DynBlockMemInfoTy> prepareBlockMemory(GenericDeviceTy &GenericDevice, KernelArgsTy &KernelArgs);
+
/// Prepare the arguments before launching the kernel.
KernelLaunchParamsTy
prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 9c4aaf32de916..7478c2fa66dd9 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -518,66 +518,77 @@ Error GenericKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
return Plugin::success();
}
-Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
- ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
- AsyncInfoWrapperTy &AsyncInfoWrapper) const {
- llvm::SmallVector<void *, 16> Args;
- llvm::SmallVector<void *, 16> Ptrs;
-
- uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0],
- KernelArgs.ThreadLimit[1],
- KernelArgs.ThreadLimit[2]};
- uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1],
- KernelArgs.NumTeams[2]};
- if (!isBareMode()) {
- NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
- NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
- NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
- }
-
- uint32_t MaxBlockMemSize = GenericDevice.getMaxBlockSharedMemSize();
- uint32_t DynBlockMemSize = KernelArgs.DynCGroupMem;
- uint32_t TotalBlockMemSize = StaticBlockMemSize + DynBlockMemSize;
- if (StaticBlockMemSize > MaxBlockMemSize)
+Expected<DynBlockMemInfoTy> prepareBlockMemory(GenericDeviceTy &GenericDevice, KernelArgsTy &KernelArgs) {
+ uint32_t MaxSize = GenericDevice.getMaxBlockSharedMemSize();
+ uint32_t DynSize = KernelArgs.DynCGroupMem;
+ uint32_t TotalSize = StaticSize + DynSize;
+ uint32_t DynNativeSize = DynSize;
+ void *DynFallbackPtr = nullptr;
+
+ // Not enough block memory to cover the static one. Cannot run the kernel.
+ if (StaticSize > MaxSize)
return Plugin::error(ErrorCode::INVALID_ARGUMENT,
"Static block memory size exceeds maximum");
+ // Not enough block memory to cover dynamic one, and the fallback is aborting.
else if (static_cast<DynCGroupMemFallbackType>(
KernelArgs.Flags.DynCGroupMemFallback) ==
DynCGroupMemFallbackType::Abort &&
- TotalBlockMemSize > MaxBlockMemSize)
+ TotalSize > MaxSize)
return Plugin::error(
ErrorCode::INVALID_ARGUMENT,
"Static and dynamic block memory size exceeds maximum");
- void *DynBlockMemFbPtr = nullptr;
- uint32_t DynBlockMemLaunchSize = DynBlockMemSize;
-
- DynCGroupMemFallbackType DynBlockMemFb = DynCGroupMemFallbackType::None;
- if (DynBlockMemSize && (!GenericDevice.hasNativeBlockSharedMem() ||
- TotalBlockMemSize > MaxBlockMemSize)) {
+ DynCGroupMemFallbackType DynFallback = DynCGroupMemFallbackType::None;
+ if (DynSize && (!GenericDevice.hasNativeBlockSharedMem() ||
+ TotalSize > MaxSize)) {
// Launch without native dynamic block memory.
- DynBlockMemLaunchSize = 0;
- DynBlockMemFb = static_cast<DynCGroupMemFallbackType>(
+ DynNativeSize = 0;
+ DynFallback = static_cast<DynCGroupMemFallbackType>(
KernelArgs.Flags.DynCGroupMemFallback);
- if (DynBlockMemFb == DynCGroupMemFallbackType::DefaultMem) {
+ if (DynFallback == DynCGroupMemFallbackType::DefaultMem) {
// Get global memory as fallback.
auto AllocOrErr = GenericDevice.dataAlloc(
- NumBlocks[0] * DynBlockMemSize,
+ NumBlocks[0] * DynSize,
/*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE);
if (!AllocOrErr)
return AllocOrErr.takeError();
-
- DynBlockMemFbPtr = *AllocOrErr;
- AsyncInfoWrapper.freeAllocationAfterSynchronization(DynBlockMemFbPtr);
+ DynFallbackPtr = *AllocOrErr;
} else {
// Do not provide any memory as fallback.
- DynBlockMemSize = 0;
+ DynSize = 0;
}
}
+ return { DynSize, DynNativeSize, DynFallback, DynFallbackPtr };
+}
+
+Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
+ ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+ llvm::SmallVector<void *, 16> Args;
+ llvm::SmallVector<void *, 16> Ptrs;
+
+ uint32_t NumThreads[3] = {KernelArgs.ThreadLimit[0],
+ KernelArgs.ThreadLimit[1],
+ KernelArgs.ThreadLimit[2]};
+ uint32_t NumBlocks[3] = {KernelArgs.NumTeams[0], KernelArgs.NumTeams[1],
+ KernelArgs.NumTeams[2]};
+ if (!isBareMode()) {
+ NumThreads[0] = getNumThreads(GenericDevice, NumThreads);
+ NumBlocks[0] = getNumBlocks(GenericDevice, NumBlocks, KernelArgs.Tripcount,
+ NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
+ }
+
+ auto DynBlockMemInfoOrErr = prepareBlockMemory(GenericDevice, KernelArgs);
+ if (!DynBlockMemInfoOrErr)
+ return DynBlockMemInfoOrErr.takeError();
+
+ DynBlockMemInfoTy &DynBlockMemInfo = *DynBlockMemInfoOrErr;
+ if (DynBlockMemInfo.FallbackPtr)
+ AsyncInfoWrapper.freeAllocationAfterSynchronization(DynBlockMemInfo.FallbackPtr);
auto KernelLaunchEnvOrErr = getKernelLaunchEnvironment(
- GenericDevice, KernelArgs, DynBlockMemSize, DynBlockMemFb,
- DynBlockMemFbPtr, AsyncInfoWrapper);
+ GenericDevice, KernelArgs, DynBlockMemInfo.Size, DynBlockMemInfo.Fallback,
+ DynBlockMemInfo.FallbackPtr, AsyncInfoWrapper);
if (!KernelLaunchEnvOrErr)
return KernelLaunchEnvOrErr.takeError();
@@ -608,7 +619,7 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks))
return Err;
- return launchImpl(GenericDevice, NumThreads, NumBlocks, DynBlockMemLaunchSize,
+ return launchImpl(GenericDevice, NumThreads, NumBlocks, DynBlockMemInfo.NativeSize,
KernelArgs, LaunchParams, AsyncInfoWrapper);
}
>From 3573177fb05d295bace897cd6b69209aa01e35b7 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Thu, 13 Nov 2025 09:31:48 -0800
Subject: [PATCH 25/38] Fixes
---
offload/libomptarget/device.cpp | 13 +++++++++---
.../common/include/PluginInterface.h | 10 +++------
.../common/src/PluginInterface.cpp | 21 +++++++++----------
3 files changed, 23 insertions(+), 21 deletions(-)
diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp
index 3c45d728a078c..40b166e9780e1 100644
--- a/offload/libomptarget/device.cpp
+++ b/offload/libomptarget/device.cpp
@@ -373,7 +373,14 @@ bool DeviceTy::isAccessiblePtr(const void *Ptr, size_t Size) {
}
uint64_t DeviceTy::getMaxSharedTeamMemory() {
- using DeviceQueryKind = llvm::omp::target::plugin::DeviceQueryKind;
- return RTL->query_device_info(
- RTLDeviceID, DeviceQueryKind::DEVICE_QUERY_MAX_SHARED_TEAM_MEM);
+ InfoTreeNode Info = RTL->query_device_info(RTLDeviceID);
+
+ auto EntryOpt = Info.get(DeviceInfo::WORK_GROUP_SHARED_MEM_SIZE);
+ if (!EntryOpt)
+ return 0;
+
+ auto Entry = *EntryOpt;
+ if (!std::holds_alternative<uint64_t>(Entry->Value))
+ return 0;
+ return std::get<uint64_t>(Entry->Value);
}
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 3275849c8ae4d..3a29466e7daf1 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -299,10 +299,6 @@ struct InfoTreeNode {
}
};
-enum class DeviceQueryKind {
- DEVICE_QUERY_MAX_SHARED_TEAM_MEM = 0,
-};
-
/// Class wrapping a __tgt_device_image and its offload entry table on a
/// specific device. This class is responsible for storing and managing
/// the offload entries for an image on a device.
@@ -1515,12 +1511,12 @@ struct GenericPluginTy {
/// Query the current state of an asynchronous queue.
int32_t query_async(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
+ /// Obtain information about the given device.
+ InfoTreeNode obtain_device_info(int32_t DeviceId);
+
/// Prints information about the given devices supported by the plugin.
void print_device_info(int32_t DeviceId);
- /// Retrieve information about the given device.
- int64_t query_device_info(int32_t DeviceId, DeviceQueryKind Query);
-
/// Creates an event in the given plugin if supported.
int32_t create_event(int32_t DeviceId, void **EventPtr);
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 7478c2fa66dd9..c72a370f00fa4 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -2037,23 +2037,22 @@ int32_t GenericPluginTy::query_async(int32_t DeviceId,
return OFFLOAD_SUCCESS;
}
+InfoTreeNode GenericPluginTy::obtain_device_info(int32_t DeviceId) {
+ auto InfoOrErr = getDevice(DeviceId).obtainInfo();
+ if (auto Err = InfoOrErr.takeError()) {
+ REPORT("Failure to obtain device %d info: %s\n", DeviceId,
+ toString(std::move(Err)).data());
+ return InfoTreeNode{};
+ }
+ return *InfoOrErr;
+}
+
void GenericPluginTy::print_device_info(int32_t DeviceId) {
if (auto Err = getDevice(DeviceId).printInfo())
REPORT("Failure to print device %d info: %s\n", DeviceId,
toString(std::move(Err)).data());
}
-int64_t GenericPluginTy::query_device_info(int32_t DeviceId,
- DeviceQueryKind Query) {
- const GenericDeviceTy &Device = getDevice(DeviceId);
-
- switch (Query) {
- case DeviceQueryKind::DEVICE_QUERY_MAX_SHARED_TEAM_MEM:
- return Device.getMaxBlockSharedMemSize();
- }
- return 0;
-}
-
int32_t GenericPluginTy::create_event(int32_t DeviceId, void **EventPtr) {
auto Err = getDevice(DeviceId).createEvent(EventPtr);
if (Err) {
>From ec777650b5c8f5e15692b53a1aff41dacd5a6692 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Thu, 13 Nov 2025 12:20:40 -0800
Subject: [PATCH 26/38] Add fixes and improvements
---
offload/libomptarget/device.cpp | 4 +-
.../amdgpu/dynamic_hsa/hsa_ext_amd.h | 1 -
.../common/include/PluginInterface.h | 39 ++++++-----
.../common/src/PluginInterface.cpp | 64 ++++++++++---------
.../deviceinfo/llvm-offload-device-info.cpp | 2 -
5 files changed, 59 insertions(+), 51 deletions(-)
diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp
index 40b166e9780e1..3ae0fba489bc7 100644
--- a/offload/libomptarget/device.cpp
+++ b/offload/libomptarget/device.cpp
@@ -373,9 +373,9 @@ bool DeviceTy::isAccessiblePtr(const void *Ptr, size_t Size) {
}
uint64_t DeviceTy::getMaxSharedTeamMemory() {
- InfoTreeNode Info = RTL->query_device_info(RTLDeviceID);
+ InfoTreeNode Info = RTL->obtain_device_info(RTLDeviceID);
- auto EntryOpt = Info.get(DeviceInfo::WORK_GROUP_SHARED_MEM_SIZE);
+ auto EntryOpt = Info.get(DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE);
if (!EntryOpt)
return 0;
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
index c53538c64ccb2..ddfa65c76cf2d 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -52,7 +52,6 @@ typedef enum {
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6,
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7,
HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15,
- HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE = 16,
} hsa_amd_memory_pool_info_t;
typedef enum {
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index e2f9957973293..5e1835672385e 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -299,6 +299,18 @@ struct InfoTreeNode {
}
};
+/// Configuration of dynamic block memory needed for launching a kernel.
+struct DynBlockMemConfTy {
+ /// The size of the dynamic block memory buffer.
+ uint32_t Size = 0;
+ /// The size of dynamic shared memory natively provided by the device.
+ uint32_t NativeSize = 0;
+ /// The fallback that was triggered (if any).
+ DynCGroupMemFallbackType Fallback = DynCGroupMemFallbackType::None;
+ /// The fallback pointer if global memory was used as alternative.
+ void *FallbackPtr = nullptr;
+};
+
/// Class wrapping a __tgt_device_image and its offload entry table on a
/// specific device. This class is responsible for storing and managing
/// the offload entries for an image on a device.
@@ -386,10 +398,11 @@ struct GenericKernelTy {
}
/// Return a device pointer to a new kernel launch environment.
- Expected<KernelLaunchEnvironmentTy *> getKernelLaunchEnvironment(
- GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs,
- uint32_t BlockMemSize, DynCGroupMemFallbackType DynBlockMemFb,
- void *DynBlockMemFbPtr, AsyncInfoWrapperTy &AsyncInfoWrapper) const;
+ Expected<KernelLaunchEnvironmentTy *>
+ getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice,
+ const KernelArgsTy &KernelArgs,
+ const DynBlockMemConfTy &DynBlockMemConf,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) const;
/// Indicate whether an execution mode is valid.
static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
@@ -435,19 +448,11 @@ struct GenericKernelTy {
uint32_t NumBlocks[3]) const;
private:
- /// Information about the dynamic block memory needed for launching a kernel.
- struct DynBlockMemInfoTy {
- /// The size of the dynamic block memory buffer.
- uint32_t Size = 0;
- /// The size of dynamic shared memory natively provided by the device.
- uint32_t NativeSize = 0;
- /// The fallback that was triggered (if any).
- DynCGroupMemFallbackType DynBlockMemFb = DynCGroupMemFallbackType::None;
- /// The fallback pointer if global memory was used as alternative.
- void *FallbackPtr = nullptr;
- };
-
- Expected<DynBlockMemInfoTy> prepareBlockMemory(GenericDeviceTy &GenericDevice, KernelArgsTy &KernelArgs);
+ /// Prepare the block memory buffer requested for the kernel and execute the
+ /// specified fallback if necessary.
+ Expected<DynBlockMemConfTy> prepareBlockMemory(GenericDeviceTy &GenericDevice,
+ KernelArgsTy &KernelArgs,
+ uint32_t NumBlocks) const;
/// Prepare the arguments before launching the kernel.
KernelLaunchParamsTy
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index c72a370f00fa4..97a9d362e1abe 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -437,8 +437,8 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
Expected<KernelLaunchEnvironmentTy *>
GenericKernelTy::getKernelLaunchEnvironment(
GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs,
- uint32_t BlockMemSize, DynCGroupMemFallbackType DynBlockMemFb,
- void *DynBlockMemFbPtr, AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+ const DynBlockMemConfTy &DynBlockMemConf,
+ AsyncInfoWrapperTy &AsyncInfoWrapper) const {
// Ctor/Dtor have no arguments, replaying uses the original kernel launch
// environment. Older versions of the compiler do not generate a kernel
// launch environment.
@@ -480,9 +480,9 @@ GenericKernelTy::getKernelLaunchEnvironment(
LocalKLE.ReductionBuffer = nullptr;
}
- LocalKLE.DynCGroupMemSize = BlockMemSize;
- LocalKLE.DynCGroupMemFbPtr = DynBlockMemFbPtr;
- LocalKLE.DynCGroupMemFb = DynBlockMemFb;
+ LocalKLE.DynCGroupMemSize = DynBlockMemConf.Size;
+ LocalKLE.DynCGroupMemFbPtr = DynBlockMemConf.FallbackPtr;
+ LocalKLE.DynCGroupMemFb = DynBlockMemConf.Fallback;
INFO(OMP_INFOTYPE_DATA_TRANSFER, GenericDevice.getDeviceId(),
"Copying data from host to device, HstPtr=" DPxMOD ", TgtPtr=" DPxMOD
@@ -518,47 +518,51 @@ Error GenericKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
return Plugin::success();
}
-Expected<DynBlockMemInfoTy> prepareBlockMemory(GenericDeviceTy &GenericDevice, KernelArgsTy &KernelArgs) {
- uint32_t MaxSize = GenericDevice.getMaxBlockSharedMemSize();
- uint32_t DynSize = KernelArgs.DynCGroupMem;
- uint32_t TotalSize = StaticSize + DynSize;
- uint32_t DynNativeSize = DynSize;
+Expected<DynBlockMemConfTy>
+GenericKernelTy::prepareBlockMemory(GenericDeviceTy &GenericDevice,
+ KernelArgsTy &KernelArgs,
+ uint32_t NumBlocks) const {
+ uint32_t MaxBlockMemSize = GenericDevice.getMaxBlockSharedMemSize();
+ uint32_t DynBlockMemSize = KernelArgs.DynCGroupMem;
+ uint32_t TotalBlockMemSize = StaticBlockMemSize + DynBlockMemSize;
+ uint32_t DynNativeBlockMemSize = DynBlockMemSize;
void *DynFallbackPtr = nullptr;
// No enough block memory to cover the static one. Cannot run the kernel.
- if (StaticSize > MaxSize)
+ if (StaticBlockMemSize > MaxBlockMemSize)
return Plugin::error(ErrorCode::INVALID_ARGUMENT,
"Static block memory size exceeds maximum");
// No enough block memory to cover dynamic one, and the fallback is aborting.
else if (static_cast<DynCGroupMemFallbackType>(
KernelArgs.Flags.DynCGroupMemFallback) ==
DynCGroupMemFallbackType::Abort &&
- TotalSize > MaxSize)
+ TotalBlockMemSize > MaxBlockMemSize)
return Plugin::error(
ErrorCode::INVALID_ARGUMENT,
"Static and dynamic block memory size exceeds maximum");
DynCGroupMemFallbackType DynFallback = DynCGroupMemFallbackType::None;
- if (DynSize && (!GenericDevice.hasNativeBlockSharedMem() ||
- TotalSize > MaxSize)) {
+ if (DynBlockMemSize && (!GenericDevice.hasNativeBlockSharedMem() ||
+ TotalBlockMemSize > MaxBlockMemSize)) {
// Launch without native dynamic block memory.
- DynNativeSize = 0;
+ DynNativeBlockMemSize = 0;
DynFallback = static_cast<DynCGroupMemFallbackType>(
KernelArgs.Flags.DynCGroupMemFallback);
if (DynFallback == DynCGroupMemFallbackType::DefaultMem) {
// Get global memory as fallback.
auto AllocOrErr = GenericDevice.dataAlloc(
- NumBlocks[0] * DynSize,
+ NumBlocks * DynBlockMemSize,
/*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE);
if (!AllocOrErr)
return AllocOrErr.takeError();
DynFallbackPtr = *AllocOrErr;
} else {
// Do not provide any memory as fallback.
- DynSize = 0;
+ DynBlockMemSize = 0;
}
}
- return { DynSize, DynNativeSize, DynFallback, DynFallbackPtr };
+ return DynBlockMemConfTy{DynBlockMemSize, DynNativeBlockMemSize, DynFallback,
+ DynFallbackPtr};
}
Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
@@ -578,17 +582,18 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
NumThreads[0], KernelArgs.ThreadLimit[0] > 0);
}
- auto DynBlockMemInfoOrErr = prepareBlockMemory(GenericDevice, KernelArgs);
- if (!DynBlockMemInfoOrErr)
- return DynBlockMemInfoOrErr.takeError();
+ auto DynBlockMemConfOrErr =
+ prepareBlockMemory(GenericDevice, KernelArgs, NumBlocks[0]);
+ if (!DynBlockMemConfOrErr)
+ return DynBlockMemConfOrErr.takeError();
- DynBlockMemInfoTy &DynBlockMemInfo = *DynBlockMemInfoOrErr;
- if (DynBlockMemInfo.FallbackPtr)
- AsyncInfoWrapper.freeAllocationAfterSynchronization(DynBlockMemInfo.FallbackPtr);
+ DynBlockMemConfTy &DynBlockMemConf = *DynBlockMemConfOrErr;
+ if (DynBlockMemConf.FallbackPtr)
+ AsyncInfoWrapper.freeAllocationAfterSynchronization(
+ DynBlockMemConf.FallbackPtr);
auto KernelLaunchEnvOrErr = getKernelLaunchEnvironment(
- GenericDevice, KernelArgs, DynBlockMemInfo.Size, DynBlockMemInfo.Fallback,
- DynBlockMemInfo.FallbackPtr, AsyncInfoWrapper);
+ GenericDevice, KernelArgs, DynBlockMemConf, AsyncInfoWrapper);
if (!KernelLaunchEnvOrErr)
return KernelLaunchEnvOrErr.takeError();
@@ -619,8 +624,9 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks))
return Err;
- return launchImpl(GenericDevice, NumThreads, NumBlocks, DynBlockMemInfo.NativeSize,
- KernelArgs, LaunchParams, AsyncInfoWrapper);
+ return launchImpl(GenericDevice, NumThreads, NumBlocks,
+ DynBlockMemConf.NativeSize, KernelArgs, LaunchParams,
+ AsyncInfoWrapper);
}
KernelLaunchParamsTy GenericKernelTy::prepareArgs(
@@ -2044,7 +2050,7 @@ InfoTreeNode GenericPluginTy::obtain_device_info(int32_t DeviceId) {
toString(std::move(Err)).data());
return InfoTreeNode{};
}
- return *InfoOrErr;
+ return std::move(*InfoOrErr);
}
void GenericPluginTy::print_device_info(int32_t DeviceId) {
diff --git a/offload/tools/deviceinfo/llvm-offload-device-info.cpp b/offload/tools/deviceinfo/llvm-offload-device-info.cpp
index 2a9f68fa45eec..74af3bfb13303 100644
--- a/offload/tools/deviceinfo/llvm-offload-device-info.cpp
+++ b/offload/tools/deviceinfo/llvm-offload-device-info.cpp
@@ -205,8 +205,6 @@ ol_result_t printDevice(std::ostream &S, ol_device_handle_t D) {
S, D, OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, "Max Mem Allocation Size", "B"));
OFFLOAD_ERR(printDeviceValue<uint64_t>(S, D, OL_DEVICE_INFO_GLOBAL_MEM_SIZE,
"Global Mem Size", "B"));
- OFFLOAD_ERR(printDeviceValue<uint64_t>(S, D, OL_DEVICE_INFO_WORK_GROUP_SHARED_MEM_SIZE,
- "Work Group Shared Mem Size", "B"));
OFFLOAD_ERR(
printDeviceValue<uint64_t>(S, D, OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE,
"Work Group Shared Mem Size", "B"));
>From b38a2db603f6a4be87064eb11a61bd95e4713536 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Thu, 13 Nov 2025 16:43:18 -0800
Subject: [PATCH 27/38] Add more fixes
---
offload/include/device.h | 18 ++++++++++++++++--
offload/include/omptarget.h | 5 ++---
offload/libomptarget/OpenMP/API.cpp | 2 +-
offload/libomptarget/device.cpp | 13 -------------
4 files changed, 19 insertions(+), 19 deletions(-)
diff --git a/offload/include/device.h b/offload/include/device.h
index f767d352a774b..ca4ab75e77dd5 100644
--- a/offload/include/device.h
+++ b/offload/include/device.h
@@ -37,6 +37,8 @@
#include "PluginInterface.h"
using GenericPluginTy = llvm::omp::target::plugin::GenericPluginTy;
+using DeviceInfo = llvm::omp::target::plugin::DeviceInfo;
+using InfoTreeNode = llvm::omp::target::plugin::InfoTreeNode;
// Forward declarations.
struct __tgt_bin_desc;
@@ -167,8 +169,20 @@ struct DeviceTy {
/// Indicate that there are pending images for this device or not.
void setHasPendingImages(bool V) { HasPendingImages = V; }
- /// Get the maximum shared memory per team for any kernel.
- uint64_t getMaxSharedTeamMemory();
+ /// Get information from the device.
+ template <typename T>
+ T getInfo(DeviceInfo Info) const {
+ InfoTreeNode DevInfo = RTL->obtain_device_info(RTLDeviceID);
+
+ auto EntryOpt = DevInfo.get(Info);
+ if (!EntryOpt)
+ return 0;
+
+ auto Entry = *EntryOpt;
+ if (!std::holds_alternative<T>(Entry->Value))
+ return T{};
+ return std::get<T>(Entry->Value);
+ }
private:
/// Deinitialize the device (and plugin).
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index adecebe442d4c..18bfecc5498cc 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -280,9 +280,8 @@ void ompx_dump_mapping_tables(void);
int omp_get_num_devices(void);
int omp_get_device_num(void);
int omp_get_initial_device(void);
-size_t
-omp_get_groupprivate_limit(int DeviceNum,
- omp_access_t AccessGroup = omp_access_cgroup);
+size_t omp_get_groupprivate_limit(int DeviceNum,
+ omp_access_t AccessGroup = omp_access_cgroup);
void *omp_target_alloc(size_t Size, int DeviceNum);
void omp_target_free(void *DevicePtr, int DeviceNum);
int omp_target_is_present(const void *Ptr, int DeviceNum);
diff --git a/offload/libomptarget/OpenMP/API.cpp b/offload/libomptarget/OpenMP/API.cpp
index db2b45fb0b1b3..790158d80490b 100644
--- a/offload/libomptarget/OpenMP/API.cpp
+++ b/offload/libomptarget/OpenMP/API.cpp
@@ -90,7 +90,7 @@ EXTERN size_t omp_get_groupprivate_limit(int DeviceNum,
if (!DeviceOrErr)
FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
- return DeviceOrErr->getMaxSharedTeamMemory();
+ return DeviceOrErr->getInfo<uint64_t>(DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE);
}
EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp
index 3ae0fba489bc7..ee36fbed935a5 100644
--- a/offload/libomptarget/device.cpp
+++ b/offload/libomptarget/device.cpp
@@ -371,16 +371,3 @@ bool DeviceTy::useAutoZeroCopy() {
bool DeviceTy::isAccessiblePtr(const void *Ptr, size_t Size) {
return RTL->is_accessible_ptr(RTLDeviceID, Ptr, Size);
}
-
-uint64_t DeviceTy::getMaxSharedTeamMemory() {
- InfoTreeNode Info = RTL->obtain_device_info(RTLDeviceID);
-
- auto EntryOpt = Info.get(DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE);
- if (!EntryOpt)
- return 0;
-
- auto Entry = *EntryOpt;
- if (!std::holds_alternative<uint64_t>(Entry->Value))
- return 0;
- return std::get<uint64_t>(Entry->Value);
-}
>From 9d6a9b9bda1b82cd04800f19bb41ea3df39546f6 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Thu, 13 Nov 2025 17:13:52 -0800
Subject: [PATCH 28/38] Fix format
---
offload/include/device.h | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/offload/include/device.h b/offload/include/device.h
index ca4ab75e77dd5..06d21397c7377 100644
--- a/offload/include/device.h
+++ b/offload/include/device.h
@@ -170,8 +170,7 @@ struct DeviceTy {
void setHasPendingImages(bool V) { HasPendingImages = V; }
/// Get information from the device.
- template <typename T>
- T getInfo(DeviceInfo Info) const {
+ template <typename T> T getInfo(DeviceInfo Info) const {
InfoTreeNode DevInfo = RTL->obtain_device_info(RTLDeviceID);
auto EntryOpt = DevInfo.get(Info);
>From 249055b481fdeced9d57fa0136d179856e1e82c1 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Fri, 14 Nov 2025 09:55:20 -0800
Subject: [PATCH 29/38] Fix comment
---
offload/include/omptarget.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 18bfecc5498cc..6ad9de3f88715 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -268,7 +268,7 @@ extern "C" {
#endif
/// The OpenMP access group type. The criterion for grupping tasks using a
-/// specific grouping property.
+/// specific groupping property.
enum omp_access_t {
/// Groups the tasks based on the contention group to which they belong.
omp_access_cgroup = 0,
>From b66c1188dae15956ba96f3b8eed01b21f4e9f338 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Fri, 14 Nov 2025 12:15:57 -0800
Subject: [PATCH 30/38] Fix typo
---
offload/include/omptarget.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 6ad9de3f88715..0d8220bddb7a2 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -267,8 +267,8 @@ struct __tgt_target_non_contig {
extern "C" {
#endif
-/// The OpenMP access group type. The criterion for grupping tasks using a
-/// specific groupping property.
+/// The OpenMP access group type. The criterion for grouping tasks using a
+/// specific grouping property.
enum omp_access_t {
/// Groups the tasks based on the contention group to which they belong.
omp_access_cgroup = 0,
>From 31c405daf71912c6355e2a94954ba18edeaff664 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Wed, 4 Mar 2026 16:11:59 -0800
Subject: [PATCH 31/38] Fix usage of REPORT
---
offload/plugins-nextgen/common/src/PluginInterface.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index b459b294ab355..3ff47b934965a 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -2018,8 +2018,8 @@ int32_t GenericPluginTy::query_async(int32_t DeviceId,
InfoTreeNode GenericPluginTy::obtain_device_info(int32_t DeviceId) {
auto InfoOrErr = getDevice(DeviceId).obtainInfo();
if (auto Err = InfoOrErr.takeError()) {
- REPORT("Failure to obtain device %d info: %s\n", DeviceId,
- toString(std::move(Err)).data());
+ REPORT() << "Failure to obtain device " << DeviceId
+ << " info: " << toString(std::move(Err));
return InfoTreeNode{};
}
return std::move(*InfoOrErr);
>From 07677d82026b8b3dec98936baf3318bf4aaf19c6 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Wed, 4 Mar 2026 16:15:45 -0800
Subject: [PATCH 32/38] Adapt to latest OpenMP syntax
---
offload/include/omptarget.h | 4 +-
offload/libomptarget/OpenMP/API.cpp | 3 +-
offload/libomptarget/exports | 2 +-
offload/test/offloading/dyn_groupprivate.cpp | 103 +++++++++++++------
openmp/device/src/State.cpp | 48 ++++++---
openmp/runtime/src/include/omp.h.var | 14 +--
openmp/runtime/src/kmp_csupport.cpp | 11 +-
openmp/runtime/src/kmp_stub.cpp | 14 +--
8 files changed, 129 insertions(+), 70 deletions(-)
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index b680e2122b982..40c16a4a7580f 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -289,8 +289,8 @@ int omp_get_device_num(void);
int omp_get_device_from_uid(const char *DeviceUid);
const char *omp_get_uid_from_device(int DeviceNum);
int omp_get_initial_device(void);
-size_t omp_get_groupprivate_limit(int DeviceNum,
- omp_access_t AccessGroup = omp_access_cgroup);
+size_t omp_get_gprivate_limit(int DeviceNum,
+ omp_access_t AccessGroup = omp_access_cgroup);
void *omp_target_alloc(size_t Size, int DeviceNum);
void omp_target_free(void *DevicePtr, int DeviceNum);
int omp_target_is_present(const void *Ptr, int DeviceNum);
diff --git a/offload/libomptarget/OpenMP/API.cpp b/offload/libomptarget/OpenMP/API.cpp
index b13a0a7ce06eb..6dcd94e48e987 100644
--- a/offload/libomptarget/OpenMP/API.cpp
+++ b/offload/libomptarget/OpenMP/API.cpp
@@ -138,8 +138,7 @@ EXTERN int omp_get_initial_device(void) {
return HostDevice;
}
-EXTERN size_t omp_get_groupprivate_limit(int DeviceNum,
- omp_access_t AccessGroup) {
+EXTERN size_t omp_get_gprivate_limit(int DeviceNum, omp_access_t AccessGroup) {
TIMESCOPE();
OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
if (DeviceNum == omp_get_initial_device())
diff --git a/offload/libomptarget/exports b/offload/libomptarget/exports
index bb194134ac624..1831c43cc5f29 100644
--- a/offload/libomptarget/exports
+++ b/offload/libomptarget/exports
@@ -43,7 +43,7 @@ VERS1.0 {
omp_get_device_from_uid;
omp_get_uid_from_device;
omp_get_initial_device;
- omp_get_groupprivate_limit;
+ omp_get_gprivate_limit;
omp_target_alloc;
omp_target_free;
omp_target_is_accessible;
diff --git a/offload/test/offloading/dyn_groupprivate.cpp b/offload/test/offloading/dyn_groupprivate.cpp
index f36c35e611eb8..a8a0bbe3f37de 100644
--- a/offload/test/offloading/dyn_groupprivate.cpp
+++ b/offload/test/offloading/dyn_groupprivate.cpp
@@ -20,7 +20,7 @@ int main() {
int Buffer[N];
#pragma omp parallel
{
- int *DynBuffer = (int *)omp_get_dyn_groupprivate_ptr();
+ int *DynBuffer = (int *)omp_get_dyn_gprivate_nofb_ptr();
int TId = omp_get_thread_num();
if (TId == 0)
NumThreads = omp_get_num_threads();
@@ -47,25 +47,31 @@ int main() {
}
// Verify that the routines in the host returns NULL and zero.
- if (omp_get_dyn_groupprivate_ptr())
+ if (omp_get_dyn_gprivate_ptr())
++Failed;
- if (omp_get_dyn_groupprivate_size())
+ if (omp_get_dyn_gprivate_nofb_ptr())
+ ++Failed;
+ if (omp_get_dyn_gprivate_size())
++Failed;
- size_t MaxSize = omp_get_groupprivate_limit(0, omp_access_cgroup);
+ size_t MaxSize = omp_get_gprivate_limit(0, omp_access_cgroup);
size_t ExceededSize = MaxSize + 10;
// Verify that the fallback(default_mem) modifier works.
#pragma omp target dyn_groupprivate(fallback(default_mem) : ExceededSize) \
map(tofrom : Failed)
{
- int IsFallback;
- if (!omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ if (!omp_get_dyn_gprivate_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_nofb_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_ptr(0) == omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
- if (!omp_get_dyn_groupprivate_size())
+ if (!omp_get_dyn_gprivate_size())
++Failed;
- if (omp_get_dyn_groupprivate_size() != ExceededSize)
+ if (omp_get_dyn_gprivate_size() != ExceededSize)
++Failed;
+ bool IsFallback = true; // FIX
if (!IsFallback)
++Failed;
}
@@ -74,11 +80,15 @@ int main() {
#pragma omp target dyn_groupprivate(fallback(null) : ExceededSize) \
map(tofrom : Failed)
{
- int IsFallback;
- if (omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ if (omp_get_dyn_gprivate_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_nofb_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
- if (omp_get_dyn_groupprivate_size())
+ if (omp_get_dyn_gprivate_size())
++Failed;
+ bool IsFallback = true; // FIX
if (!IsFallback)
++Failed;
}
@@ -86,13 +96,17 @@ int main() {
// Verify that the default modifier is fallback(default_mem).
#pragma omp target dyn_groupprivate(ExceededSize)
{
- int IsFallback;
- if (!omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ if (!omp_get_dyn_gprivate_ptr(0))
++Failed;
- if (!omp_get_dyn_groupprivate_size())
+ if (omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
- if (omp_get_dyn_groupprivate_size() != ExceededSize)
+ if (omp_get_dyn_gprivate_ptr(0) == omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
+ if (!omp_get_dyn_gprivate_size())
+ ++Failed;
+ if (omp_get_dyn_gprivate_size() != ExceededSize)
+ ++Failed;
+ bool IsFallback = true; // FIX
if (!IsFallback)
++Failed;
}
@@ -100,13 +114,19 @@ int main() {
// Verify that the fallback(abort) modifier works.
#pragma omp target dyn_groupprivate(fallback(abort) : N) map(tofrom : Failed)
{
- int IsFallback;
- if (!omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ if (!omp_get_dyn_gprivate_ptr(0))
+ ++Failed;
+ if (!omp_get_dyn_gprivate_nofb_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_ptr(5) != omp_get_dyn_gprivate_nofb_ptr(5))
++Failed;
- if (!omp_get_dyn_groupprivate_size())
+ if (!omp_get_dyn_gprivate_size())
++Failed;
- if (omp_get_dyn_groupprivate_size() != N)
+ if (omp_get_dyn_gprivate_size() != N)
++Failed;
+ bool IsFallback = false; // FIX
if (IsFallback)
++Failed;
}
@@ -115,13 +135,17 @@ int main() {
#pragma omp target dyn_groupprivate(fallback(default_mem) : N) \
map(tofrom : Failed)
{
- int IsFallback;
- if (!omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ if (!omp_get_dyn_gprivate_ptr(0))
++Failed;
- if (!omp_get_dyn_groupprivate_size())
+ if (!omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
- if (omp_get_dyn_groupprivate_size() != N)
+ if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
+ if (!omp_get_dyn_gprivate_size())
+ ++Failed;
+ if (omp_get_dyn_gprivate_size() != N)
+ ++Failed;
+ bool IsFallback = false; // FIX
if (IsFallback)
++Failed;
}
@@ -129,24 +153,33 @@ int main() {
// Verify that the clause works when passing a zero size.
#pragma omp target dyn_groupprivate(fallback(abort) : 0) map(tofrom : Failed)
{
- int IsFallback;
- if (omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ if (omp_get_dyn_gprivate_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
- if (omp_get_dyn_groupprivate_size())
+ if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
+ if (omp_get_dyn_gprivate_size())
+ ++Failed;
+ bool IsFallback = false; // FIX
if (IsFallback)
++Failed;
}
-// Verify that the clause works when passing a zero size.
+// Verify that the clause works when passing a zero size and
+// fallback(default_mem).
#pragma omp target dyn_groupprivate(fallback(default_mem) : 0) \
map(tofrom : Failed)
{
- int IsFallback;
- if (omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ if (omp_get_dyn_gprivate_ptr(0))
++Failed;
- if (omp_get_dyn_groupprivate_size())
+ if (omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
+ if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_size())
+ ++Failed;
+ bool IsFallback = false; // FIX
if (IsFallback)
++Failed;
}
@@ -154,11 +187,15 @@ int main() {
// Verify that omitting the clause is the same as setting zero size.
#pragma omp target map(tofrom : Failed)
{
- int IsFallback;
- if (omp_get_dyn_groupprivate_ptr(0, &IsFallback))
+ if (omp_get_dyn_gprivate_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_nofb_ptr(0))
+ ++Failed;
+ if (omp_get_dyn_gprivate_ptr(0) != omp_get_dyn_gprivate_nofb_ptr(0))
++Failed;
- if (omp_get_dyn_groupprivate_size())
+ if (omp_get_dyn_gprivate_size())
++Failed;
+ bool IsFallback = false; // FIX
if (IsFallback)
++Failed;
}
diff --git a/openmp/device/src/State.cpp b/openmp/device/src/State.cpp
index 1e2e9d5d42a9b..180d7e3e4d419 100644
--- a/openmp/device/src/State.cpp
+++ b/openmp/device/src/State.cpp
@@ -40,6 +40,10 @@ using namespace ompx;
[[clang::loader_uninitialized]] static Local<KernelLaunchEnvironmentTy *>
KernelLaunchEnvironmentPtr;
+/// The pointer type for dynamic shared memory. This is important to keep
+/// the alignment and address space information.
+using SharedMemPtrTy = decltype(&DynamicSharedBuffer[0]);
+
///}
namespace {
@@ -139,28 +143,38 @@ void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
}
struct DynCGroupMemTy {
- void init(KernelLaunchEnvironmentTy *KLE, void *NativeDynCGroup) {
+ void init(KernelLaunchEnvironmentTy *KLE, SharedMemPtrTy NativePtr) {
+ NativeOrNullPtr = nullptr;
+ FallbackPtr = nullptr;
Size = 0;
- Ptr = nullptr;
Fallback = DynCGroupMemFallbackType::None;
if (!KLE)
return;
Size = KLE->DynCGroupMemSize;
Fallback = KLE->DynCGroupMemFb;
- if (Fallback == DynCGroupMemFallbackType::None)
- Ptr = static_cast<char *>(NativeDynCGroup);
- else if (Fallback == DynCGroupMemFallbackType::DefaultMem)
- Ptr = static_cast<char *>(KLE->DynCGroupMemFbPtr) +
- Size * omp_get_team_num();
+ if (Size && Fallback == DynCGroupMemFallbackType::None)
+ NativeOrNullPtr = NativePtr;
+ if (Fallback == DynCGroupMemFallbackType::DefaultMem)
+ FallbackPtr = static_cast<unsigned char *>(KLE->DynCGroupMemFbPtr) +
+ Size * mapping::getBlockIdInKernel();
}
- char *getPtr(size_t Offset) const { return Ptr + Offset; }
bool isFallback() const { return Fallback != DynCGroupMemFallbackType::None; }
+ bool isDefaultMemFallback() const {
+ return Fallback == DynCGroupMemFallbackType::DefaultMem;
+ }
size_t getSize() const { return Size; }
+ SharedMemPtrTy getNativeOrNullPtr() const { return NativeOrNullPtr; }
+
+ unsigned char *getNativeOrFallbackPtr() const {
+ return (isDefaultMemFallback()) ? FallbackPtr : getNativeOrNullPtr();
+ }
+
private:
- char *Ptr;
+ SharedMemPtrTy NativeOrNullPtr;
+ unsigned char *FallbackPtr;
size_t Size;
DynCGroupMemFallbackType Fallback;
};
@@ -451,14 +465,18 @@ int omp_get_initial_device(void) { return -1; }
int omp_is_initial_device(void) { return 0; }
-void *omp_get_dyn_groupprivate_ptr(size_t Offset, int *IsFallback,
- omp_access_t) {
- if (IsFallback != nullptr)
- *IsFallback = DynCGroupMem.isFallback();
- return DynCGroupMem.getPtr(Offset);
+void *omp_get_dyn_gprivate_ptr(size_t Offset, omp_access_t) {
+ return DynCGroupMem.getNativeOrFallbackPtr() + Offset;
+}
+
+void *omp_get_dyn_gprivate_nofb_ptr(size_t Offset, omp_access_t) {
+ unsigned char *Ptr = DynCGroupMem.getNativeOrNullPtr();
+ // Ensure the alignment and address space information is kept.
+ Ptr = (unsigned char *)__builtin_assume_aligned(Ptr, allocator::ALIGNMENT);
+ return (SharedMemPtrTy)(Ptr + Offset);
}
-size_t omp_get_dyn_groupprivate_size(omp_access_t) {
+size_t omp_get_dyn_gprivate_size(omp_access_t) {
return DynCGroupMem.getSize();
}
}
diff --git a/openmp/runtime/src/include/omp.h.var b/openmp/runtime/src/include/omp.h.var
index df525906e2c92..2bf70d2f68aa9 100644
--- a/openmp/runtime/src/include/omp.h.var
+++ b/openmp/runtime/src/include/omp.h.var
@@ -468,9 +468,10 @@
omp_allocator_handle_t allocator = omp_null_allocator,
omp_allocator_handle_t free_allocator = omp_null_allocator);
extern void __KAI_KMPC_CONVENTION omp_free(void * ptr, omp_allocator_handle_t a = omp_null_allocator);
- extern void *__KAI_KMPC_CONVENTION omp_get_dyn_groupprivate_ptr(size_t offset = 0, int *is_fallback = NULL, omp_access_t access_group = omp_access_cgroup);
- extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_groupprivate_size(omp_access_t access_group = omp_access_cgroup);
- extern size_t __KAI_KMPC_CONVENTION omp_get_groupprivate_limit(int device_num, omp_access_t access_group = omp_access_cgroup);
+ extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_ptr(size_t offset = 0, omp_access_t access_group = omp_access_cgroup);
+ extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_nofb_ptr(size_t offset = 0, omp_access_t access_group = omp_access_cgroup);
+ extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_size(omp_access_t access_group = omp_access_cgroup);
+ extern size_t __KAI_KMPC_CONVENTION omp_get_gprivate_limit(int device_num, omp_access_t access_group = omp_access_cgroup);
# else
extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a);
extern void *__KAI_KMPC_CONVENTION omp_aligned_alloc(size_t align, size_t size,
@@ -481,9 +482,10 @@
extern void *__KAI_KMPC_CONVENTION omp_realloc(void *ptr, size_t size, omp_allocator_handle_t allocator,
omp_allocator_handle_t free_allocator);
extern void __KAI_KMPC_CONVENTION omp_free(void *ptr, omp_allocator_handle_t a);
- extern void *__KAI_KMPC_CONVENTION omp_get_dyn_groupprivate_ptr(size_t offset, int *is_fallback, omp_access_t access_group);
- extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_groupprivate_size(omp_access_t access_group);
- extern size_t __KAI_KMPC_CONVENTION omp_get_groupprivate_limit(int device_num, omp_access_t access_group);
+ extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_ptr(size_t offset, omp_access_t access_group);
+ extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_nofb_ptr(size_t offset, omp_access_t access_group);
+ extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_size(omp_access_t access_group);
+ extern size_t __KAI_KMPC_CONVENTION omp_get_gprivate_limit(int device_num, omp_access_t access_group);
# endif
/* OpenMP TR11 routines to get memory spaces and allocators */
diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
index d7e96fa87acc9..fc5e7a591032c 100644
--- a/openmp/runtime/src/kmp_csupport.cpp
+++ b/openmp/runtime/src/kmp_csupport.cpp
@@ -4515,14 +4515,15 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
}
/* end of OpenMP 5.1 Memory Management routines */
-void *omp_get_dyn_groupprivate_ptr(size_t offset, int *is_fallback,
- omp_access_t access_group) {
- if (is_fallback != nullptr)
- *is_fallback = 0;
+void *omp_get_dyn_gprivate_ptr(size_t offset, omp_access_t access_group) {
return NULL;
}
-size_t omp_get_dyn_groupprivate_size(omp_access_t access_group) { return 0; }
+void *omp_get_dyn_gprivate_nofb_ptr(size_t offset, omp_access_t access_group) {
+ return NULL;
+}
+
+size_t omp_get_dyn_gprivate_size(omp_access_t access_group) { return 0; }
int __kmpc_get_target_offload(void) {
if (!__kmp_init_serial) {
diff --git a/openmp/runtime/src/kmp_stub.cpp b/openmp/runtime/src/kmp_stub.cpp
index c930c8305ab3b..2ba352377d674 100644
--- a/openmp/runtime/src/kmp_stub.cpp
+++ b/openmp/runtime/src/kmp_stub.cpp
@@ -454,20 +454,22 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
#endif
}
-void *omp_get_dyn_groupprivate_ptr(size_t offset, int *is_fallback,
- omp_access_t access_group) {
+void *omp_get_dyn_gprivate_ptr(size_t offset, omp_access_t access_group) {
i;
- if (is_fallback)
- is_fallback = 0;
return NULL;
}
-size_t omp_get_dyn_groupprivate_size(omp_access_t access_group) {
+void *omp_get_dyn_gprivate_nofb_ptr(size_t offset, omp_access_t access_group) {
+ i;
+ return NULL;
+}
+
+size_t omp_get_dyn_gprivate_size(omp_access_t access_group) {
i;
return 0;
}
-size_t omp_get_groupprivate_limit(int device_num, omp_access_t access_group) {
+size_t omp_get_gprivate_limit(int device_num, omp_access_t access_group) {
i;
return 0;
}
>From 87b6dbeab9dd07d3f714002bea3ebb679bf2d912 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Wed, 4 Mar 2026 21:46:37 -0800
Subject: [PATCH 33/38] Add omp_get_dyn_gprivate_memspace routine
---
offload/test/offloading/dyn_groupprivate.cpp | 16 ++++++++++++++++
openmp/device/include/DeviceTypes.h | 6 ++++++
openmp/device/src/State.cpp | 18 ++++++++++++++----
openmp/runtime/src/dllexports | 1 +
openmp/runtime/src/include/omp.h.var | 4 ++++
openmp/runtime/src/kmp.h | 1 +
openmp/runtime/src/kmp_csupport.cpp | 4 ++++
openmp/runtime/src/kmp_global.cpp | 2 ++
openmp/runtime/src/kmp_stub.cpp | 7 +++++++
9 files changed, 55 insertions(+), 4 deletions(-)
diff --git a/offload/test/offloading/dyn_groupprivate.cpp b/offload/test/offloading/dyn_groupprivate.cpp
index a8a0bbe3f37de..bd4496a42d346 100644
--- a/offload/test/offloading/dyn_groupprivate.cpp
+++ b/offload/test/offloading/dyn_groupprivate.cpp
@@ -71,6 +71,8 @@ int main() {
++Failed;
if (omp_get_dyn_gprivate_size() != ExceededSize)
++Failed;
+ if (omp_get_dyn_gprivate_memspace() != omp_default_mem_space)
+ ++Failed;
bool IsFallback = true; // FIX
if (!IsFallback)
++Failed;
@@ -88,6 +90,8 @@ int main() {
++Failed;
if (omp_get_dyn_gprivate_size())
++Failed;
+ if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
+ ++Failed;
bool IsFallback = true; // FIX
if (!IsFallback)
++Failed;
@@ -106,6 +110,8 @@ int main() {
++Failed;
if (omp_get_dyn_gprivate_size() != ExceededSize)
++Failed;
+ if (omp_get_dyn_gprivate_memspace() != omp_default_mem_space)
+ ++Failed;
bool IsFallback = true; // FIX
if (!IsFallback)
++Failed;
@@ -126,6 +132,8 @@ int main() {
++Failed;
if (omp_get_dyn_gprivate_size() != N)
++Failed;
+ if (omp_get_dyn_gprivate_memspace() != omp_cgroup_mem_space)
+ ++Failed;
bool IsFallback = false; // FIX
if (IsFallback)
++Failed;
@@ -145,6 +153,8 @@ int main() {
++Failed;
if (omp_get_dyn_gprivate_size() != N)
++Failed;
+ if (omp_get_dyn_gprivate_memspace() != omp_cgroup_mem_space)
+ ++Failed;
bool IsFallback = false; // FIX
if (IsFallback)
++Failed;
@@ -161,6 +171,8 @@ int main() {
++Failed;
if (omp_get_dyn_gprivate_size())
++Failed;
+ if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
+ ++Failed;
bool IsFallback = false; // FIX
if (IsFallback)
++Failed;
@@ -179,6 +191,8 @@ int main() {
++Failed;
if (omp_get_dyn_gprivate_size())
++Failed;
+ if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
+ ++Failed;
bool IsFallback = false; // FIX
if (IsFallback)
++Failed;
@@ -195,6 +209,8 @@ int main() {
++Failed;
if (omp_get_dyn_gprivate_size())
++Failed;
+ if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
+ ++Failed;
bool IsFallback = false; // FIX
if (IsFallback)
++Failed;
diff --git a/openmp/device/include/DeviceTypes.h b/openmp/device/include/DeviceTypes.h
index 24554b135e3af..2d36c22be7475 100644
--- a/openmp/device/include/DeviceTypes.h
+++ b/openmp/device/include/DeviceTypes.h
@@ -171,6 +171,12 @@ typedef enum omp_allocator_handle_t {
KMP_ALLOCATOR_MAX_HANDLE = ~(0LU)
} omp_allocator_handle_t;
+typedef enum omp_memspace_handle_t {
+ omp_null_mem_space = 0,
+ omp_cgroup_mem_space = 5,
+ omp_default_mem_space = 99
+} omp_memspace_handle_t;
+
#define __PRAGMA(STR) _Pragma(#STR)
#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
diff --git a/openmp/device/src/State.cpp b/openmp/device/src/State.cpp
index 180d7e3e4d419..e93d9cba49914 100644
--- a/openmp/device/src/State.cpp
+++ b/openmp/device/src/State.cpp
@@ -160,16 +160,22 @@ struct DynCGroupMemTy {
Size * mapping::getBlockIdInKernel();
}
- bool isFallback() const { return Fallback != DynCGroupMemFallbackType::None; }
- bool isDefaultMemFallback() const {
- return Fallback == DynCGroupMemFallbackType::DefaultMem;
+ omp_memspace_handle_t getMemSpace() const {
+ if (Size == 0)
+ return omp_null_mem_space;
+ if (Fallback == DynCGroupMemFallbackType::None)
+ return omp_cgroup_mem_space;
+ return omp_default_mem_space;
}
+
size_t getSize() const { return Size; }
SharedMemPtrTy getNativeOrNullPtr() const { return NativeOrNullPtr; }
unsigned char *getNativeOrFallbackPtr() const {
- return (isDefaultMemFallback()) ? FallbackPtr : getNativeOrNullPtr();
+ return (Fallback == DynCGroupMemFallbackType::DefaultMem)
+ ? FallbackPtr
+ : getNativeOrNullPtr();
}
private:
@@ -479,6 +485,10 @@ void *omp_get_dyn_gprivate_nofb_ptr(size_t Offset, omp_access_t) {
size_t omp_get_dyn_gprivate_size(omp_access_t) {
return DynCGroupMem.getSize();
}
+
+omp_memspace_handle_t omp_get_dyn_gprivate_memspace(omp_access_t) {
+ return DynCGroupMem.getMemSpace();
+}
}
extern "C" {
diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports
index 00becd1a657fd..8a70f8bc6d20c 100644
--- a/openmp/runtime/src/dllexports
+++ b/openmp/runtime/src/dllexports
@@ -607,6 +607,7 @@ kmp_set_disp_num_buffers 890
llvm_omp_target_shared_mem_space DATA
llvm_omp_target_device_mem_space DATA
omp_null_mem_space DATA
+ omp_cgroup_mem_space DATA
%ifndef stub
# Ordinals between 900 and 999 are reserved
diff --git a/openmp/runtime/src/include/omp.h.var b/openmp/runtime/src/include/omp.h.var
index 2bf70d2f68aa9..be309727ba090 100644
--- a/openmp/runtime/src/include/omp.h.var
+++ b/openmp/runtime/src/include/omp.h.var
@@ -408,6 +408,7 @@
extern __KMP_IMP omp_memspace_handle_t const omp_const_mem_space;
extern __KMP_IMP omp_memspace_handle_t const omp_high_bw_mem_space;
extern __KMP_IMP omp_memspace_handle_t const omp_low_lat_mem_space;
+ extern __KMP_IMP omp_memspace_handle_t const omp_cgroup_mem_space;
extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_host_mem_space;
extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_shared_mem_space;
extern __KMP_IMP omp_memspace_handle_t const llvm_omp_target_device_mem_space;
@@ -444,6 +445,7 @@
omp_const_mem_space = 2,
omp_high_bw_mem_space = 3,
omp_low_lat_mem_space = 4,
+ omp_cgroup_mem_space = 5,
llvm_omp_target_host_mem_space = 100,
llvm_omp_target_shared_mem_space = 101,
llvm_omp_target_device_mem_space = 102,
@@ -471,6 +473,7 @@
extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_ptr(size_t offset = 0, omp_access_t access_group = omp_access_cgroup);
extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_nofb_ptr(size_t offset = 0, omp_access_t access_group = omp_access_cgroup);
extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_size(omp_access_t access_group = omp_access_cgroup);
+ extern omp_memspace_handle_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_memspace(omp_access_t access_group = omp_access_cgroup);
extern size_t __KAI_KMPC_CONVENTION omp_get_gprivate_limit(int device_num, omp_access_t access_group = omp_access_cgroup);
# else
extern void *__KAI_KMPC_CONVENTION omp_alloc(size_t size, omp_allocator_handle_t a);
@@ -485,6 +488,7 @@
extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_ptr(size_t offset, omp_access_t access_group);
extern void *__KAI_KMPC_CONVENTION omp_get_dyn_gprivate_nofb_ptr(size_t offset, omp_access_t access_group);
extern size_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_size(omp_access_t access_group);
+ extern omp_memspace_handle_t __KAI_KMPC_CONVENTION omp_get_dyn_gprivate_memspace(omp_access_t access_group);
extern size_t __KAI_KMPC_CONVENTION omp_get_gprivate_limit(int device_num, omp_access_t access_group);
# endif
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 36c40abaf1ef4..19deaef75415d 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -1072,6 +1072,7 @@ extern omp_memspace_handle_t const omp_large_cap_mem_space;
extern omp_memspace_handle_t const omp_const_mem_space;
extern omp_memspace_handle_t const omp_high_bw_mem_space;
extern omp_memspace_handle_t const omp_low_lat_mem_space;
+extern omp_memspace_handle_t const omp_cgroup_mem_space;
extern omp_memspace_handle_t const llvm_omp_target_host_mem_space;
extern omp_memspace_handle_t const llvm_omp_target_shared_mem_space;
extern omp_memspace_handle_t const llvm_omp_target_device_mem_space;
diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
index fc5e7a591032c..8aa9a9caa924b 100644
--- a/openmp/runtime/src/kmp_csupport.cpp
+++ b/openmp/runtime/src/kmp_csupport.cpp
@@ -4525,6 +4525,10 @@ void *omp_get_dyn_gprivate_nofb_ptr(size_t offset, omp_access_t access_group) {
size_t omp_get_dyn_gprivate_size(omp_access_t access_group) { return 0; }
+omp_memspace_handle_t omp_get_dyn_gprivate_memspace(omp_access_t access_group) {
+ return omp_null_mem_space;
+}
+
int __kmpc_get_target_offload(void) {
if (!__kmp_init_serial) {
__kmp_serial_initialize();
diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp
index 6c3b576cab405..c6fdcf824af92 100644
--- a/openmp/runtime/src/kmp_global.cpp
+++ b/openmp/runtime/src/kmp_global.cpp
@@ -333,6 +333,8 @@ omp_memspace_handle_t const omp_high_bw_mem_space =
(omp_memspace_handle_t const)3;
omp_memspace_handle_t const omp_low_lat_mem_space =
(omp_memspace_handle_t const)4;
+omp_memspace_handle_t const omp_cgroup_mem_space =
+ (omp_memspace_handle_t const)5;
omp_memspace_handle_t const llvm_omp_target_host_mem_space =
(omp_memspace_handle_t const)100;
omp_memspace_handle_t const llvm_omp_target_shared_mem_space =
diff --git a/openmp/runtime/src/kmp_stub.cpp b/openmp/runtime/src/kmp_stub.cpp
index 2ba352377d674..4c1e6099574a6 100644
--- a/openmp/runtime/src/kmp_stub.cpp
+++ b/openmp/runtime/src/kmp_stub.cpp
@@ -368,6 +368,8 @@ omp_memspace_handle_t const omp_high_bw_mem_space =
(omp_memspace_handle_t const)3;
omp_memspace_handle_t const omp_low_lat_mem_space =
(omp_memspace_handle_t const)4;
+omp_memspace_handle_t const omp_cgroup_mem_space =
+ (omp_memspace_handle_t const)5;
omp_memspace_handle_t const llvm_omp_target_host_mem_space =
(omp_memspace_handle_t const)100;
omp_memspace_handle_t const llvm_omp_target_shared_mem_space =
@@ -469,6 +471,11 @@ size_t omp_get_dyn_gprivate_size(omp_access_t access_group) {
return 0;
}
+omp_memspace_handle_t omp_get_dyn_gprivate_memspace(omp_access_t access_group) {
+ i;
+ return omp_null_mem_space;
+}
+
size_t omp_get_gprivate_limit(int device_num, omp_access_t access_group) {
i;
return 0;
>From 89b4732784e68204d8f9bded4eebeeabef917c1b Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Fri, 6 Mar 2026 16:27:27 -0800
Subject: [PATCH 34/38] Add fixes
---
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 2 ++
offload/plugins-nextgen/cuda/src/rtl.cpp | 2 +-
offload/plugins-nextgen/level_zero/include/L0Kernel.h | 4 ++--
offload/plugins-nextgen/level_zero/src/L0Kernel.cpp | 2 +-
offload/test/offloading/dyn_groupprivate.cpp | 1 +
5 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 8d529264285d0..c083e34e1bf4d 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -278,6 +278,7 @@ struct AMDGPUMemoryPoolTy {
if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags))
return Err;
+
return Plugin::success();
}
@@ -548,6 +549,7 @@ struct AMDGPUKernelTy : public GenericKernelTy {
return Err;
}
+ // Set the static block memory size required by the kernel.
StaticBlockMemSize = GroupSize;
// Make sure it is a kernel symbol.
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 229fa2dc61d9f..473642906b2c0 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -156,7 +156,7 @@ struct CUDAKernelTy : public GenericKernelTy {
if (auto Err = Plugin::check(Res, "Error in cuFuncGetAttribute: %s"))
return Err;
- // Set the static block memory size.
+ // Set the static block memory size required by the kernel.
StaticBlockMemSize = SharedMemSize;
// Retrieve the size of the arguments.
diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
index 1d5a014d9d0a5..50cdbd8390a9d 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Kernel.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h
@@ -124,8 +124,8 @@ class L0KernelTy : public GenericKernelTy {
Error initImpl(GenericDeviceTy &GenericDevice, DeviceImageTy &Image) override;
/// Launch the L0 kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
- uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
- KernelLaunchParamsTy LaunchParams,
+ uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
+ KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
Error deinit() {
CALL_ZE_RET_ERROR(zeKernelDestroy, zeKernel);
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
index b608e6ffe7931..8f95e39837856 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -413,7 +413,7 @@ Error L0KernelTy::setIndirectFlags(L0DeviceTy &l0Device,
Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
- KernelArgsTy &KernelArgs,
+ uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
auto &l0Device = L0DeviceTy::makeL0Device(GenericDevice);
diff --git a/offload/test/offloading/dyn_groupprivate.cpp b/offload/test/offloading/dyn_groupprivate.cpp
index bd4496a42d346..b1087a421550b 100644
--- a/offload/test/offloading/dyn_groupprivate.cpp
+++ b/offload/test/offloading/dyn_groupprivate.cpp
@@ -3,6 +3,7 @@
// RUN: %libomptarget-compileoptxx-generic -fopenmp-version=61
// RUN: %libomptarget-run-generic | %fcheck-generic
// REQUIRES: gpu
+// UNSUPPORTED: intelgpu
#include <omp.h>
#include <stdio.h>
>From 3a3c9d8a379c8cd09abe33eae7b5175308e8e247 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Tue, 10 Mar 2026 10:57:26 -0700
Subject: [PATCH 35/38] Add check in L0 plugin
---
offload/plugins-nextgen/level_zero/src/L0Kernel.cpp | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
index 8f95e39837856..1bffbbcd2fe92 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -416,6 +416,10 @@ Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+ if (DynBlockMemSize > 0)
+ return Plugin::error(ErrorCode::UNSUPPORTED,
+ "dynamic shared memory is unsupported in L0 plugin");
+
auto &l0Device = L0DeviceTy::makeL0Device(GenericDevice);
__tgt_async_info *AsyncInfo = AsyncInfoWrapper;
>From 630664e9a29966355998bd56ad10cb1161fd115c Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Tue, 10 Mar 2026 11:12:49 -0700
Subject: [PATCH 36/38] Add memspaces in openmp device rtl
---
openmp/device/include/DeviceTypes.h | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/openmp/device/include/DeviceTypes.h b/openmp/device/include/DeviceTypes.h
index 2d36c22be7475..57fb945b5a647 100644
--- a/openmp/device/include/DeviceTypes.h
+++ b/openmp/device/include/DeviceTypes.h
@@ -173,8 +173,13 @@ typedef enum omp_allocator_handle_t {
typedef enum omp_memspace_handle_t {
omp_null_mem_space = 0,
+ omp_default_mem_space = 99,
+ omp_large_cap_mem_space = 1,
+ omp_const_mem_space = 2,
+ omp_high_bw_mem_space = 3,
+ omp_low_lat_mem_space = 4,
omp_cgroup_mem_space = 5,
- omp_default_mem_space = 99
+ KMP_MEMSPACE_MAX_HANDLE = ~(0LU)
} omp_memspace_handle_t;
#define __PRAGMA(STR) _Pragma(#STR)
>From 13bd037c639b5d8c3fb0b4f5ed06653b564a8074 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Tue, 10 Mar 2026 11:19:34 -0700
Subject: [PATCH 37/38] Fix test
---
offload/test/offloading/dyn_groupprivate.cpp | 24 --------------------
1 file changed, 24 deletions(-)
diff --git a/offload/test/offloading/dyn_groupprivate.cpp b/offload/test/offloading/dyn_groupprivate.cpp
index b1087a421550b..fd0c3de0c8c5d 100644
--- a/offload/test/offloading/dyn_groupprivate.cpp
+++ b/offload/test/offloading/dyn_groupprivate.cpp
@@ -74,9 +74,6 @@ int main() {
++Failed;
if (omp_get_dyn_gprivate_memspace() != omp_default_mem_space)
++Failed;
- bool IsFallback = true; // FIX
- if (!IsFallback)
- ++Failed;
}
// Verify that the fallback(null) modifier works.
@@ -93,9 +90,6 @@ int main() {
++Failed;
if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
++Failed;
- bool IsFallback = true; // FIX
- if (!IsFallback)
- ++Failed;
}
// Verify that the default modifier is fallback(default_mem).
@@ -113,9 +107,6 @@ int main() {
++Failed;
if (omp_get_dyn_gprivate_memspace() != omp_default_mem_space)
++Failed;
- bool IsFallback = true; // FIX
- if (!IsFallback)
- ++Failed;
}
// Verify that the fallback(abort) modifier works.
@@ -135,9 +126,6 @@ int main() {
++Failed;
if (omp_get_dyn_gprivate_memspace() != omp_cgroup_mem_space)
++Failed;
- bool IsFallback = false; // FIX
- if (IsFallback)
- ++Failed;
}
// Verify that the fallback(default_mem) does not trigger when not needed.
@@ -156,9 +144,6 @@ int main() {
++Failed;
if (omp_get_dyn_gprivate_memspace() != omp_cgroup_mem_space)
++Failed;
- bool IsFallback = false; // FIX
- if (IsFallback)
- ++Failed;
}
// Verify that the clause works when passing a zero size.
@@ -174,9 +159,6 @@ int main() {
++Failed;
if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
++Failed;
- bool IsFallback = false; // FIX
- if (IsFallback)
- ++Failed;
}
// Verify that the clause works when passing a zero size and
@@ -194,9 +176,6 @@ int main() {
++Failed;
if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
++Failed;
- bool IsFallback = false; // FIX
- if (IsFallback)
- ++Failed;
}
// Verify that omitting the clause is the same as setting zero size.
@@ -212,9 +191,6 @@ int main() {
++Failed;
if (omp_get_dyn_gprivate_memspace() != omp_null_mem_space)
++Failed;
- bool IsFallback = false; // FIX
- if (IsFallback)
- ++Failed;
}
// CHECK: PASS
>From 1b20c9cd859299bd9e59dc709b5f25a410d9a231 Mon Sep 17 00:00:00 2001
From: Kevin Sala <salapenades1 at llnl.gov>
Date: Wed, 11 Mar 2026 22:35:28 -0700
Subject: [PATCH 38/38] Address review comments
---
.../include/llvm/Frontend/OpenMP/OMPKinds.def | 2 +-
offload/include/Shared/Environment.h | 2 +-
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 7 ++--
.../common/include/PluginInterface.h | 3 --
.../common/src/PluginInterface.cpp | 32 +++++++++----------
openmp/device/src/State.cpp | 9 ++++++
6 files changed, 31 insertions(+), 24 deletions(-)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 152a8f727310a..5fe7ee8997243 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -101,7 +101,7 @@ __OMP_STRUCT_TYPE(DynamicEnvironment, DynamicEnvironmentTy, false, Int16)
__OMP_STRUCT_TYPE(KernelEnvironment, KernelEnvironmentTy, false,
ConfigurationEnvironment, IdentPtr, DynamicEnvironmentPtr)
__OMP_STRUCT_TYPE(KernelLaunchEnvironment, KernelLaunchEnvironmentTy, false,
- Int32, Int32)
+ VoidPtr, VoidPtr, Int32, Int32, Int32, Int8)
#undef __OMP_STRUCT_TYPE
#undef OMP_STRUCT_TYPE
diff --git a/offload/include/Shared/Environment.h b/offload/include/Shared/Environment.h
index 1873f861c7f01..142fba40340e6 100644
--- a/offload/include/Shared/Environment.h
+++ b/offload/include/Shared/Environment.h
@@ -71,7 +71,7 @@ struct KernelEnvironmentTy {
};
/// The fallback types for the dynamic cgroup memory.
-enum class DynCGroupMemFallbackType : unsigned char {
+enum class DynCGroupMemFallbackType : uint8_t {
/// None. Used for indicating that no fallback was triggered.
None = 0,
/// Abort the execution.
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index c083e34e1bf4d..2a99423d9e94f 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3805,10 +3805,13 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
DynBlockMemSize =
std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());
+ // HSA requires the group segment size to include both static and dynamic.
+ uint32_t TotalBlockMemSize = getStaticBlockMemSize() + DynBlockMemSize;
+
// Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
- getStaticBlockMemSize() + DynBlockMemSize,
- StackSize, ArgsMemoryManager);
+ TotalBlockMemSize, StackSize,
+ ArgsMemoryManager);
}
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 1ba0e6f4474fe..40e463e04fe3f 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -788,9 +788,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// kernel.
size_t getMaxBlockSharedMemSize() const { return MaxBlockSharedMemSize; }
- /// Indicate whether the device supports block shared memory natively.
- bool hasNativeBlockSharedMem() const { return MaxBlockSharedMemSize > 0; }
-
/// Set the context of the device if needed, before calling device-specific
/// functions. Plugins may implement this function as a no-op if not needed.
virtual Error setContext() = 0;
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 3ff47b934965a..c6057e549b59c 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -464,6 +464,11 @@ GenericKernelTy::getKernelLaunchEnvironment(
auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment;
LocalKLE = KernelLaunchEnvironment;
+ LocalKLE.DynCGroupMemSize = DynBlockMemConf.Size;
+ LocalKLE.DynCGroupMemFbPtr = DynBlockMemConf.FallbackPtr;
+ LocalKLE.DynCGroupMemFb = DynBlockMemConf.Fallback;
+ LocalKLE.ReductionBuffer = nullptr;
+
if (KernelEnvironment.Configuration.ReductionDataSize &&
KernelEnvironment.Configuration.ReductionBufferLength) {
auto AllocOrErr = GenericDevice.dataAlloc(
@@ -475,14 +480,8 @@ GenericKernelTy::getKernelLaunchEnvironment(
LocalKLE.ReductionBuffer = *AllocOrErr;
// Remember to free the memory later.
AsyncInfoWrapper.freeAllocationAfterSynchronization(*AllocOrErr);
- } else {
- LocalKLE.ReductionBuffer = nullptr;
}
- LocalKLE.DynCGroupMemSize = DynBlockMemConf.Size;
- LocalKLE.DynCGroupMemFbPtr = DynBlockMemConf.FallbackPtr;
- LocalKLE.DynCGroupMemFb = DynBlockMemConf.Fallback;
-
INFO(OMP_INFOTYPE_DATA_TRANSFER, GenericDevice.getDeviceId(),
"Copying data from host to device, HstPtr=" DPxMOD ", TgtPtr=" DPxMOD
", Size=%" PRId64 ", Name=KernelLaunchEnv\n",
@@ -532,22 +531,24 @@ GenericKernelTy::prepareBlockMemory(GenericDeviceTy &GenericDevice,
return Plugin::error(ErrorCode::INVALID_ARGUMENT,
"Static block memory size exceeds maximum");
// No enough block memory to cover dynamic one, and the fallback is aborting.
- else if (static_cast<DynCGroupMemFallbackType>(
- KernelArgs.Flags.DynCGroupMemFallback) ==
- DynCGroupMemFallbackType::Abort &&
- TotalBlockMemSize > MaxBlockMemSize)
+ if (static_cast<DynCGroupMemFallbackType>(
+ KernelArgs.Flags.DynCGroupMemFallback) ==
+ DynCGroupMemFallbackType::Abort &&
+ TotalBlockMemSize > MaxBlockMemSize)
return Plugin::error(
ErrorCode::INVALID_ARGUMENT,
- "Static and dynamic block memory size exceeds maximum");
+ "Requested block memory size (static + dynamic) exceeds maximum");
DynCGroupMemFallbackType DynFallback = DynCGroupMemFallbackType::None;
- if (DynBlockMemSize && (!GenericDevice.hasNativeBlockSharedMem() ||
- TotalBlockMemSize > MaxBlockMemSize)) {
+ if (DynBlockMemSize && TotalBlockMemSize > MaxBlockMemSize) {
// Launch without native dynamic block memory.
DynNativeBlockMemSize = 0;
DynFallback = static_cast<DynCGroupMemFallbackType>(
KernelArgs.Flags.DynCGroupMemFallback);
- if (DynFallback == DynCGroupMemFallbackType::DefaultMem) {
+ if (DynFallback != DynCGroupMemFallbackType::DefaultMem) {
+ // Do not provide any memory as fallback.
+ DynBlockMemSize = 0;
+ } else {
// Get global memory as fallback.
auto AllocOrErr = GenericDevice.dataAlloc(
NumBlocks * DynBlockMemSize,
@@ -555,9 +556,6 @@ GenericKernelTy::prepareBlockMemory(GenericDeviceTy &GenericDevice,
if (!AllocOrErr)
return AllocOrErr.takeError();
DynFallbackPtr = *AllocOrErr;
- } else {
- // Do not provide any memory as fallback.
- DynBlockMemSize = 0;
}
}
return DynBlockMemConfTy{DynBlockMemSize, DynNativeBlockMemSize, DynFallback,
diff --git a/openmp/device/src/State.cpp b/openmp/device/src/State.cpp
index e93d9cba49914..243af1f2cb5e2 100644
--- a/openmp/device/src/State.cpp
+++ b/openmp/device/src/State.cpp
@@ -142,8 +142,12 @@ void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}
+/// Manager of the dynamic cgroup memory buffer.
struct DynCGroupMemTy {
+ /// Initialize the manager with the information from the kernel launch
+ /// environment and the pointer to the native shared memory buffer.
void init(KernelLaunchEnvironmentTy *KLE, SharedMemPtrTy NativePtr) {
+ // Initialize default values.
NativeOrNullPtr = nullptr;
FallbackPtr = nullptr;
Size = 0;
@@ -151,6 +155,7 @@ struct DynCGroupMemTy {
if (!KLE)
return;
+ // Initialize values using the kernel launch environment.
Size = KLE->DynCGroupMemSize;
Fallback = KLE->DynCGroupMemFb;
if (Size && Fallback == DynCGroupMemFallbackType::None)
@@ -160,6 +165,7 @@ struct DynCGroupMemTy {
Size * mapping::getBlockIdInKernel();
}
+ /// Get the memory space of the buffer.
omp_memspace_handle_t getMemSpace() const {
if (Size == 0)
return omp_null_mem_space;
@@ -168,10 +174,13 @@ struct DynCGroupMemTy {
return omp_default_mem_space;
}
+ /// Get the size of the buffer.
size_t getSize() const { return Size; }
+ /// Get the native pointer or null if it was a fallback.
SharedMemPtrTy getNativeOrNullPtr() const { return NativeOrNullPtr; }
+ /// Get the native pointer or the fallback pointer.
unsigned char *getNativeOrFallbackPtr() const {
return (Fallback == DynCGroupMemFallbackType::DefaultMem)
? FallbackPtr
More information about the cfe-commits
mailing list