[Openmp-commits] [openmp] 08bbff4 - [OpenMP] Codegen support for thread_limit on target directive for host
Sandeep Kosuri via Openmp-commits
openmp-commits at lists.llvm.org
Sat Aug 26 20:23:27 PDT 2023
Author: Sandeep Kosuri
Date: 2023-08-26T22:18:49-05:00
New Revision: 08bbff4aad57c70a38d5d2680a61901977e66637
URL: https://github.com/llvm/llvm-project/commit/08bbff4aad57c70a38d5d2680a61901977e66637
DIFF: https://github.com/llvm/llvm-project/commit/08bbff4aad57c70a38d5d2680a61901977e66637.diff
LOG: [OpenMP] Codegen support for thread_limit on target directive for host
offloading
- This patch adds support for thread_limit clause on target directive according to OpenMP 51 [2.14.5]
- The idea is to create an outer task for target region, when there is a thread_limit clause, and manipulate the thread_limit of task instead. This way, thread_limit will be applied to all the relevant constructs enclosed by the target region.
Differential Revision: https://reviews.llvm.org/D152054
Added:
clang/test/OpenMP/target_parallel_for_simd_tl_codegen.cpp
clang/test/OpenMP/target_parallel_for_tl_codegen.cpp
clang/test/OpenMP/target_parallel_generic_loop_tl_codegen.cpp
clang/test/OpenMP/target_parallel_tl_codegen.cpp
clang/test/OpenMP/target_simd_tl_codegen.cpp
openmp/runtime/test/target/target_thread_limit.cpp
Modified:
clang/include/clang/Basic/OpenMPKinds.h
clang/lib/Basic/OpenMPKinds.cpp
clang/lib/CodeGen/CGOpenMPRuntime.cpp
clang/lib/CodeGen/CGOpenMPRuntime.h
clang/lib/CodeGen/CGStmtOpenMP.cpp
clang/lib/Sema/SemaOpenMP.cpp
clang/test/OpenMP/target_codegen.cpp
llvm/include/llvm/Frontend/OpenMP/OMP.td
llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
openmp/runtime/src/kmp.h
openmp/runtime/src/kmp_csupport.cpp
openmp/runtime/src/kmp_ftn_entry.h
openmp/runtime/src/kmp_global.cpp
openmp/runtime/src/kmp_runtime.cpp
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h
index f5fc7a8ce5bb3c..ac1b3cdfff145b 100644
--- a/clang/include/clang/Basic/OpenMPKinds.h
+++ b/clang/include/clang/Basic/OpenMPKinds.h
@@ -356,6 +356,13 @@ void getOpenMPCaptureRegions(
/// \return true - if the above condition is met for this directive
/// otherwise - false.
bool isOpenMPCombinedParallelADirective(OpenMPDirectiveKind DKind);
+
+/// Checks if the specified target directive, combined or not, needs task based
+/// thread_limit
+/// \param DKind Specified directive.
+/// \return true - if the above condition is met for this directive
+/// otherwise - false.
+bool needsTaskBasedThreadLimit(OpenMPDirectiveKind DKind);
}
#endif
diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp
index a679f2ecf0e2b5..86de067da134a0 100644
--- a/clang/lib/Basic/OpenMPKinds.cpp
+++ b/clang/lib/Basic/OpenMPKinds.cpp
@@ -748,6 +748,13 @@ bool clang::isOpenMPCombinedParallelADirective(OpenMPDirectiveKind DKind) {
DKind == OMPD_parallel_sections;
}
+bool clang::needsTaskBasedThreadLimit(OpenMPDirectiveKind DKind) {
+ return DKind == OMPD_target || DKind == OMPD_target_parallel ||
+ DKind == OMPD_target_parallel_for ||
+ DKind == OMPD_target_parallel_for_simd || DKind == OMPD_target_simd ||
+ DKind == OMPD_target_parallel_loop;
+}
+
void clang::getOpenMPCaptureRegions(
SmallVectorImpl<OpenMPDirectiveKind> &CaptureRegions,
OpenMPDirectiveKind DKind) {
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 5d947a2c0943a1..253ef8b75163ec 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -9681,9 +9681,13 @@ void CGOpenMPRuntime::emitTargetCall(
assert((OffloadingMandatory || OutlinedFn) && "Invalid outlined function!");
- const bool RequiresOuterTask = D.hasClausesOfKind<OMPDependClause>() ||
- D.hasClausesOfKind<OMPNowaitClause>() ||
- D.hasClausesOfKind<OMPInReductionClause>();
+ const bool RequiresOuterTask =
+ D.hasClausesOfKind<OMPDependClause>() ||
+ D.hasClausesOfKind<OMPNowaitClause>() ||
+ D.hasClausesOfKind<OMPInReductionClause>() ||
+ (CGM.getLangOpts().OpenMP >= 51 &&
+ needsTaskBasedThreadLimit(D.getDirectiveKind()) &&
+ D.hasClausesOfKind<OMPThreadLimitClause>());
llvm::SmallVector<llvm::Value *, 16> CapturedVars;
const CapturedStmt &CS = *D.getCapturedStmt(OMPD_target);
auto &&ArgsCodegen = [&CS, &CapturedVars](CodeGenFunction &CGF,
@@ -10235,6 +10239,24 @@ void CGOpenMPRuntime::emitNumTeamsClause(CodeGenFunction &CGF,
PushNumTeamsArgs);
}
+void CGOpenMPRuntime::emitThreadLimitClause(CodeGenFunction &CGF,
+ const Expr *ThreadLimit,
+ SourceLocation Loc) {
+ llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
+ llvm::Value *ThreadLimitVal =
+ ThreadLimit
+ ? CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(ThreadLimit),
+ CGF.CGM.Int32Ty, /* isSigned = */ true)
+ : CGF.Builder.getInt32(0);
+
+ // Build call __kmpc_set_thread_limit(&loc, global_tid, thread_limit)
+ llvm::Value *ThreadLimitArgs[] = {RTLoc, getThreadID(CGF, Loc),
+ ThreadLimitVal};
+ CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_set_thread_limit),
+ ThreadLimitArgs);
+}
+
void CGOpenMPRuntime::emitTargetDataCalls(
CodeGenFunction &CGF, const OMPExecutableDirective &D, const Expr *IfCond,
const Expr *Device, const RegionCodeGenTy &CodeGen,
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
index d1ad6d7f06a857..74b528d6cd7f8c 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -1435,6 +1435,14 @@ class CGOpenMPRuntime {
virtual void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams,
const Expr *ThreadLimit, SourceLocation Loc);
+ /// Emits call to void __kmpc_set_thread_limit(ident_t *loc, kmp_int32
+ /// global_tid, kmp_int32 thread_limit) to generate code for
+ /// thread_limit clause on target directive
+ /// \param ThreadLimit An integer expression of threads.
+ virtual void emitThreadLimitClause(CodeGenFunction &CGF,
+ const Expr *ThreadLimit,
+ SourceLocation Loc);
+
/// Struct that keeps all the relevant information that should be kept
/// throughout a 'target data' region.
class TargetDataInfo : public llvm::OpenMPIRBuilder::TargetDataInfo {
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 4910ff6865e43d..6eca0a5ccab41d 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -5143,6 +5143,15 @@ void CodeGenFunction::EmitOMPTargetTaskBasedDirective(
Action.Enter(CGF);
OMPLexicalScope LexScope(CGF, S, OMPD_task, /*EmitPreInitStmt=*/false);
+ auto *TL = S.getSingleClause<OMPThreadLimitClause>();
+ if (CGF.CGM.getLangOpts().OpenMP >= 51 &&
+ needsTaskBasedThreadLimit(S.getDirectiveKind()) && TL) {
+ // Emit __kmpc_set_thread_limit() to set the thread_limit for the task
+ // enclosing this target region. This will indirectly set the thread_limit
+ // for every applicable construct within target region.
+ CGF.CGM.getOpenMPRuntime().emitThreadLimitClause(
+ CGF, TL->getThreadLimit(), S.getBeginLoc());
+ }
BodyGen(CGF);
};
llvm::Function *OutlinedFn = CGM.getOpenMPRuntime().emitTaskOutlinedFunction(
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 04aac12efe8bf0..46eae3596d2a8f 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -15907,6 +15907,11 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause(
case OMPD_target_teams_distribute_parallel_for:
case OMPD_target_teams_distribute_parallel_for_simd:
case OMPD_target_teams_loop:
+ case OMPD_target_simd:
+ case OMPD_target_parallel:
+ case OMPD_target_parallel_for:
+ case OMPD_target_parallel_for_simd:
+ case OMPD_target_parallel_loop:
CaptureRegion = OMPD_target;
break;
case OMPD_teams_distribute_parallel_for:
@@ -15942,11 +15947,6 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause(
case OMPD_parallel_for:
case OMPD_parallel_for_simd:
case OMPD_parallel_loop:
- case OMPD_target_simd:
- case OMPD_target_parallel:
- case OMPD_target_parallel_for:
- case OMPD_target_parallel_for_simd:
- case OMPD_target_parallel_loop:
case OMPD_threadprivate:
case OMPD_allocate:
case OMPD_taskyield:
diff --git a/clang/test/OpenMP/target_codegen.cpp b/clang/test/OpenMP/target_codegen.cpp
index bf56b25af11ed6..bd3d7eb853dab1 100644
--- a/clang/test/OpenMP/target_codegen.cpp
+++ b/clang/test/OpenMP/target_codegen.cpp
@@ -846,7 +846,8 @@ void thread_limit_target(int TargetTL, int TeamsTL) {
// OMP51: store {{.*}} [[TL]], {{.*}} [[CEA:%.*]]
// OMP51: load {{.*}} [[CEA]]
// OMP51: [[CE:%.*]] = load {{.*}} [[CEA]]
-// OMP51: call i32 @__tgt_target_kernel({{.*}}, i64 -1, i32 -1, i32 [[CE]],
+// OMP51: call ptr @__kmpc_omp_task_alloc({{.*@.omp_task_entry.*}})
+// OMP51: call i32 [[OMP_TASK_ENTRY]]
#pragma omp target thread_limit(TargetTL)
#pragma omp teams
@@ -854,8 +855,8 @@ void thread_limit_target(int TargetTL, int TeamsTL) {
// OMP51: [[TL:%.*]] = load {{.*}} %TargetTL.addr
// OMP51: store {{.*}} [[TL]], {{.*}} [[CEA:%.*]]
// OMP51: load {{.*}} [[CEA]]
-// OMP51: [[CE:%.*]] = load {{.*}} [[CEA]]
-// OMP51: call i32 @__tgt_target_kernel({{.*}}, i64 -1, i32 0, i32 [[CE]],
+// OMP51: call ptr @__kmpc_omp_task_alloc({{.*@.omp_task_entry.*}})
+// OMP51: call i32 [[OMP_TASK_ENTRY]]
#pragma omp target
#pragma omp teams thread_limit(TeamsTL)
@@ -869,10 +870,25 @@ void thread_limit_target(int TargetTL, int TeamsTL) {
{}
// OMP51: load {{.*}} %TeamsTL.addr
// OMP51: [[TeamsL:%.*]] = load {{.*}} %TeamsTL.addr
-// OMP51: call i32 @__tgt_target_kernel({{.*}}, i64 -1, i32 0, i32 [[TeamsL]],
+// OMP51: call ptr @__kmpc_omp_task_alloc({{.*@.omp_task_entry.*}})
+// OMP51: call i32 [[OMP_TASK_ENTRY]]
}
#endif
+// Check that the offloading functions are called after setting thread_limit in the task entry functions
+
+// OMP51: define internal {{.*}}i32 [[OMP_TASK_ENTRY:@.+]](i32 {{.*}}%0, ptr noalias noundef %1)
+// OMP51: call void @__kmpc_set_thread_limit(ptr @{{.+}}, i32 %{{.+}}, i32 %{{.+}})
+// OMP51: call i32 @__tgt_target_kernel({{.*}}, i64 -1, i32 -1,
+
+// OMP51: define internal {{.*}}i32 [[OMP_TASK_ENTRY:@.+]](i32 {{.*}}%0, ptr noalias noundef %1)
+// OMP51: call void @__kmpc_set_thread_limit(ptr @{{.+}}, i32 %{{.+}}, i32 %{{.+}})
+// OMP51: call i32 @__tgt_target_kernel({{.*}}, i64 -1, i32 0,
+
+// OMP51: define internal {{.*}}i32 [[OMP_TASK_ENTRY:@.+]](i32 {{.*}}%0, ptr noalias noundef %1)
+// OMP51: call void @__kmpc_set_thread_limit(ptr @{{.+}}, i32 %{{.+}}, i32 %{{.+}})
+// OMP51: call i32 @__tgt_target_kernel({{.*}}, i64 -1, i32 0,
+
// CHECK: define internal void @.omp_offloading.requires_reg()
// CHECK: call void @__tgt_register_requires(i64 1)
diff --git a/clang/test/OpenMP/target_parallel_for_simd_tl_codegen.cpp b/clang/test/OpenMP/target_parallel_for_simd_tl_codegen.cpp
new file mode 100644
index 00000000000000..daeb5102b0e22b
--- /dev/null
+++ b/clang/test/OpenMP/target_parallel_for_simd_tl_codegen.cpp
@@ -0,0 +1,66 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 2
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck --check-prefix=OMP51 %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=OMP51
+
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+int thread_limit_target_parallel_for_simd() {
+
+// Check that the offloading function is called after setting thread_limit in the task entry function
+#pragma omp target parallel for simd thread_limit(2)
+ for(int i=0; i<2; i++) {}
+
+ return 0;
+}
+
+#endif
+// OMP51-LABEL: define {{.*}}thread_limit_target_parallel_for_simd{{.*}}{
+// OMP51-NEXT: entry:
+// OMP51-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 1
+// OMP51-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2:[0-9]+]])
+// OMP51-NEXT: [[TMP1:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB2]], i32 [[TMP0]], i32 1, i64 40, i64 1, ptr @.omp_task_entry.)
+// OMP51-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP1]], i32 0, i32 0
+// OMP51-NEXT: call void @__kmpc_omp_task_begin_if0(ptr @[[GLOB2]], i32 [[TMP0]], ptr [[TMP1]])
+// OMP51-NEXT: [[TMP3:%.*]] = call i32 @.omp_task_entry.(i32 [[TMP0]], ptr [[TMP1]]) #[[ATTR2:[0-9]+]]
+// OMP51-NEXT: call void @__kmpc_omp_task_complete_if0(ptr @[[GLOB2]], i32 [[TMP0]], ptr [[TMP1]])
+// OMP51-NEXT: ret i32 0
+//
+//
+// OMP51-LABEL: define {{.*}}omp_task_entry{{.*}}{
+// OMP51-NEXT: entry:
+// OMP51-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4
+// OMP51-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4
+// OMP51-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4
+// OMP51-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// OMP51-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTADDR]], align 4
+// OMP51-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// OMP51-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0
+// OMP51-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2
+// OMP51-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0
+// OMP51-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]])
+// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]])
+// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META14:![0-9]+]])
+// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META16:![0-9]+]])
+// OMP51-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !18
+// OMP51-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !18
+// OMP51-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !18
+// OMP51-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !18
+// OMP51-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !18
+// OMP51-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !18
+// OMP51-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !18
+// OMP51-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !18
+// OMP51-NEXT: call void @__kmpc_set_thread_limit(ptr @[[GLOB2]], i32 [[TMP9]], i32 2)
+// OMP51-NEXT: call void @__omp_offloading{{.*}}thread_limit_target_parallel_for_simd{{.*\(.*\).*}}
+// OMP51-NEXT: ret i32 0
+//
diff --git a/clang/test/OpenMP/target_parallel_for_tl_codegen.cpp b/clang/test/OpenMP/target_parallel_for_tl_codegen.cpp
new file mode 100644
index 00000000000000..e6483b704586ef
--- /dev/null
+++ b/clang/test/OpenMP/target_parallel_for_tl_codegen.cpp
@@ -0,0 +1,66 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 2
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck --check-prefix=OMP51 %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=OMP51
+
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+int thread_limit_target_parallel_for() {
+
+// Check that the offloading function is called after setting thread_limit in the task entry function
+#pragma omp target parallel for thread_limit(2)
+ for(int i=0; i<2; i++) {}
+
+ return 0;
+}
+
+#endif
+// OMP51-LABEL: define {{.*}}thread_limit_target_parallel_for{{.*}}{
+// OMP51-NEXT: entry:
+// OMP51-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 1
+// OMP51-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2:[0-9]+]])
+// OMP51-NEXT: [[TMP1:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB2]], i32 [[TMP0]], i32 1, i64 40, i64 1, ptr @.omp_task_entry.)
+// OMP51-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP1]], i32 0, i32 0
+// OMP51-NEXT: call void @__kmpc_omp_task_begin_if0(ptr @[[GLOB2]], i32 [[TMP0]], ptr [[TMP1]])
+// OMP51-NEXT: [[TMP3:%.*]] = call i32 @.omp_task_entry.(i32 [[TMP0]], ptr [[TMP1]]) #[[ATTR2:[0-9]+]]
+// OMP51-NEXT: call void @__kmpc_omp_task_complete_if0(ptr @[[GLOB2]], i32 [[TMP0]], ptr [[TMP1]])
+// OMP51-NEXT: ret i32 0
+//
+//
+// OMP51-LABEL: define {{.*}}omp_task_entry{{.*}}{
+// OMP51-NEXT: entry:
+// OMP51-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4
+// OMP51-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4
+// OMP51-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4
+// OMP51-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// OMP51-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTADDR]], align 4
+// OMP51-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// OMP51-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0
+// OMP51-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2
+// OMP51-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0
+// OMP51-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META5:![0-9]+]])
+// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]])
+// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]])
+// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]])
+// OMP51-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14
+// OMP51-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !14
+// OMP51-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14
+// OMP51-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14
+// OMP51-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !14
+// OMP51-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !14
+// OMP51-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !14
+// OMP51-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14
+// OMP51-NEXT: call void @__kmpc_set_thread_limit(ptr @[[GLOB2]], i32 [[TMP9]], i32 2)
+// OMP51-NEXT: call void @__omp_offloading{{.*}}thread_limit_target_parallel_for{{.*}}
+// OMP51-NEXT: ret i32 0
+//
diff --git a/clang/test/OpenMP/target_parallel_generic_loop_tl_codegen.cpp b/clang/test/OpenMP/target_parallel_generic_loop_tl_codegen.cpp
new file mode 100644
index 00000000000000..32bbb546a05a31
--- /dev/null
+++ b/clang/test/OpenMP/target_parallel_generic_loop_tl_codegen.cpp
@@ -0,0 +1,66 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 2
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck --check-prefix=OMP51 %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=OMP51
+
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+int thread_limit_target_parallel_loop() {
+
+// Check that the offloading function is called after setting thread_limit in the task entry function
+#pragma omp target parallel loop thread_limit(2)
+ for(int i=0; i<2; i++) {}
+
+ return 0;
+}
+
+#endif
+// OMP51-LABEL: define {{.*}}thread_limit_target_parallel_loop{{.*}}{
+// OMP51-NEXT: entry:
+// OMP51-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 1
+// OMP51-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2:[0-9]+]])
+// OMP51-NEXT: [[TMP1:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB2]], i32 [[TMP0]], i32 1, i64 40, i64 1, ptr @.omp_task_entry.)
+// OMP51-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP1]], i32 0, i32 0
+// OMP51-NEXT: call void @__kmpc_omp_task_begin_if0(ptr @[[GLOB2]], i32 [[TMP0]], ptr [[TMP1]])
+// OMP51-NEXT: [[TMP3:%.*]] = call i32 @.omp_task_entry.(i32 [[TMP0]], ptr [[TMP1]]) #[[ATTR2:[0-9]+]]
+// OMP51-NEXT: call void @__kmpc_omp_task_complete_if0(ptr @[[GLOB2]], i32 [[TMP0]], ptr [[TMP1]])
+// OMP51-NEXT: ret i32 0
+//
+//
+// OMP51-LABEL: define {{.*}}omp_task_entry{{.*}}{
+// OMP51-NEXT: entry:
+// OMP51-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4
+// OMP51-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4
+// OMP51-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4
+// OMP51-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// OMP51-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTADDR]], align 4
+// OMP51-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// OMP51-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0
+// OMP51-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2
+// OMP51-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0
+// OMP51-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META5:![0-9]+]])
+// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]])
+// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]])
+// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]])
+// OMP51-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14
+// OMP51-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !14
+// OMP51-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14
+// OMP51-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14
+// OMP51-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !14
+// OMP51-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !14
+// OMP51-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !14
+// OMP51-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14
+// OMP51-NEXT: call void @__kmpc_set_thread_limit(ptr @[[GLOB2]], i32 [[TMP9]], i32 2)
+// OMP51-NEXT: call void @__omp_offloading{{.*}}thread_limit_target_parallel_loop{{.*}}
+// OMP51-NEXT: ret i32 0
+//
diff --git a/clang/test/OpenMP/target_parallel_tl_codegen.cpp b/clang/test/OpenMP/target_parallel_tl_codegen.cpp
new file mode 100644
index 00000000000000..e1ca288bd73298
--- /dev/null
+++ b/clang/test/OpenMP/target_parallel_tl_codegen.cpp
@@ -0,0 +1,66 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 2
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck --check-prefix=OMP51 %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=OMP51
+
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+int thread_limit_target_parallel() {
+
+// Check that the offloading function is called after setting thread_limit in the task entry function
+#pragma omp target parallel thread_limit(2)
+{}
+
+ return 0;
+}
+
+#endif
+// OMP51-LABEL: define {{.*}}thread_limit_target_parallel{{.*}}{
+// OMP51-NEXT: entry:
+// OMP51-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 1
+// OMP51-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]])
+// OMP51-NEXT: [[TMP1:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i64 40, i64 1, ptr @.omp_task_entry.)
+// OMP51-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP1]], i32 0, i32 0
+// OMP51-NEXT: call void @__kmpc_omp_task_begin_if0(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]])
+// OMP51-NEXT: [[TMP3:%.*]] = call i32 @.omp_task_entry.(i32 [[TMP0]], ptr [[TMP1]]) #[[ATTR2:[0-9]+]]
+// OMP51-NEXT: call void @__kmpc_omp_task_complete_if0(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]])
+// OMP51-NEXT: ret i32 0
+//
+//
+// OMP51-LABEL: define {{.*}}omp_task_entry{{.*}}{
+// OMP51-NEXT: entry:
+// OMP51-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4
+// OMP51-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4
+// OMP51-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4
+// OMP51-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// OMP51-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTADDR]], align 4
+// OMP51-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// OMP51-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0
+// OMP51-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2
+// OMP51-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0
+// OMP51-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META5:![0-9]+]])
+// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]])
+// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]])
+// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]])
+// OMP51-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14
+// OMP51-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !14
+// OMP51-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !14
+// OMP51-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !14
+// OMP51-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !14
+// OMP51-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !14
+// OMP51-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !14
+// OMP51-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14
+// OMP51-NEXT: call void @__kmpc_set_thread_limit(ptr @[[GLOB1]], i32 [[TMP9]], i32 2)
+// OMP51-NEXT: call void @__omp_offloading{{.*}}thread_limit_target_parallel{{.*}}
+// OMP51-NEXT: ret i32 0
+//
diff --git a/clang/test/OpenMP/target_simd_tl_codegen.cpp b/clang/test/OpenMP/target_simd_tl_codegen.cpp
new file mode 100644
index 00000000000000..8d6139d055fc54
--- /dev/null
+++ b/clang/test/OpenMP/target_simd_tl_codegen.cpp
@@ -0,0 +1,66 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 2
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck --check-prefix=OMP51 %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fopenmp-version=51 -x c++ -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=OMP51
+
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+int thread_limit_target_simd() {
+
+// Check that the offloading function is called after setting thread_limit in the task entry function
+#pragma omp target simd thread_limit(2)
+ for(int i=0; i<2; i++) {}
+
+ return 0;
+}
+
+#endif
+// OMP51-LABEL: define {{.*}}thread_limit_target_simd{{.*}}{
+// OMP51-NEXT: entry:
+// OMP51-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 1
+// OMP51-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]])
+// OMP51-NEXT: [[TMP1:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i64 40, i64 1, ptr @.omp_task_entry.)
+// OMP51-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP1]], i32 0, i32 0
+// OMP51-NEXT: call void @__kmpc_omp_task_begin_if0(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]])
+// OMP51-NEXT: [[TMP3:%.*]] = call i32 @.omp_task_entry.(i32 [[TMP0]], ptr [[TMP1]]) #[[ATTR2:[0-9]+]]
+// OMP51-NEXT: call void @__kmpc_omp_task_complete_if0(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]])
+// OMP51-NEXT: ret i32 0
+//
+//
+// OMP51-LABEL: define {{.*}}omp_task_entry{{.*}}{
+// OMP51-NEXT: entry:
+// OMP51-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4
+// OMP51-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4
+// OMP51-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// OMP51-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4
+// OMP51-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// OMP51-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTADDR]], align 4
+// OMP51-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// OMP51-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0
+// OMP51-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2
+// OMP51-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0
+// OMP51-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META7:![0-9]+]])
+// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]])
+// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]])
+// OMP51-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META14:![0-9]+]])
+// OMP51-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !16
+// OMP51-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias !16
+// OMP51-NEXT: store ptr null, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias !16
+// OMP51-NEXT: store ptr null, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias !16
+// OMP51-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias !16
+// OMP51-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !16
+// OMP51-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias !16
+// OMP51-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !16
+// OMP51-NEXT: call void @__kmpc_set_thread_limit(ptr @[[GLOB1]], i32 [[TMP9]], i32 2)
+// OMP51-NEXT: call void @__omp_offloading{{.*}}thread_limit_target_simd{{.*}}
+// OMP51-NEXT: ret i32 0
+//
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index 84ed836ff236cf..b6639b67a5c527 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -762,6 +762,7 @@ def OMP_TargetParallel : Directive<"target parallel"> {
VersionedClause<OMPC_NumThreads>,
VersionedClause<OMPC_ProcBind>,
VersionedClause<OMPC_OMPX_DynCGroupMem>,
+ VersionedClause<OMPC_ThreadLimit, 51>,
];
}
def OMP_TargetParallelFor : Directive<"target parallel for"> {
@@ -793,6 +794,7 @@ def OMP_TargetParallelFor : Directive<"target parallel for"> {
];
let allowedOnceClauses = [
VersionedClause<OMPC_OMPX_DynCGroupMem>,
+ VersionedClause<OMPC_ThreadLimit, 51>,
];
}
def OMP_TargetParallelDo : Directive<"target parallel do"> {
@@ -1279,6 +1281,7 @@ def OMP_TargetParallelForSimd : Directive<"target parallel for simd"> {
];
let allowedOnceClauses = [
VersionedClause<OMPC_OMPX_DynCGroupMem>,
+ VersionedClause<OMPC_ThreadLimit, 51>,
];
}
def OMP_TargetParallelDoSimd : Directive<"target parallel do simd"> {
@@ -1342,7 +1345,8 @@ def OMP_TargetSimd : Directive<"target simd"> {
VersionedClause<OMPC_DefaultMap>,
VersionedClause<OMPC_Schedule>,
VersionedClause<OMPC_OMPX_DynCGroupMem>,
- VersionedClause<OMPC_Order, 50>
+ VersionedClause<OMPC_Order, 50>,
+ VersionedClause<OMPC_ThreadLimit, 51>,
];
}
def OMP_TeamsDistribute : Directive<"teams distribute"> {
@@ -2160,6 +2164,7 @@ def OMP_target_parallel_loop : Directive<"target parallel loop"> {
VersionedClause<OMPC_Order>,
VersionedClause<OMPC_ProcBind>,
VersionedClause<OMPC_OMPX_DynCGroupMem>,
+ VersionedClause<OMPC_ThreadLimit, 51>,
];
}
def OMP_Metadirective : Directive<"metadirective"> {
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index aa85b3fa7f2096..c4218326280b2b 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -386,6 +386,7 @@ __OMP_RTL(__kmpc_cancellationpoint, false, Int32, IdentPtr, Int32, Int32)
__OMP_RTL(__kmpc_fork_teams, true, Void, IdentPtr, Int32, ParallelTaskPtr)
__OMP_RTL(__kmpc_push_num_teams, false, Void, IdentPtr, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_set_thread_limit, false, Void, IdentPtr, Int32, Int32)
__OMP_RTL(__kmpc_copyprivate, false, Void, IdentPtr, Int32, SizeTy, VoidPtr,
CopyFunctionPtr, Int32)
@@ -913,6 +914,8 @@ __OMP_RTL_ATTRS(__kmpc_fork_teams, ForkAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, SExt, ReadOnlyPtrAttrs))
__OMP_RTL_ATTRS(__kmpc_push_num_teams, InaccessibleArgOnlyAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, SExt, SExt, SExt))
+__OMP_RTL_ATTRS(__kmpc_set_thread_limit, InaccessibleArgOnlyAttrs, AttributeSet(),
+ ParamAttrs(ReadOnlyPtrAttrs, SExt, SExt))
__OMP_RTL_ATTRS(__kmpc_copyprivate, DefaultAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, SExt, SizeTyExt,
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index a65f34ff3b86d6..33895f8fbb1e36 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -2111,6 +2111,7 @@ typedef struct kmp_internal_control {
int nproc; /* internal control for #threads for next parallel region (per
thread) */
int thread_limit; /* internal control for thread-limit-var */
+ int task_thread_limit; /* internal control for thread-limit-var of a task*/
int max_active_levels; /* internal control for max_active_levels */
kmp_r_sched_t
sched; /* internal control for runtime schedule {sched,chunk} pair */
@@ -3340,6 +3341,7 @@ extern int __kmp_sys_max_nth; /* system-imposed maximum number of threads */
extern int __kmp_max_nth;
// maximum total number of concurrently-existing threads in a contention group
extern int __kmp_cg_max_nth;
+extern int __kmp_task_max_nth; // max threads used in a task
extern int __kmp_teams_max_nth; // max threads used in a teams construct
extern int __kmp_threads_capacity; /* capacity of the arrays __kmp_threads and
__kmp_root */
@@ -4297,6 +4299,8 @@ KMP_EXPORT void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
KMP_EXPORT void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
kmp_int32 num_teams,
kmp_int32 num_threads);
+KMP_EXPORT void __kmpc_set_thread_limit(ident_t *loc, kmp_int32 global_tid,
+ kmp_int32 thread_limit);
/* Function for OpenMP 5.1 num_teams clause */
KMP_EXPORT void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 global_tid,
kmp_int32 num_teams_lb,
diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
index 82836915016696..8bd0e89a7dacd9 100644
--- a/openmp/runtime/src/kmp_csupport.cpp
+++ b/openmp/runtime/src/kmp_csupport.cpp
@@ -381,6 +381,24 @@ void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
__kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
}
+/*!
+ at ingroup PARALLEL
+ at param loc source location information
+ at param global_tid global thread number
+ at param thread_limit limit on number of threads which can be created within the
+current task
+
+Set the thread_limit for the current task
+This call is there to support `thread_limit` clause on the `target` construct
+*/
+void __kmpc_set_thread_limit(ident_t *loc, kmp_int32 global_tid,
+ kmp_int32 thread_limit) {
+ __kmp_assert_valid_gtid(global_tid);
+ kmp_info_t *thread = __kmp_threads[global_tid];
+ if (thread_limit > 0)
+ thread->th.th_current_task->td_icvs.task_thread_limit = thread_limit;
+}
+
/*!
@ingroup PARALLEL
@param loc source location information
diff --git a/openmp/runtime/src/kmp_ftn_entry.h b/openmp/runtime/src/kmp_ftn_entry.h
index d686a889972ec9..ffb01a31fb93ed 100644
--- a/openmp/runtime/src/kmp_ftn_entry.h
+++ b/openmp/runtime/src/kmp_ftn_entry.h
@@ -807,6 +807,10 @@ int FTN_STDCALL KMP_EXPAND_NAME(FTN_GET_THREAD_LIMIT)(void) {
gtid = __kmp_entry_gtid();
thread = __kmp_threads[gtid];
+ // If thread_limit for the target task is defined, return that instead of the
+ // regular task thread_limit
+ if (int thread_limit = thread->th.th_current_task->td_icvs.task_thread_limit)
+ return thread_limit;
return thread->th.th_current_task->td_icvs.thread_limit;
#endif
}
diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp
index c66ab59a01c6fd..48097fb530d1c6 100644
--- a/openmp/runtime/src/kmp_global.cpp
+++ b/openmp/runtime/src/kmp_global.cpp
@@ -125,6 +125,7 @@ size_t __kmp_sys_min_stksize = KMP_MIN_STKSIZE;
int __kmp_sys_max_nth = KMP_MAX_NTH;
int __kmp_max_nth = 0;
int __kmp_cg_max_nth = 0;
+int __kmp_task_max_nth = 0;
int __kmp_teams_max_nth = 0;
int __kmp_threads_capacity = 0;
int __kmp_dflt_team_nth = 0;
diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
index 34f6781e91010a..c8a18e81810cb5 100644
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -1872,6 +1872,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
int nthreads;
int master_active;
int master_set_numthreads;
+ int task_thread_limit = 0;
int level;
int active_level;
int teams_level;
@@ -1910,6 +1911,8 @@ int __kmp_fork_call(ident_t *loc, int gtid,
root = master_th->th.th_root;
master_active = root->r.r_active;
master_set_numthreads = master_th->th.th_set_nproc;
+ task_thread_limit =
+ master_th->th.th_current_task->td_icvs.task_thread_limit;
#if OMPT_SUPPORT
ompt_data_t ompt_parallel_data = ompt_data_none;
@@ -2000,6 +2003,11 @@ int __kmp_fork_call(ident_t *loc, int gtid,
? master_set_numthreads
// TODO: get nproc directly from current task
: get__nproc_2(parent_team, master_tid);
+ // Use the thread_limit set for the current target task if exists, else go
+ // with the deduced nthreads
+ nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
+ ? task_thread_limit
+ : nthreads;
// Check if we need to take forkjoin lock? (no need for serialized
// parallel out of teams construct).
if (nthreads > 1) {
@@ -3291,6 +3299,8 @@ static kmp_internal_control_t __kmp_get_global_icvs(void) {
// next parallel region (per thread)
// (use a max ub on value if __kmp_parallel_initialize not called yet)
__kmp_cg_max_nth, // int thread_limit;
+ __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit
+ // on task. This is used in the case of target thread_limit
__kmp_dflt_max_active_levels, // int max_active_levels; //internal control
// for max_active_levels
r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
diff --git a/openmp/runtime/test/target/target_thread_limit.cpp b/openmp/runtime/test/target/target_thread_limit.cpp
new file mode 100644
index 00000000000000..0cc3307977e974
--- /dev/null
+++ b/openmp/runtime/test/target/target_thread_limit.cpp
@@ -0,0 +1,168 @@
+// RUN: %libomp-cxx-compile -fopenmp-version=51
+// RUN: %libomp-run | FileCheck %s --check-prefix OMP51
+
+#include <stdio.h>
+#include <omp.h>
+
+void foo() {
+#pragma omp parallel num_threads(10)
+ { printf("\ntarget: foo(): parallel num_threads(10)"); }
+}
+
+int main(void) {
+
+ int tl = 4;
+ printf("\nmain: thread_limit = %d", omp_get_thread_limit());
+ // OMP51: main: thread_limit = {{[0-9]+}}
+
+#pragma omp target thread_limit(tl)
+ {
+ printf("\ntarget: thread_limit = %d", omp_get_thread_limit());
+// OMP51: target: thread_limit = 4
+// check whether thread_limit is honoured
+#pragma omp parallel
+ { printf("\ntarget: parallel"); }
+// OMP51: target: parallel
+// OMP51: target: parallel
+// OMP51: target: parallel
+// OMP51: target: parallel
+// OMP51-NOT: target: parallel
+
+// check whether num_threads is honoured
+#pragma omp parallel num_threads(2)
+ { printf("\ntarget: parallel num_threads(2)"); }
+// OMP51: target: parallel num_threads(2)
+// OMP51: target: parallel num_threads(2)
+// OMP51-NOT: target: parallel num_threads(2)
+
+// check whether thread_limit is honoured when there is a conflicting
+// num_threads
+#pragma omp parallel num_threads(10)
+ { printf("\ntarget: parallel num_threads(10)"); }
+ // OMP51: target: parallel num_threads(10)
+ // OMP51: target: parallel num_threads(10)
+ // OMP51: target: parallel num_threads(10)
+ // OMP51: target: parallel num_threads(10)
+ // OMP51-NOT: target: parallel num_threads(10)
+
+ // check whether threads are limited across functions
+ foo();
+ // OMP51: target: foo(): parallel num_threads(10)
+ // OMP51: target: foo(): parallel num_threads(10)
+ // OMP51: target: foo(): parallel num_threads(10)
+ // OMP51: target: foo(): parallel num_threads(10)
+ // OMP51-NOT: target: foo(): parallel num_threads(10)
+
+ // check if user can set num_threads at runtime
+ omp_set_num_threads(2);
+#pragma omp parallel
+ { printf("\ntarget: parallel with omp_set_num_thread(2)"); }
+ // OMP51: target: parallel with omp_set_num_thread(2)
+ // OMP51: target: parallel with omp_set_num_thread(2)
+ // OMP51-NOT: target: parallel with omp_set_num_thread(2)
+
+ // make sure thread_limit is unaffected by omp_set_num_threads
+ printf("\ntarget: thread_limit = %d", omp_get_thread_limit());
+ // OMP51: target: thread_limit = 4
+ }
+
+// checking consecutive target regions with
diff erent thread_limits
+#pragma omp target thread_limit(3)
+ {
+ printf("\nsecond target: thread_limit = %d", omp_get_thread_limit());
+// OMP51: second target: thread_limit = 3
+#pragma omp parallel
+ { printf("\nsecond target: parallel"); }
+ // OMP51: second target: parallel
+ // OMP51: second target: parallel
+ // OMP51: second target: parallel
+ // OMP51-NOT: second target: parallel
+ }
+
+ // confirm that thread_limit's effects are limited to target region
+ printf("\nmain: thread_limit = %d", omp_get_thread_limit());
+ // OMP51: main: thread_limit = {{[0-9]+}}
+#pragma omp parallel num_threads(10)
+ { printf("\nmain: parallel num_threads(10)"); }
+ // OMP51: main: parallel num_threads(10)
+ // OMP51: main: parallel num_threads(10)
+ // OMP51: main: parallel num_threads(10)
+ // OMP51: main: parallel num_threads(10)
+ // OMP51: main: parallel num_threads(10)
+ // OMP51: main: parallel num_threads(10)
+ // OMP51: main: parallel num_threads(10)
+ // OMP51: main: parallel num_threads(10)
+ // OMP51: main: parallel num_threads(10)
+ // OMP51: main: parallel num_threads(10)
+ // OMP51-NOT: main: parallel num_threads(10)
+
+// check combined target directives which support thread_limit
+// target parallel
+#pragma omp target parallel thread_limit(2)
+ printf("\ntarget parallel thread_limit(2)");
+ // OMP51: target parallel thread_limit(2)
+ // OMP51: target parallel thread_limit(2)
+ // OMP51-NOT: target parallel thread_limit(2)
+
+#pragma omp target parallel num_threads(2) thread_limit(3)
+ printf("\ntarget parallel num_threads(2) thread_limit(3)");
+ // OMP51: target parallel num_threads(2) thread_limit(3)
+ // OMP51: target parallel num_threads(2) thread_limit(3)
+ // OMP51-NOT: target parallel num_threads(2) thread_limit(3)
+
+#pragma omp target parallel num_threads(3) thread_limit(2)
+ printf("\ntarget parallel num_threads(3) thread_limit(2)");
+ // OMP51: target parallel num_threads(3) thread_limit(2)
+ // OMP51: target parallel num_threads(3) thread_limit(2)
+ // OMP51-NOT: target parallel num_threads(3) thread_limit(2)
+
+// target parallel for
+#pragma omp target parallel for thread_limit(2)
+ for (int i = 0; i < 5; ++i)
+ printf("\ntarget parallel for thread_limit(2) : thread num = %d",
+ omp_get_thread_num());
+ // OMP51: target parallel for thread_limit(2) : thread num = {{0|1}}
+ // OMP51: target parallel for thread_limit(2) : thread num = {{0|1}}
+ // OMP51: target parallel for thread_limit(2) : thread num = {{0|1}}
+ // OMP51: target parallel for thread_limit(2) : thread num = {{0|1}}
+ // OMP51: target parallel for thread_limit(2) : thread num = {{0|1}}
+ // OMP51-NOT: target parallel for thread_limit(3) : thread num = {{0|1}}
+
+// target parallel for simd
+#pragma omp target parallel for simd thread_limit(2)
+ for (int i = 0; i < 5; ++i)
+ printf("\ntarget parallel for simd thread_limit(2) : thread num = %d",
+ omp_get_thread_num());
+ // OMP51: target parallel for simd thread_limit(2) : thread num = {{0|1}}
+ // OMP51: target parallel for simd thread_limit(2) : thread num = {{0|1}}
+ // OMP51: target parallel for simd thread_limit(2) : thread num = {{0|1}}
+ // OMP51: target parallel for simd thread_limit(2) : thread num = {{0|1}}
+ // OMP51: target parallel for simd thread_limit(2) : thread num = {{0|1}}
+ // OMP51-NOT: target parallel for simd thread_limit(2) : thread num =
+ // {{0|1}}
+
+// target simd
+#pragma omp target simd thread_limit(2)
+ for (int i = 0; i < 5; ++i)
+ printf("\ntarget simd thread_limit(2) : thread num = %d",
+ omp_get_thread_num());
+ // OMP51: target simd thread_limit(2) : thread num = {{0|1}}
+ // OMP51: target simd thread_limit(2) : thread num = {{0|1}}
+ // OMP51: target simd thread_limit(2) : thread num = {{0|1}}
+ // OMP51: target simd thread_limit(2) : thread num = {{0|1}}
+ // OMP51: target simd thread_limit(2) : thread num = {{0|1}}
+ // OMP51-NOT: target simd thread_limit(2) : thread num = {{0|1}}
+
+// target parallel loop
+#pragma omp target parallel loop thread_limit(2)
+ for (int i = 0; i < 5; ++i)
+ printf("\ntarget parallel loop thread_limit(2) : thread num = %d",
+ omp_get_thread_num());
+ // # OMP51: target parallel loop thread_limit(2) : thread num = {{0|1}}
+ // # OMP51: target parallel loop thread_limit(2) : thread num = {{0|1}}
+ // # OMP51: target parallel loop thread_limit(2) : thread num = {{0|1}}
+ // # OMP51: target parallel loop thread_limit(2) : thread num = {{0|1}}
+ // # OMP51: target parallel loop thread_limit(2) : thread num = {{0|1}}
+ // # OMP51-NOT: target parallel loop thread_limit(2) : thread num = {{0|1}}
+ return 0;
+}
More information about the Openmp-commits
mailing list