[Openmp-commits] [openmp] bc9c4d7 - [OpenMP][FIX] Pass the num_threads value directly to parallel_51
Joseph Huber via Openmp-commits
openmp-commits at lists.llvm.org
Thu Dec 9 13:30:40 PST 2021
Author: Joseph Huber
Date: 2021-12-09T16:30:29-05:00
New Revision: bc9c4d7216a5bcdeec8543834fe1451db2c46e5c
URL: https://github.com/llvm/llvm-project/commit/bc9c4d7216a5bcdeec8543834fe1451db2c46e5c
DIFF: https://github.com/llvm/llvm-project/commit/bc9c4d7216a5bcdeec8543834fe1451db2c46e5c.diff
LOG: [OpenMP][FIX] Pass the num_threads value directly to parallel_51
The problem with the old scheme is that we would need to keep track of
the "next region" and reset the num_threads value after it. The new
runtime doesn't do this, so an assertion is triggered. The old runtime
doesn't do it either; I haven't tested it, but I assume a num_threads
clause could "accidentally" affect multiple parallel regions. Further, in
SPMD mode num_threads was simply ignored, for reasons beyond me.
In any case, parallel_51 is designed to take the clause value directly,
so let's do that instead.
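To make the effect concrete, below is a minimal sketch of the kind of source this
change affects (a hypothetical example in the spirit of the
nvptx_nested_parallel_codegen.cpp test; work() and the constant 2 are only for
illustration). The comments summarize the device codegen before and after this
patch, using the __kmpc_parallel_51 argument order visible in the updated CHECK
lines:

  void work(int *p);                 // assumed helper, only for illustration

  void foo(void) {
    int c = 0;
  #pragma omp target map(tofrom : c)
    {
      // Before: __kmpc_push_num_threads(loc, gtid, 2) followed by
      //         __kmpc_parallel_51(loc, gtid, if_expr, /*num_threads=*/-1, ...)
      // After:  no push; the clause value is passed directly:
      //         __kmpc_parallel_51(loc, gtid, if_expr, /*num_threads=*/2, ...)
  #pragma omp parallel num_threads(2)
      work(&c);
    }
  }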
Reviewed By: tianshilei1992
Differential Revision: https://reviews.llvm.org/D113623
Added:
Modified:
clang/lib/CodeGen/CGOpenMPRuntime.cpp
clang/lib/CodeGen/CGOpenMPRuntime.h
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
clang/lib/CodeGen/CGStmtOpenMP.cpp
clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp
clang/test/OpenMP/nvptx_target_codegen.cpp
clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp
openmp/libomptarget/DeviceRTL/include/Interface.h
openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
openmp/libomptarget/deviceRTLs/common/omptarget.h
openmp/libomptarget/deviceRTLs/common/omptargeti.h
openmp/libomptarget/deviceRTLs/common/src/parallel.cu
openmp/libomptarget/deviceRTLs/interface.h
Removed:
################################################################################
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index c3a01448389b3..280cf853ab77c 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -2085,7 +2085,8 @@ void CGOpenMPRuntime::emitIfClause(CodeGenFunction &CGF, const Expr *Cond,
void CGOpenMPRuntime::emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
llvm::Function *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars,
- const Expr *IfCond) {
+ const Expr *IfCond,
+ llvm::Value *NumThreads) {
if (!CGF.HaveInsertPoint())
return;
llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
@@ -12890,7 +12891,8 @@ void CGOpenMPSIMDRuntime::emitParallelCall(CodeGenFunction &CGF,
SourceLocation Loc,
llvm::Function *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars,
- const Expr *IfCond) {
+ const Expr *IfCond,
+ llvm::Value *NumThreads) {
llvm_unreachable("Not supported in SIMD-only mode");
}
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
index 527a23a8af6a8..95704226f9b20 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -1015,11 +1015,13 @@ class CGOpenMPRuntime {
/// variables used in \a OutlinedFn function.
/// \param IfCond Condition in the associated 'if' clause, if it was
/// specified, nullptr otherwise.
+ /// \param NumThreads The value corresponding to the num_threads clause, if
+ /// any, or nullptr.
///
virtual void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
llvm::Function *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars,
- const Expr *IfCond);
+ const Expr *IfCond, llvm::Value *NumThreads);
/// Emits a critical region.
/// \param CriticalName Name of the critical region.
@@ -1991,11 +1993,13 @@ class CGOpenMPSIMDRuntime final : public CGOpenMPRuntime {
/// variables used in \a OutlinedFn function.
/// \param IfCond Condition in the associated 'if' clause, if it was
/// specified, nullptr otherwise.
+ /// \param NumThreads The value corresponding to the num_threads clause, if
+ /// any, or nullptr.
///
void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
llvm::Function *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars,
- const Expr *IfCond) override;
+ const Expr *IfCond, llvm::Value *NumThreads) override;
/// Emits a critical region.
/// \param CriticalName Name of the critical region.
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index dcb224f331561..fb4061b7ce9e9 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1221,11 +1221,7 @@ void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
llvm::Value *NumThreads,
SourceLocation Loc) {
- // Do nothing in case of SPMD mode and L0 parallel.
- if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
- return;
-
- CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
+ // Nothing to do.
}
void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
@@ -1510,13 +1506,16 @@ void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
SourceLocation Loc,
llvm::Function *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars,
- const Expr *IfCond) {
+ const Expr *IfCond,
+ llvm::Value *NumThreads) {
if (!CGF.HaveInsertPoint())
return;
- auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars,
- IfCond](CodeGenFunction &CGF, PrePostActionTy &Action) {
+ auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
+ NumThreads](CodeGenFunction &CGF,
+ PrePostActionTy &Action) {
CGBuilderTy &Bld = CGF.Builder;
+ llvm::Value *NumThreadsVal = NumThreads;
llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
if (WFn)
@@ -1556,13 +1555,18 @@ void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
else
IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
- assert(IfCondVal && "Expected a value");
+ if (!NumThreadsVal)
+ NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
+ else
+ NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty),
+
+ assert(IfCondVal && "Expected a value");
llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
llvm::Value *Args[] = {
RTLoc,
getThreadID(CGF, Loc),
IfCondVal,
- llvm::ConstantInt::get(CGF.Int32Ty, -1),
+ NumThreadsVal,
llvm::ConstantInt::get(CGF.Int32Ty, -1),
FnPtr,
ID,
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
index ac51264d7685b..1d30c5061743a 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -257,10 +257,13 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
/// variables used in \a OutlinedFn function.
/// \param IfCond Condition in the associated 'if' clause, if it was
/// specified, nullptr otherwise.
+ /// \param NumThreads The value corresponding to the num_threads clause, if
+ /// any,
+ /// or nullptr.
void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
llvm::Function *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars,
- const Expr *IfCond) override;
+ const Expr *IfCond, llvm::Value *NumThreads) override;
/// Emit an implicit/explicit barrier for OpenMP threads.
/// \param Kind Directive for which this implicit barrier call must be
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 6f0ef7bb7fe52..2509de486671a 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -1557,14 +1557,14 @@ static void emitCommonOMPParallelDirective(
OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen,
const CodeGenBoundParametersTy &CodeGenBoundParameters) {
const CapturedStmt *CS = S.getCapturedStmt(OMPD_parallel);
+ llvm::Value *NumThreads = nullptr;
llvm::Function *OutlinedFn =
CGF.CGM.getOpenMPRuntime().emitParallelOutlinedFunction(
S, *CS->getCapturedDecl()->param_begin(), InnermostKind, CodeGen);
if (const auto *NumThreadsClause = S.getSingleClause<OMPNumThreadsClause>()) {
CodeGenFunction::RunCleanupsScope NumThreadsScope(CGF);
- llvm::Value *NumThreads =
- CGF.EmitScalarExpr(NumThreadsClause->getNumThreads(),
- /*IgnoreResultAssign=*/true);
+ NumThreads = CGF.EmitScalarExpr(NumThreadsClause->getNumThreads(),
+ /*IgnoreResultAssign=*/true);
CGF.CGM.getOpenMPRuntime().emitNumThreadsClause(
CGF, NumThreads, NumThreadsClause->getBeginLoc());
}
@@ -1591,7 +1591,7 @@ static void emitCommonOMPParallelDirective(
CodeGenBoundParameters(CGF, S, CapturedVars);
CGF.GenerateOpenMPCapturedVars(*CS, CapturedVars);
CGF.CGM.getOpenMPRuntime().emitParallelCall(CGF, S.getBeginLoc(), OutlinedFn,
- CapturedVars, IfCond);
+ CapturedVars, IfCond, NumThreads);
}
static bool isAllocatableDecl(const VarDecl *VD) {
diff --git a/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp
index 6c70478dbc0c6..95c47ee16cc3b 100644
--- a/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp
@@ -46,12 +46,11 @@ int main() {
// CHECK1: user_code.entry:
// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK1-NEXT: call void @_Z3usePi(i32* [[TMP0]]) #[[ATTR6:[0-9]+]]
-// CHECK1-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 2)
// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP0]] to i8*
// CHECK1-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 8
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP5]], i64 1)
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 2, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP5]], i64 1)
// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK1-NEXT: ret void
// CHECK1: worker.exit:
@@ -65,12 +64,11 @@ int main() {
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
// CHECK1-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 8
-// CHECK1-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 2)
// CHECK1-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
// CHECK1-NEXT: [[TMP2:%.*]] = bitcast i32** [[C_ADDR]] to i8*
// CHECK1-NEXT: store i8* [[TMP2]], i8** [[TMP1]], align 8
// CHECK1-NEXT: [[TMP3:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP3]], i64 1)
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 2, i32 -1, i8* bitcast (void (i32*, i32*, i32**)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP3]], i64 1)
// CHECK1-NEXT: ret void
//
//
@@ -179,12 +177,11 @@ int main() {
// CHECK2: user_code.entry:
// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK2-NEXT: call void @_Z3usePi(i32* [[TMP0]]) #[[ATTR6:[0-9]+]]
-// CHECK2-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 2)
// CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP0]] to i8*
// CHECK2-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP5]], i32 1)
+// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 2, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP5]], i32 1)
// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK2-NEXT: ret void
// CHECK2: worker.exit:
@@ -198,12 +195,11 @@ int main() {
// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
// CHECK2-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 4
-// CHECK2-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 2)
// CHECK2-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
// CHECK2-NEXT: [[TMP2:%.*]] = bitcast i32** [[C_ADDR]] to i8*
// CHECK2-NEXT: store i8* [[TMP2]], i8** [[TMP1]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP3]], i32 1)
+// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 2, i32 -1, i8* bitcast (void (i32*, i32*, i32**)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP3]], i32 1)
// CHECK2-NEXT: ret void
//
//
@@ -312,12 +308,11 @@ int main() {
// CHECK3: user_code.entry:
// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK3-NEXT: call void @_Z3usePi(i32* [[TMP0]]) #[[ATTR6:[0-9]+]]
-// CHECK3-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 2)
// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP0]] to i8*
// CHECK3-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP5]], i32 1)
+// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 2, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP5]], i32 1)
// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK3-NEXT: ret void
// CHECK3: worker.exit:
@@ -331,12 +326,11 @@ int main() {
// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
// CHECK3-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 4
-// CHECK3-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 2)
// CHECK3-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
// CHECK3-NEXT: [[TMP2:%.*]] = bitcast i32** [[C_ADDR]] to i8*
// CHECK3-NEXT: store i8* [[TMP2]], i8** [[TMP1]], align 4
// CHECK3-NEXT: [[TMP3:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP3]], i32 1)
+// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 2, i32 -1, i8* bitcast (void (i32*, i32*, i32**)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP3]], i32 1)
// CHECK3-NEXT: ret void
//
//
diff --git a/clang/test/OpenMP/nvptx_target_codegen.cpp b/clang/test/OpenMP/nvptx_target_codegen.cpp
index b3f0746c1eae0..34fc9d94501fc 100644
--- a/clang/test/OpenMP/nvptx_target_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_codegen.cpp
@@ -165,7 +165,7 @@ void unreachable_call() {
// CHECK1-NEXT: [[TMP6:%.*]] = bitcast i32** [[TMP0]] to i8*
// CHECK1-NEXT: store i8* [[TMP6]], i8** [[TMP5]], align 8
// CHECK1-NEXT: [[TMP7:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**, i32**)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP7]], i64 2)
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 2, i32 -1, i8* bitcast (void (i32*, i32*, i32**, i32**)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP7]], i64 2)
// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true)
// CHECK1-NEXT: ret void
// CHECK1: worker.exit:
@@ -301,7 +301,7 @@ void unreachable_call() {
// CHECK1-NEXT: [[ADD20:%.*]] = add nsw i32 [[CONV19]], 1
// CHECK1-NEXT: [[CONV21:%.*]] = trunc i32 [[ADD20]] to i8
// CHECK1-NEXT: store i8 [[CONV21]], i8* [[Y]], align 8
-// CHECK1-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(8) i64* @_ZN2TTIxcEixEi(%struct.TT* nonnull align 8 dereferenceable(16) [[TMP7]], i32 0) #[[ATTR6:[0-9]+]]
+// CHECK1-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(8) i64* @_ZN2TTIxcEixEi(%struct.TT* nonnull align 8 dereferenceable(16) [[TMP7]], i32 0) #[[ATTR8:[0-9]+]]
// CHECK1-NEXT: [[TMP17:%.*]] = load i64, i64* [[CALL]], align 8
// CHECK1-NEXT: [[ADD22:%.*]] = add nsw i64 [[TMP17]], 1
// CHECK1-NEXT: store i64 [[ADD22]], i64* [[CALL]], align 8
@@ -312,7 +312,7 @@ void unreachable_call() {
//
//
// CHECK1-LABEL: define {{[^@]+}}@_ZN2TTIxcEixEi
-// CHECK1-SAME: (%struct.TT* nonnull align 8 dereferenceable(16) [[THIS:%.*]], i32 [[I:%.*]]) #[[ATTR3:[0-9]+]] comdat align 2 {
+// CHECK1-SAME: (%struct.TT* nonnull align 8 dereferenceable(16) [[THIS:%.*]], i32 [[I:%.*]]) #[[ATTR4:[0-9]+]] comdat align 2 {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.TT*, align 8
// CHECK1-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4
@@ -405,7 +405,7 @@ void unreachable_call() {
// CHECK1-NEXT: [[TMP8:%.*]] = load double, double* [[A7]], align 8
// CHECK1-NEXT: [[CONV8:%.*]] = fptosi double [[TMP8]] to i32
// CHECK1-NEXT: [[A9:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0
-// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3baziRd(i32 [[CONV8]], double* nonnull align 8 dereferenceable(8) [[A9]]) #[[ATTR6]]
+// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3baziRd(i32 [[CONV8]], double* nonnull align 8 dereferenceable(8) [[A9]]) #[[ATTR8]]
// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK1-NEXT: ret void
// CHECK1: worker.exit:
@@ -413,7 +413,7 @@ void unreachable_call() {
//
//
// CHECK1-LABEL: define {{[^@]+}}@_Z3baziRd
-// CHECK1-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR3]] {
+// CHECK1-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR4]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca double*, align 8
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
@@ -443,7 +443,7 @@ void unreachable_call() {
// CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK1: user_code.entry:
-// CHECK1-NEXT: call void @_Z6asserti(i32 0) #[[ATTR7:[0-9]+]]
+// CHECK1-NEXT: call void @_Z6asserti(i32 0) #[[ATTR9:[0-9]+]]
// CHECK1-NEXT: unreachable
// CHECK1: worker.exit:
// CHECK1-NEXT: ret void
@@ -510,7 +510,7 @@ void unreachable_call() {
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
-// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR5:[0-9]+]] {
+// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR7:[0-9]+]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
@@ -552,7 +552,7 @@ void unreachable_call() {
// CHECK2-NEXT: [[TMP6:%.*]] = bitcast i32** [[TMP0]] to i8*
// CHECK2-NEXT: store i8* [[TMP6]], i8** [[TMP5]], align 4
// CHECK2-NEXT: [[TMP7:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**, i32**)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP7]], i32 2)
+// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 2, i32 -1, i8* bitcast (void (i32*, i32*, i32**, i32**)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP7]], i32 2)
// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true)
// CHECK2-NEXT: ret void
// CHECK2: worker.exit:
@@ -687,7 +687,7 @@ void unreachable_call() {
// CHECK2-NEXT: [[ADD19:%.*]] = add nsw i32 [[CONV18]], 1
// CHECK2-NEXT: [[CONV20:%.*]] = trunc i32 [[ADD19]] to i8
// CHECK2-NEXT: store i8 [[CONV20]], i8* [[Y]], align 8
-// CHECK2-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(8) i64* @_ZN2TTIxcEixEi(%struct.TT* nonnull align 8 dereferenceable(16) [[TMP7]], i32 0) #[[ATTR6:[0-9]+]]
+// CHECK2-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(8) i64* @_ZN2TTIxcEixEi(%struct.TT* nonnull align 8 dereferenceable(16) [[TMP7]], i32 0) #[[ATTR8:[0-9]+]]
// CHECK2-NEXT: [[TMP17:%.*]] = load i64, i64* [[CALL]], align 8
// CHECK2-NEXT: [[ADD21:%.*]] = add nsw i64 [[TMP17]], 1
// CHECK2-NEXT: store i64 [[ADD21]], i64* [[CALL]], align 8
@@ -698,7 +698,7 @@ void unreachable_call() {
//
//
// CHECK2-LABEL: define {{[^@]+}}@_ZN2TTIxcEixEi
-// CHECK2-SAME: (%struct.TT* nonnull align 8 dereferenceable(16) [[THIS:%.*]], i32 [[I:%.*]]) #[[ATTR3:[0-9]+]] comdat align 2 {
+// CHECK2-SAME: (%struct.TT* nonnull align 8 dereferenceable(16) [[THIS:%.*]], i32 [[I:%.*]]) #[[ATTR4:[0-9]+]] comdat align 2 {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.TT*, align 4
// CHECK2-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4
@@ -789,7 +789,7 @@ void unreachable_call() {
// CHECK2-NEXT: [[TMP8:%.*]] = load double, double* [[A6]], align 8
// CHECK2-NEXT: [[CONV7:%.*]] = fptosi double [[TMP8]] to i32
// CHECK2-NEXT: [[A8:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0
-// CHECK2-NEXT: [[CALL:%.*]] = call i32 @_Z3baziRd(i32 [[CONV7]], double* nonnull align 8 dereferenceable(8) [[A8]]) #[[ATTR6]]
+// CHECK2-NEXT: [[CALL:%.*]] = call i32 @_Z3baziRd(i32 [[CONV7]], double* nonnull align 8 dereferenceable(8) [[A8]]) #[[ATTR8]]
// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK2-NEXT: ret void
// CHECK2: worker.exit:
@@ -797,7 +797,7 @@ void unreachable_call() {
//
//
// CHECK2-LABEL: define {{[^@]+}}@_Z3baziRd
-// CHECK2-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR3]] {
+// CHECK2-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR4]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca double*, align 4
// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
@@ -827,7 +827,7 @@ void unreachable_call() {
// CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK2: user_code.entry:
-// CHECK2-NEXT: call void @_Z6asserti(i32 0) #[[ATTR7:[0-9]+]]
+// CHECK2-NEXT: call void @_Z6asserti(i32 0) #[[ATTR9:[0-9]+]]
// CHECK2-NEXT: unreachable
// CHECK2: worker.exit:
// CHECK2-NEXT: ret void
@@ -893,7 +893,7 @@ void unreachable_call() {
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
-// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR5:[0-9]+]] {
+// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR7:[0-9]+]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
@@ -935,7 +935,7 @@ void unreachable_call() {
// CHECK3-NEXT: [[TMP6:%.*]] = bitcast i32** [[TMP0]] to i8*
// CHECK3-NEXT: store i8* [[TMP6]], i8** [[TMP5]], align 4
// CHECK3-NEXT: [[TMP7:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**, i32**)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP7]], i32 2)
+// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 2, i32 -1, i8* bitcast (void (i32*, i32*, i32**, i32**)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP7]], i32 2)
// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true)
// CHECK3-NEXT: ret void
// CHECK3: worker.exit:
@@ -1070,7 +1070,7 @@ void unreachable_call() {
// CHECK3-NEXT: [[ADD19:%.*]] = add nsw i32 [[CONV18]], 1
// CHECK3-NEXT: [[CONV20:%.*]] = trunc i32 [[ADD19]] to i8
// CHECK3-NEXT: store i8 [[CONV20]], i8* [[Y]], align 8
-// CHECK3-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(8) i64* @_ZN2TTIxcEixEi(%struct.TT* nonnull align 8 dereferenceable(16) [[TMP7]], i32 0) #[[ATTR6:[0-9]+]]
+// CHECK3-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(8) i64* @_ZN2TTIxcEixEi(%struct.TT* nonnull align 8 dereferenceable(16) [[TMP7]], i32 0) #[[ATTR8:[0-9]+]]
// CHECK3-NEXT: [[TMP17:%.*]] = load i64, i64* [[CALL]], align 8
// CHECK3-NEXT: [[ADD21:%.*]] = add nsw i64 [[TMP17]], 1
// CHECK3-NEXT: store i64 [[ADD21]], i64* [[CALL]], align 8
@@ -1081,7 +1081,7 @@ void unreachable_call() {
//
//
// CHECK3-LABEL: define {{[^@]+}}@_ZN2TTIxcEixEi
-// CHECK3-SAME: (%struct.TT* nonnull align 8 dereferenceable(16) [[THIS:%.*]], i32 [[I:%.*]]) #[[ATTR3:[0-9]+]] comdat align 2 {
+// CHECK3-SAME: (%struct.TT* nonnull align 8 dereferenceable(16) [[THIS:%.*]], i32 [[I:%.*]]) #[[ATTR4:[0-9]+]] comdat align 2 {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.TT*, align 4
// CHECK3-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4
@@ -1172,7 +1172,7 @@ void unreachable_call() {
// CHECK3-NEXT: [[TMP8:%.*]] = load double, double* [[A6]], align 8
// CHECK3-NEXT: [[CONV7:%.*]] = fptosi double [[TMP8]] to i32
// CHECK3-NEXT: [[A8:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0
-// CHECK3-NEXT: [[CALL:%.*]] = call i32 @_Z3baziRd(i32 [[CONV7]], double* nonnull align 8 dereferenceable(8) [[A8]]) #[[ATTR6]]
+// CHECK3-NEXT: [[CALL:%.*]] = call i32 @_Z3baziRd(i32 [[CONV7]], double* nonnull align 8 dereferenceable(8) [[A8]]) #[[ATTR8]]
// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
// CHECK3-NEXT: ret void
// CHECK3: worker.exit:
@@ -1180,7 +1180,7 @@ void unreachable_call() {
//
//
// CHECK3-LABEL: define {{[^@]+}}@_Z3baziRd
-// CHECK3-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR3]] {
+// CHECK3-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR4]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca double*, align 4
// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
@@ -1210,7 +1210,7 @@ void unreachable_call() {
// CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
// CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK3: user_code.entry:
-// CHECK3-NEXT: call void @_Z6asserti(i32 0) #[[ATTR7:[0-9]+]]
+// CHECK3-NEXT: call void @_Z6asserti(i32 0) #[[ATTR9:[0-9]+]]
// CHECK3-NEXT: unreachable
// CHECK3: worker.exit:
// CHECK3-NEXT: ret void
@@ -1276,7 +1276,7 @@ void unreachable_call() {
//
//
// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
-// CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR5:[0-9]+]] {
+// CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR7:[0-9]+]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
diff --git a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp
index fb1e427cfed72..5049500516f6b 100644
--- a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp
@@ -62,7 +62,7 @@ int bar(int n){
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP0]] to i8*
// CHECK1-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 8
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i64 1)
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 1024, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i64 1)
// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true)
// CHECK1-NEXT: ret void
// CHECK1: worker.exit:
@@ -119,7 +119,7 @@ int bar(int n){
// CHECK1-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8*
// CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 8
// CHECK1-NEXT: [[TMP12:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i64 3)
+// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 [[TMP5]], i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i64 3)
// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true)
// CHECK1-NEXT: ret void
// CHECK1: worker.exit:
@@ -173,7 +173,7 @@ int bar(int n){
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP0]] to i8*
// CHECK2-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1)
+// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 1024, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1)
// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true)
// CHECK2-NEXT: ret void
// CHECK2: worker.exit:
@@ -229,7 +229,7 @@ int bar(int n){
// CHECK2-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8*
// CHECK2-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4
// CHECK2-NEXT: [[TMP12:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i32 3)
+// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 [[TMP5]], i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i32 3)
// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true)
// CHECK2-NEXT: ret void
// CHECK2: worker.exit:
@@ -283,7 +283,7 @@ int bar(int n){
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP0]] to i8*
// CHECK3-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1)
+// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 1024, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1)
// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true)
// CHECK3-NEXT: ret void
// CHECK3: worker.exit:
@@ -339,7 +339,7 @@ int bar(int n){
// CHECK3-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8*
// CHECK3-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4
// CHECK3-NEXT: [[TMP12:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i32 3)
+// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 [[TMP5]], i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i32 3)
// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true)
// CHECK3-NEXT: ret void
// CHECK3: worker.exit:
@@ -393,7 +393,7 @@ int bar(int n){
// CHECK4-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP0]] to i8*
// CHECK4-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 8
// CHECK4-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i64 1)
+// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 1024, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i64 1)
// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true)
// CHECK4-NEXT: ret void
// CHECK4: worker.exit:
@@ -450,7 +450,7 @@ int bar(int n){
// CHECK4-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8*
// CHECK4-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 8
// CHECK4-NEXT: [[TMP12:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i64 3)
+// CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 [[TMP5]], i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i64 3)
// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true)
// CHECK4-NEXT: ret void
// CHECK4: worker.exit:
@@ -504,7 +504,7 @@ int bar(int n){
// CHECK5-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP0]] to i8*
// CHECK5-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4
// CHECK5-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1)
+// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 1024, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1)
// CHECK5-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true)
// CHECK5-NEXT: ret void
// CHECK5: worker.exit:
@@ -560,7 +560,7 @@ int bar(int n){
// CHECK5-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8*
// CHECK5-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4
// CHECK5-NEXT: [[TMP12:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i32 3)
+// CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 [[TMP5]], i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i32 3)
// CHECK5-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true)
// CHECK5-NEXT: ret void
// CHECK5: worker.exit:
@@ -614,7 +614,7 @@ int bar(int n){
// CHECK6-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP0]] to i8*
// CHECK6-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4
// CHECK6-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1)
+// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 1024, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1)
// CHECK6-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true)
// CHECK6-NEXT: ret void
// CHECK6: worker.exit:
@@ -670,7 +670,7 @@ int bar(int n){
// CHECK6-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[TMP2]] to i8*
// CHECK6-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4
// CHECK6-NEXT: [[TMP12:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i32 3)
+// CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 [[TMP5]], i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i32 3)
// CHECK6-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true)
// CHECK6-NEXT: ret void
// CHECK6: worker.exit:
diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h
index 7cfe7b5bc0571..9ef9b823f9e2d 100644
--- a/openmp/libomptarget/DeviceRTL/include/Interface.h
+++ b/openmp/libomptarget/DeviceRTL/include/Interface.h
@@ -295,8 +295,6 @@ void __kmpc_push_num_teams(IdentTy *Loc, int32_t TId, int32_t NumTeams,
/// TODO
uint16_t __kmpc_parallel_level(IdentTy *Loc, uint32_t);
-/// TODO
-void __kmpc_push_num_threads(IdentTy *Loc, int32_t, int32_t NumThreads);
///}
/// Tasking
diff --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
index ae7df3f95b7ae..610512a5f799b 100644
--- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
@@ -212,11 +212,6 @@ int32_t __kmpc_global_thread_num(IdentTy *) {
return omp_get_thread_num();
}
-void __kmpc_push_num_threads(IdentTy *, int32_t, int32_t NumThreads) {
- FunctionTracingRAII();
- icv::NThreads = NumThreads;
-}
-
void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,
int32_t thread_limit) {
FunctionTracingRAII();
diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h
index d4f8678ee1b35..417c22d607d5e 100644
--- a/openmp/libomptarget/deviceRTLs/common/omptarget.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h
@@ -181,10 +181,6 @@ class omptarget_nvptx_ThreadPrivateContext {
topTaskDescr[tid] = taskICV;
}
INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const;
- // parallel
- INLINE uint16_t &NumThreadsForNextParallel(int tid) {
- return nextRegion.tnum[tid];
- }
// schedule (for dispatch)
INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
@@ -204,11 +200,6 @@ class omptarget_nvptx_ThreadPrivateContext {
omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM];
// pointer where to find the current task ICV (top of the stack)
omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM];
- union {
- // Only one of the two is live at the same time.
- // parallel
- uint16_t tnum[MAX_THREADS_PER_TEAM];
- } nextRegion;
// schedule (for dispatch)
kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for
int64_t chunk[MAX_THREADS_PER_TEAM];
diff --git a/openmp/libomptarget/deviceRTLs/common/omptargeti.h b/openmp/libomptarget/deviceRTLs/common/omptargeti.h
index 485e30cc8d72c..93831c8952739 100644
--- a/openmp/libomptarget/deviceRTLs/common/omptargeti.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptargeti.h
@@ -158,8 +158,6 @@ omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) {
// levelOneTaskDescr is init when starting the parallel region
// top task descr is NULL (team master version will be fixed separately)
topTaskDescr[tid] = NULL;
- // no num threads value has been pushed
- nextRegion.tnum[tid] = 0;
// the following don't need to be init here; they are init when using dyn
// sched
// current_Event, events_Number, chunk, num_Iterations, schedule
diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
index c064bd5889388..4b27529473dcc 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu
@@ -73,7 +73,8 @@ INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
}
// This routine is always called by the team master..
-EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn) {
+EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
+ kmp_int32 NumThreadsClause) {
PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");
omptarget_nvptx_workFn = WorkFn;
@@ -92,9 +93,6 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn) {
return;
}
- uint16_t &NumThreadsClause =
- omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
-
uint16_t NumThreads =
determineNumberOfThreads(NumThreadsClause, nThreads, threadLimit);
@@ -255,16 +253,6 @@ EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
// push params
////////////////////////////////////////////////////////////////////////////////
-EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid,
- int32_t num_threads) {
- PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
- ASSERT0(LT_FUSSY, isRuntimeInitialized(),
- "Runtime must be initialized.");
- tid = GetLogicalThreadIdInBlock();
- omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
- num_threads;
-}
-
// Do nothing. The host guarantees we started the requested number of
// teams and we only need inspection of gridDim.
@@ -304,11 +292,7 @@ NOINLINE EXTERN void __kmpc_parallel_51(kmp_Ident *ident, kmp_int32 global_tid,
return;
}
- // Handle the num_threads clause.
- if (num_threads != -1)
- __kmpc_push_num_threads(ident, global_tid, num_threads);
-
- __kmpc_kernel_prepare_parallel((void *)wrapper_fn);
+ __kmpc_kernel_prepare_parallel((void *)wrapper_fn, num_threads);
if (nargs) {
void **GlobalArgs;
diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h
index 00aa07c01419c..b9b537cf1d060 100644
--- a/openmp/libomptarget/deviceRTLs/interface.h
+++ b/openmp/libomptarget/deviceRTLs/interface.h
@@ -220,8 +220,6 @@ typedef int32_t kmp_CriticalName[8];
// parallel
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc);
-EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t global_tid,
- int32_t num_threads);
NOINLINE EXTERN uint8_t __kmpc_parallel_level();
// proc bind
@@ -445,7 +443,8 @@ EXTERN int32_t __kmpc_target_init(ident_t *Ident, int8_t Mode,
bool RequiresFullRuntime);
EXTERN void __kmpc_target_deinit(ident_t *Ident, int8_t Mode,
bool RequiresFullRuntime);
-EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn);
+EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
+ int32_t NumThreadsClause);
EXTERN bool __kmpc_kernel_parallel(void **WorkFn);
EXTERN void __kmpc_kernel_end_parallel();