r292999 - [OpenMP] Support for the num_threads-clause on 'target parallel' on the NVPTX device.
Arpith Chacko Jacob via cfe-commits
cfe-commits at lists.llvm.org
Tue Jan 24 17:18:34 PST 2017
Author: arpith
Date: Tue Jan 24 19:18:34 2017
New Revision: 292999
URL: http://llvm.org/viewvc/llvm-project?rev=292999&view=rev
Log:
[OpenMP] Support for the num_threads-clause on 'target parallel' on the NVPTX device.
This patch adds support for the Spmd construct 'target parallel' on the
NVPTX device. This involves ignoring the num_threads clause on the device
since the number of threads in this combined construct is already set on
the host through the call to __tgt_target_teams().
Reviewers: ABataev
Differential Revision: https://reviews.llvm.org/D29083
Added:
cfe/trunk/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp
Modified:
cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
Modified: cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp?rev=292999&r1=292998&r2=292999&view=diff
==============================================================================
--- cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp (original)
+++ cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp Tue Jan 24 19:18:34 2017
@@ -642,6 +642,17 @@ CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVP
llvm_unreachable("OpenMP NVPTX can only handle device code.");
}
+void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
+ llvm::Value *NumThreads,
+ SourceLocation Loc) {
+ // Do nothing in case of Spmd mode and L0 parallel.
+ // TODO: If in Spmd mode and L1 parallel emit the clause.
+ if (isInSpmdExecutionMode())
+ return;
+
+ CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
+}
+
void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
const Expr *NumTeams,
const Expr *ThreadLimit,
Modified: cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h?rev=292999&r1=292998&r2=292999&view=diff
==============================================================================
--- cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h (original)
+++ cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h Tue Jan 24 19:18:34 2017
@@ -170,6 +170,14 @@ protected:
public:
explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM);
+ /// \brief Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32
+ /// global_tid, kmp_int32 num_threads) to generate code for 'num_threads'
+ /// clause.
+ /// \param NumThreads An integer value of threads.
+ virtual void emitNumThreadsClause(CodeGenFunction &CGF,
+ llvm::Value *NumThreads,
+ SourceLocation Loc) override;
+
/// \brief This function ought to emit, in the general case, a call to
// the openmp runtime kmpc_push_num_teams. In NVPTX backend it is not needed
// as these numbers are obtained through the PTX grid and block configuration.
Added: cfe/trunk/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp?rev=292999&view=auto
==============================================================================
--- cfe/trunk/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp (added)
+++ cfe/trunk/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp Tue Jan 24 19:18:34 2017
@@ -0,0 +1,126 @@
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// Check that the execution mode of all 2 target regions on the gpu is set to SPMD Mode.
+// CHECK-DAG: {{@__omp_offloading_.+l21}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l26}}_exec_mode = weak constant i8 0
+
+template<typename tx>
+tx ftemplate(int n) {
+ tx a = 0;
+ short aa = 0;
+ tx b[10];
+
+ #pragma omp target parallel map(tofrom: aa) num_threads(1024)
+ {
+ aa += 1;
+ }
+
+ #pragma omp target parallel map(tofrom:a, aa, b) if(target: n>40) num_threads(n)
+ {
+ a += 1;
+ aa += 1;
+ b[2] += 1;
+ }
+
+ return a;
+}
+
+int bar(int n){
+ int a = 0;
+
+ a += ftemplate<int>(n);
+
+ return a;
+}
+
+ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l21}}(
+ // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
+ // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align
+ // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
+ // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+ // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
+ // CHECK: br label {{%?}}[[EXEC:.+]]
+ //
+ // CHECK: [[EXEC]]
+ // CHECK-NOT: call void @__kmpc_push_num_threads
+ // CHECK: {{call|invoke}} void [[OP1:@.+]](i32* null, i32* null, i16* [[AA]])
+ // CHECK: br label {{%?}}[[DONE:.+]]
+ //
+ // CHECK: [[DONE]]
+ // CHECK: call void @__kmpc_spmd_kernel_deinit()
+ // CHECK: br label {{%?}}[[EXIT:.+]]
+ //
+ // CHECK: [[EXIT]]
+ // CHECK: ret void
+ // CHECK: }
+
+ // CHECK: define internal void [[OP1]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i16* {{[^%]*}}[[ARG:%.+]])
+ // CHECK: = alloca i32*, align
+ // CHECK: = alloca i32*, align
+ // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
+ // CHECK: store i16* [[ARG]], i16** [[AA_ADDR]], align
+ // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
+ // CHECK: [[VAL:%.+]] = load i16, i16* [[AA]], align
+ // CHECK: store i16 {{%.+}}, i16* [[AA]], align
+ // CHECK: ret void
+ // CHECK: }
+
+
+
+
+
+
+ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}(
+ // CHECK: [[A_ADDR:%.+]] = alloca i32*, align
+ // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
+ // CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align
+ // CHECK: store i32* {{%.+}}, i32** [[A_ADDR]], align
+ // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align
+ // CHECK: store [10 x i32]* {{%.+}}, [10 x i32]** [[B_ADDR]], align
+ // CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align
+ // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
+ // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align
+ // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+ // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
+ // CHECK: br label {{%?}}[[EXEC:.+]]
+ //
+ // CHECK: [[EXEC]]
+ // CHECK-NOT: call void @__kmpc_push_num_threads
+ // CHECK: {{call|invoke}} void [[OP2:@.+]](i32* null, i32* null, i32* [[A]], i16* [[AA]], [10 x i32]* [[B]])
+ // CHECK: br label {{%?}}[[DONE:.+]]
+ //
+ // CHECK: [[DONE]]
+ // CHECK: call void @__kmpc_spmd_kernel_deinit()
+ // CHECK: br label {{%?}}[[EXIT:.+]]
+ //
+ // CHECK: [[EXIT]]
+ // CHECK: ret void
+ // CHECK: }
+
+ // CHECK: define internal void [[OP2]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* {{[^%]*}}[[ARG1:%.+]], i16* {{[^%]*}}[[ARG2:%.+]], [10 x i32]* {{[^%]*}}[[ARG3:%.+]])
+ // CHECK: = alloca i32*, align
+ // CHECK: = alloca i32*, align
+ // CHECK: [[A_ADDR:%.+]] = alloca i32*, align
+ // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
+ // CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align
+ // CHECK: store i32* [[ARG1]], i32** [[A_ADDR]], align
+ // CHECK: store i16* [[ARG2]], i16** [[AA_ADDR]], align
+ // CHECK: store [10 x i32]* [[ARG3]], [10 x i32]** [[B_ADDR]], align
+ // CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align
+ // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
+ // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align
+ // CHECK: store i32 {{%.+}}, i32* [[A]], align
+ // CHECK: store i16 {{%.+}}, i16* [[AA]], align
+ // CHECK: [[ELT:%.+]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]],
+ // CHECK: store i32 {{%.+}}, i32* [[ELT]], align
+ // CHECK: ret void
+ // CHECK: }
+#endif
More information about the cfe-commits
mailing list