r292428 - [OpenMP] Codegen for the 'target parallel' directive on the NVPTX device.

Wed Jan 18 11:35:01 PST 2017

Author: arpith
Date: Wed Jan 18 13:35:00 2017
New Revision: 292428

URL: http://llvm.org/viewvc/llvm-project?rev=292428&view=rev
Log:
[OpenMP] Codegen for the 'target parallel' directive on the NVPTX device.

This patch adds codegen for the 'target parallel' directive on the NVPTX
device.  We term offload OpenMP directives such as 'target parallel' and
'target teams distribute parallel for' as SPMD constructs.  SPMD constructs,
in contrast to Generic ones like the plain 'target', can never contain
a serial region.

SPMD constructs can be handled more efficiently on the GPU and do not
require the Warp Loop of the Generic codegen scheme. This patch adds
SPMD codegen support for 'target parallel' on the NVPTX device and can
be reused for other SPMD constructs.

Reviewers: ABataev
Differential Revision: https://reviews.llvm.org/D28755

Added:
    cfe/trunk/test/OpenMP/nvptx_target_parallel_codegen.cpp
Modified:
    cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
    cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
    cfe/trunk/test/OpenMP/nvptx_target_codegen.cpp

Modified: cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp?rev=292428&r1=292427&r2=292428&view=diff
==============================================================================

--- cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp (original)
+++ cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp Wed Jan 18 13:35:00 2017
@@ -26,6 +26,11 @@ enum OpenMPRTLFunctionNVPTX {
   OMPRTL_NVPTX__kmpc_kernel_init,
   /// \brief Call to void __kmpc_kernel_deinit();
   OMPRTL_NVPTX__kmpc_kernel_deinit,
+  /// \brief Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
+  /// short RequiresOMPRuntime, short RequiresDataSharing);
+  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
+  /// \brief Call to void __kmpc_spmd_kernel_deinit();
+  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
   /// \brief Call to void __kmpc_kernel_prepare_parallel(void
   /// *outlined_function);
   OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
@@ -76,6 +81,25 @@ public:
     CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
   }
 };
+
+// A class to track the execution mode when codegening directives within
+// a target region. The appropriate mode (generic/spmd) is set on entry
+// to the target region and used by containing directives such as 'parallel'
+// to emit optimized code.
+class ExecutionModeRAII {
+private:
+  CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
+  CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;
+
+public:
+  ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode,
+                    CGOpenMPRuntimeNVPTX::ExecutionMode NewMode)
+      : Mode(Mode) {
+    SavedMode = Mode;
+    Mode = NewMode;
+  }
+  ~ExecutionModeRAII() { Mode = SavedMode; }
+};
 } // anonymous namespace
 
 /// Get the GPU warp size.
@@ -116,12 +140,17 @@ static void getNVPTXCTABarrier(CodeGenFu
 static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }
 
 /// Get the value of the thread_limit clause in the teams directive.
-/// The runtime encodes thread_limit in the launch parameter, always starting
-/// thread_limit+warpSize threads per team.
-static llvm::Value *getThreadLimit(CodeGenFunction &CGF) {
+/// For the 'generic' execution mode, the runtime encodes thread_limit in
+/// the launch parameters, always starting thread_limit+warpSize threads per
+/// CTA. The threads in the last warp are reserved for master execution.
+/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
+static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
+                                   bool IsInSpmdExecutionMode = false) {
   CGBuilderTy &Bld = CGF.Builder;
-  return Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
-                       "thread_limit");
+  return IsInSpmdExecutionMode
+             ? getNVPTXNumThreads(CGF)
+             : Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
+                             "thread_limit");
 }
 
 /// Get the thread id of the OMP master thread.
@@ -159,12 +188,33 @@ void CGOpenMPRuntimeNVPTX::WorkerFunctio
   CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI);
 }
 
+bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode() const {
+  return CurrentExecutionMode == CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
+}
+
+static CGOpenMPRuntimeNVPTX::ExecutionMode
+getExecutionModeForDirective(CodeGenModule &CGM,
+                             const OMPExecutableDirective &D) {
+  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
+  switch (DirectiveKind) {
+  case OMPD_target:
+    return CGOpenMPRuntimeNVPTX::ExecutionMode::Generic;
+  case OMPD_target_parallel:
+    return CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
+  default:
+    llvm_unreachable("Unsupported directive on NVPTX device.");
+  }
+  llvm_unreachable("Unsupported directive on NVPTX device.");
+}
+
 void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
                                              StringRef ParentName,
                                              llvm::Function *&OutlinedFn,
                                              llvm::Constant *&OutlinedFnID,
                                              bool IsOffloadEntry,
                                              const RegionCodeGenTy &CodeGen) {
+  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
+                             CGOpenMPRuntimeNVPTX::ExecutionMode::Generic);
   EntryFunctionState EST;
   WorkerFunctionState WST(CGM);
   Work.clear();
@@ -252,6 +302,94 @@ void CGOpenMPRuntimeNVPTX::emitGenericEn
   EST.ExitBB = nullptr;
 }
 
+void CGOpenMPRuntimeNVPTX::emitSpmdKernel(const OMPExecutableDirective &D,
+                                          StringRef ParentName,
+                                          llvm::Function *&OutlinedFn,
+                                          llvm::Constant *&OutlinedFnID,
+                                          bool IsOffloadEntry,
+                                          const RegionCodeGenTy &CodeGen) {
+  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
+                             CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd);
+  EntryFunctionState EST;
+
+  // Emit target region as a standalone region.
+  class NVPTXPrePostActionTy : public PrePostActionTy {
+    CGOpenMPRuntimeNVPTX &RT;
+    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
+    const OMPExecutableDirective &D;
+
+  public:
+    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
+                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
+                         const OMPExecutableDirective &D)
+        : RT(RT), EST(EST), D(D) {}
+    void Enter(CodeGenFunction &CGF) override {
+      RT.emitSpmdEntryHeader(CGF, EST, D);
+    }
+    void Exit(CodeGenFunction &CGF) override {
+      RT.emitSpmdEntryFooter(CGF, EST);
+    }
+  } Action(*this, EST, D);
+  CodeGen.setAction(Action);
+  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
+                                   IsOffloadEntry, CodeGen);
+  return;
+}
+
+void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader(
+    CodeGenFunction &CGF, EntryFunctionState &EST,
+    const OMPExecutableDirective &D) {
+  auto &Bld = CGF.Builder;
+
+  // Setup BBs in entry function.
+  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
+  EST.ExitBB = CGF.createBasicBlock(".exit");
+
+  // Initialize the OMP state in the runtime; called by all active threads.
+  // TODO: Set RequiresOMPRuntime and RequiresDataSharing parameters
+  // based on code analysis of the target region.
+  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSpmdExecutionMode=*/true),
+                         /*RequiresOMPRuntime=*/Bld.getInt16(1),
+                         /*RequiresDataSharing=*/Bld.getInt16(1)};
+  CGF.EmitRuntimeCall(
+      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
+  CGF.EmitBranch(ExecuteBB);
+
+  CGF.EmitBlock(ExecuteBB);
+}
+
+void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF,
+                                               EntryFunctionState &EST) {
+  if (!EST.ExitBB)
+    EST.ExitBB = CGF.createBasicBlock(".exit");
+
+  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
+  CGF.EmitBranch(OMPDeInitBB);
+
+  CGF.EmitBlock(OMPDeInitBB);
+  // DeInitialize the OMP state in the runtime; called by all active threads.
+  CGF.EmitRuntimeCall(
+      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None);
+  CGF.EmitBranch(EST.ExitBB);
+
+  CGF.EmitBlock(EST.ExitBB);
+  EST.ExitBB = nullptr;
+}
+
+// Create a unique global variable to indicate the execution mode of this target
+// region. The execution mode is either 'generic', or 'spmd' depending on the
+// target directive. This variable is picked up by the offload library to setup
+// the device appropriately before kernel launch. If the execution mode is
+// 'generic', the runtime reserves one warp for the master, otherwise, all
+// warps participate in parallel work.
+static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
+                                     CGOpenMPRuntimeNVPTX::ExecutionMode Mode) {
+  (void)new llvm::GlobalVariable(
+      CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
+      llvm::GlobalValue::WeakAnyLinkage,
+      llvm::ConstantInt::get(CGM.Int8Ty, Mode), Name + Twine("_exec_mode"));
+}
+
 void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
   auto &Ctx = CGM.getContext();
 
@@ -385,6 +523,22 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntime
     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
     break;
   }
+  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
+    // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
+    // short RequiresOMPRuntime, short RequiresDataSharing);
+    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
+    break;
+  }
+  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
+    // Build void __kmpc_spmd_kernel_deinit();
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit");
+    break;
+  }
   case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
     /// Build void __kmpc_kernel_prepare_parallel(
     /// void *outlined_function);
@@ -463,12 +617,27 @@ void CGOpenMPRuntimeNVPTX::emitTargetOut
 
   assert(!ParentName.empty() && "Invalid target region parent name!");
 
-  emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
-                    CodeGen);
+  CGOpenMPRuntimeNVPTX::ExecutionMode Mode =
+      getExecutionModeForDirective(CGM, D);
+  switch (Mode) {
+  case CGOpenMPRuntimeNVPTX::ExecutionMode::Generic:
+    emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                      CodeGen);
+    break;
+  case CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd:
+    emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+                   CodeGen);
+    break;
+  case CGOpenMPRuntimeNVPTX::ExecutionMode::Unknown:
+    llvm_unreachable(
+        "Unknown programming model for OpenMP directive on NVPTX target.");
+  }
+
+  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
 }
 
 CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
-    : CGOpenMPRuntime(CGM) {
+    : CGOpenMPRuntime(CGM), CurrentExecutionMode(ExecutionMode::Unknown) {
   if (!CGM.getLangOpts().OpenMPIsDevice)
     llvm_unreachable("OpenMP NVPTX can only handle device code.");
 }
@@ -523,7 +692,10 @@ void CGOpenMPRuntimeNVPTX::emitParallelC
   if (!CGF.HaveInsertPoint())
     return;
 
-  emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
+  if (isInSpmdExecutionMode())
+    emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
+  else
+    emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
 }
 
 void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
@@ -593,3 +765,20 @@ void CGOpenMPRuntimeNVPTX::emitGenericPa
     ThenRCG(CGF);
   }
 }
+
+void CGOpenMPRuntimeNVPTX::emitSpmdParallelCall(
+    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
+    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
+  // Just call the outlined function to execute the parallel region.
+  // OutlinedFn(&GTid, &zero, CapturedStruct);
+  //
+  // TODO: Do something with IfCond when support for the 'if' clause
+  // is added on Spmd target directives.
+  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
+  OutlinedFnArgs.push_back(
+      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
+  OutlinedFnArgs.push_back(
+      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
+  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
+  CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);
+}

Modified: cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h?rev=292428&r1=292427&r2=292428&view=diff
==============================================================================
--- cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h (original)
+++ cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h Wed Jan 18 13:35:00 2017
@@ -43,6 +43,8 @@ private:
     void createWorkerFunction(CodeGenModule &CGM);
   };
 
+  bool isInSpmdExecutionMode() const;
+
   /// \brief Emit the worker function for the current target region.
   void emitWorkerFunction(WorkerFunctionState &WST);
 
@@ -58,6 +60,13 @@ private:
   /// function.
   void emitGenericEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
 
+  /// \brief Helper for Spmd mode target directive's entry function.
+  void emitSpmdEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
+                           const OMPExecutableDirective &D);
+
+  /// \brief Signal termination of Spmd mode execution.
+  void emitSpmdEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
+
   /// \brief Returns specified OpenMP runtime function for the current OpenMP
   /// implementation.  Specialized for the NVPTX device.
   /// \param Function OpenMP runtime function.
@@ -87,6 +96,22 @@ private:
                          llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
                          const RegionCodeGenTy &CodeGen);
 
+  /// \brief Emit outlined function specialized for the Single Program
+  /// Multiple Data programming model for applicable target directives on the
+  /// NVPTX device.
+  /// \param D Directive to emit.
+  /// \param ParentName Name of the function that encloses the target region.
+  /// \param OutlinedFn Outlined function value to be defined by this call.
+  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+  /// \param IsOffloadEntry True if the outlined function is an offload entry.
+  /// \param CodeGen Object containing the target statements.
+  /// An outlined function may not be an entry if, e.g. the if clause always
+  /// evaluates to false.
+  void emitSpmdKernel(const OMPExecutableDirective &D, StringRef ParentName,
+                      llvm::Function *&OutlinedFn,
+                      llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                      const RegionCodeGenTy &CodeGen);
+
   /// \brief Emit outlined function for 'target' directive on the NVPTX
   /// device.
   /// \param D Directive to emit.
@@ -118,6 +143,22 @@ private:
                                ArrayRef<llvm::Value *> CapturedVars,
                                const Expr *IfCond);
 
+  /// \brief Emits code for parallel or serial call of the \a OutlinedFn with
+  /// variables captured in a record which address is stored in \a
+  /// CapturedStruct.
+  /// This call is for a parallel directive within an SPMD target directive.
+  /// \param OutlinedFn Outlined function to be run in parallel threads. Type of
+  /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
+  /// \param CapturedVars A pointer to the record with the references to
+  /// variables used in \a OutlinedFn function.
+  /// \param IfCond Condition in the associated 'if' clause, if it was
+  /// specified, nullptr otherwise.
+  ///
+  void emitSpmdParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
+                            llvm::Value *OutlinedFn,
+                            ArrayRef<llvm::Value *> CapturedVars,
+                            const Expr *IfCond);
+
 protected:
   /// \brief Get the function name of an outlined region.
   //  The name can be customized depending on the target.
@@ -192,6 +233,25 @@ public:
                         llvm::Value *OutlinedFn,
                         ArrayRef<llvm::Value *> CapturedVars,
                         const Expr *IfCond) override;
+
+public:
+  /// Target codegen is specialized based on two programming models: the
+  /// 'generic' fork-join model of OpenMP, and a more GPU efficient 'spmd'
+  /// model for constructs like 'target parallel' that support it.
+  enum ExecutionMode {
+    /// Single Program Multiple Data.
+    Spmd,
+    /// Generic codegen to support fork-join model.
+    Generic,
+    Unknown,
+  };
+
+private:
+  // Track the execution mode when codegening directives within a target
+  // region. The appropriate mode (generic/spmd) is set on entry to the
+  // target region and used by containing directives such as 'parallel'
+  // to emit optimized code.
+  ExecutionMode CurrentExecutionMode;
 };
 
 } // CodeGen namespace.

Modified: cfe/trunk/test/OpenMP/nvptx_target_codegen.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/OpenMP/nvptx_target_codegen.cpp?rev=292428&r1=292427&r2=292428&view=diff
==============================================================================
--- cfe/trunk/test/OpenMP/nvptx_target_codegen.cpp (original)
+++ cfe/trunk/test/OpenMP/nvptx_target_codegen.cpp Wed Jan 18 13:35:00 2017
@@ -8,6 +8,14 @@
 #ifndef HEADER
 #define HEADER
 
+// Check that the execution mode of all 6 target regions is set to Generic Mode.
+// CHECK-DAG: {{@__omp_offloading_.+l98}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l175}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l284}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l321}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l339}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l304}}_exec_mode = weak constant i8 1
+
 template<typename tx, typename ty>
 struct TT{
   tx X;
@@ -23,7 +31,7 @@ int foo(int n) {
   double cn[5][n];
   TT<long long, char> d;
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l90}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l98}}_worker()
   // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
   // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
   // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -54,7 +62,7 @@ int foo(int n) {
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+foo.+l90]]()
+  // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+foo.+l98]]()
   // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
   // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
@@ -96,7 +104,7 @@ int foo(int n) {
   {
   }
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l167}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l175}}_worker()
   // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
   // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
   // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -127,7 +135,7 @@ int foo(int n) {
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+foo.+l167]](i[[SZ:32|64]] [[ARG1:%[a-zA-Z_]+]])
+  // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+foo.+l175]](i[[SZ:32|64]] [[ARG1:%[a-zA-Z_]+]])
   // CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]],
   // CHECK: store i[[SZ]] [[ARG1]], i[[SZ]]* [[AA_ADDR]],
   // CHECK: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16*
@@ -169,7 +177,7 @@ int foo(int n) {
     aa += 1;
   }
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l276}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l284}}_worker()
   // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
   // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
   // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -200,7 +208,7 @@ int foo(int n) {
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+foo.+l276]](i[[SZ]]
+  // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+foo.+l284]](i[[SZ]]
   // Create local storage for each capture.
   // CHECK:    [[LOCAL_A:%.+]] = alloca i[[SZ]]
   // CHECK:    [[LOCAL_B:%.+]] = alloca [10 x float]*
@@ -353,7 +361,7 @@ int bar(int n){
   return a;
 }
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+static.+313}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+static.+321}}_worker()
   // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
   // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
   // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -384,7 +392,7 @@ int bar(int n){
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T4:@__omp_offloading_.+static.+l313]](i[[SZ]]
+  // CHECK: define {{.*}}void [[T4:@__omp_offloading_.+static.+l321]](i[[SZ]]
   // Create local storage for each capture.
   // CHECK:  [[LOCAL_A:%.+]] = alloca i[[SZ]]
   // CHECK:  [[LOCAL_AA:%.+]] = alloca i[[SZ]]
@@ -439,7 +447,7 @@ int bar(int n){
 
 
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+S1.+l331}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+S1.+l339}}_worker()
   // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
   // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
   // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -470,7 +478,7 @@ int bar(int n){
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T5:@__omp_offloading_.+S1.+l331]](
+  // CHECK: define {{.*}}void [[T5:@__omp_offloading_.+S1.+l339]](
   // Create local storage for each capture.
   // CHECK:       [[LOCAL_THIS:%.+]] = alloca [[S1:%struct.*]]*
   // CHECK:       [[LOCAL_B:%.+]] = alloca i[[SZ]]
@@ -529,7 +537,7 @@ int bar(int n){
 
 
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l296}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l304}}_worker()
   // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
   // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
   // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -560,7 +568,7 @@ int bar(int n){
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l296]](i[[SZ]]
+  // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l304]](i[[SZ]]
   // Create local storage for each capture.
   // CHECK:  [[LOCAL_A:%.+]] = alloca i[[SZ]]
   // CHECK:  [[LOCAL_AA:%.+]] = alloca i[[SZ]]

Added: cfe/trunk/test/OpenMP/nvptx_target_parallel_codegen.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/OpenMP/nvptx_target_parallel_codegen.cpp?rev=292428&view=auto
==============================================================================
--- cfe/trunk/test/OpenMP/nvptx_target_parallel_codegen.cpp (added)
+++ cfe/trunk/test/OpenMP/nvptx_target_parallel_codegen.cpp Wed Jan 18 13:35:00 2017
@@ -0,0 +1,136 @@
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// Check that the execution mode of all 2 target regions on the gpu is set to SPMD Mode.
+// CHECK-DAG: {{@__omp_offloading_.+l26}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l31}}_exec_mode = weak constant i8 0
+
+template<typename tx>
+tx ftemplate(int n) {
+  tx a = 0;
+  short aa = 0;
+  tx b[10];
+
+  #pragma omp target parallel if(target: 0)
+  {
+    a += 1;
+  }
+
+  #pragma omp target parallel map(tofrom: aa)
+  {
+    aa += 1;
+  }
+
+  #pragma omp target parallel map(tofrom:a, aa, b) if(target: n>40)
+  {
+    a += 1;
+    aa += 1;
+    b[2] += 1;
+  }
+
+  return a;
+}
+
+int bar(int n){
+  int a = 0;
+
+  a += ftemplate<int>(n);
+
+  return a;
+}
+
+  // CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l17}}
+
+
+
+
+
+
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}(
+  // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
+  // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align
+  // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
+  // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
+  // CHECK: br label {{%?}}[[EXEC:.+]]
+  //
+  // CHECK: [[EXEC]]
+  // CHECK: {{call|invoke}} void [[OP1:@.+]](i32* null, i32* null, i16* [[AA]])
+  // CHECK: br label {{%?}}[[DONE:.+]]
+  //
+  // CHECK: [[DONE]]
+  // CHECK: call void @__kmpc_spmd_kernel_deinit()
+  // CHECK: br label {{%?}}[[EXIT:.+]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+  // CHECK: }
+
+  // CHECK: define internal void [[OP1]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i16* {{[^%]*}}[[ARG:%.+]])
+  // CHECK: = alloca i32*, align
+  // CHECK: = alloca i32*, align
+  // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
+  // CHECK: store i16* [[ARG]], i16** [[AA_ADDR]], align
+  // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
+  // CHECK: [[VAL:%.+]] = load i16, i16* [[AA]], align
+  // CHECK: store i16 {{%.+}}, i16* [[AA]], align
+  // CHECK: ret void
+  // CHECK: }
+
+
+
+
+
+
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l31}}(
+  // CHECK: [[A_ADDR:%.+]] = alloca i32*, align
+  // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
+  // CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align
+  // CHECK: store i32* {{%.+}}, i32** [[A_ADDR]], align
+  // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align
+  // CHECK: store [10 x i32]* {{%.+}}, [10 x i32]** [[B_ADDR]], align
+  // CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align
+  // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
+  // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align
+  // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
+  // CHECK: br label {{%?}}[[EXEC:.+]]
+  //
+  // CHECK: [[EXEC]]
+  // CHECK: {{call|invoke}} void [[OP2:@.+]](i32* null, i32* null, i32* [[A]], i16* [[AA]], [10 x i32]* [[B]])
+  // CHECK: br label {{%?}}[[DONE:.+]]
+  //
+  // CHECK: [[DONE]]
+  // CHECK: call void @__kmpc_spmd_kernel_deinit()
+  // CHECK: br label {{%?}}[[EXIT:.+]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+  // CHECK: }
+
+  // CHECK: define internal void [[OP2]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* {{[^%]*}}[[ARG1:%.+]], i16* {{[^%]*}}[[ARG2:%.+]], [10 x i32]* {{[^%]*}}[[ARG3:%.+]])
+  // CHECK: = alloca i32*, align
+  // CHECK: = alloca i32*, align
+  // CHECK: [[A_ADDR:%.+]] = alloca i32*, align
+  // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
+  // CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align
+  // CHECK: store i32* [[ARG1]], i32** [[A_ADDR]], align
+  // CHECK: store i16* [[ARG2]], i16** [[AA_ADDR]], align
+  // CHECK: store [10 x i32]* [[ARG3]], [10 x i32]** [[B_ADDR]], align
+  // CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align
+  // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
+  // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align
+  // CHECK: store i32 {{%.+}}, i32* [[A]], align
+  // CHECK: store i16 {{%.+}}, i16* [[AA]], align
+  // CHECK: [[ELT:%.+]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]],
+  // CHECK: store i32 {{%.+}}, i32* [[ELT]], align
+  // CHECK: ret void
+  // CHECK: }
+#endif