[Openmp-commits] [openmp] 7cea0ce - [libomptarget] Revert all improvements to support

Jon Chesterfield via Openmp-commits openmp-commits at lists.llvm.org
Wed Nov 6 07:44:27 PST 2019


Author: Jon Chesterfield
Date: 2019-11-06T15:44:10Z
New Revision: 7cea0cea77d15b70d54083ec7c17ae5d49b3f53f

URL: https://github.com/llvm/llvm-project/commit/7cea0cea77d15b70d54083ec7c17ae5d49b3f53f
DIFF: https://github.com/llvm/llvm-project/commit/7cea0cea77d15b70d54083ec7c17ae5d49b3f53f.diff

LOG: [libomptarget] Revert all improvements to support

Summary:
[libomptarget] Revert all improvements to support

The change to unity build for nvcc has broken the build for some developers.
This patch reverts to a known-working state.

There has been some confusion over exactly how the build broke. I think we
have reached a common understanding that the disappearing symbols are from
the bitcode library built by clang. The static archive built by nvcc may show the
same problem. Some of the confusion arose from building the deviceRTL twice
and using one or the other library based on various environmental factors.

I'm pretty sure the problem is clang expanding `__forceinline__` into both `__inline__`
and `attribute(("always_inline"))`. The `__inline__` attribute resolves to linkonce_odr
which is not safe for exporting symbols from translation units.

"always_inline" is the desired semantic for small functions defined in one translation
unit that are intended to be inlined at link time. "inline" is not.

This therefore reintroduces the dependency hazard of supporti.h and some code
duplication, and blocks progress separating deviceRTL into reusable components.

See also D69857, D69859 for attempts at a fix instead of a revert.

Reviewers: ABataev, jdoerfert, grokos, ikitayama, tianshilei1992

Reviewed By: ABataev

Subscribers: mgorny, jfb, openmp-commits

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D69885

Added: 
    openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h

Modified: 
    openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
    openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
    openmp/libomptarget/deviceRTLs/nvptx/src/debug.h
    openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
    openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
    openmp/libomptarget/deviceRTLs/nvptx/src/support.h
    openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h

Removed: 
    openmp/libomptarget/deviceRTLs/nvptx/src/support.cu
    openmp/libomptarget/deviceRTLs/nvptx/unity.cu


################################################################################
diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
index 6e7f76f5c5b8..1cd13c50395f 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -55,7 +55,6 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
       src/omptarget-nvptx.cu
       src/parallel.cu
       src/reduction.cu
-      src/support.cu
       src/sync.cu
       src/task.cu
   )
@@ -89,7 +88,7 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
   set(BUILD_SHARED_LIBS OFF)
   set(CUDA_SEPARABLE_COMPILATION ON)
   list(APPEND CUDA_NVCC_FLAGS -I${devicertl_base_directory})
-  cuda_add_library(omptarget-nvptx STATIC unity.cu
+  cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} ${omp_data_objects}
       OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG})
 
   # Install device RTL under the lib destination folder.

diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
index f2892acb790a..78b04ec5cffe 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -13,6 +13,11 @@
 #include "target_impl.h"
 #include <stdio.h>
 
+// Warp ID in the CUDA block
+INLINE static unsigned getWarpId() { return threadIdx.x / WARPSIZE; }
+// Lane ID in the CUDA warp.
+INLINE static unsigned getLaneId() { return threadIdx.x % WARPSIZE; }
+
 // Return true if this is the first active thread in the warp.
 INLINE static bool IsWarpMasterActiveThread() {
   unsigned long long Mask = __kmpc_impl_activemask();
@@ -62,7 +67,7 @@ __kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS,
   DSPRINT0(DSFLAG_INIT,
            "Entering __kmpc_initialize_data_sharing_environment\n");
 
-  unsigned WID = GetWarpId();
+  unsigned WID = getWarpId();
   DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID);
 
   omptarget_nvptx_TeamDescr *teamDescr =
@@ -106,7 +111,7 @@ EXTERN void *__kmpc_data_sharing_environment_begin(
   DSPRINT(DSFLAG, "Default Data Size %016llx\n",
           (unsigned long long)SharingDefaultDataSize);
 
-  unsigned WID = GetWarpId();
+  unsigned WID = getWarpId();
   __kmpc_impl_lanemask_t CurActiveThreads = __kmpc_impl_activemask();
 
   __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
@@ -226,7 +231,7 @@ EXTERN void __kmpc_data_sharing_environment_end(
 
   DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n");
 
-  unsigned WID = GetWarpId();
+  unsigned WID = getWarpId();
 
   if (IsEntryPoint) {
     if (IsWarpMasterActiveThread()) {
@@ -354,7 +359,7 @@ EXTERN void __kmpc_data_sharing_init_stack_spmd() {
   // This function initializes the stack pointer with the pointer to the
   // statically allocated shared memory slots. The size of a shared memory
   // slot is pre-determined to be 256 bytes.
-  if (GetThreadIdInBlock() == 0)
+  if (threadIdx.x == 0)
     data_sharing_init_stack_common();
 
   __threadfence_block();
@@ -372,7 +377,7 @@ INLINE static void* data_sharing_push_stack_common(size_t PushSize) {
   PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;
 
   // Frame pointer must be visible to all workers in the same warp.
-  const unsigned WID = GetWarpId();
+  const unsigned WID = getWarpId();
   void *FrameP = 0;
   __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
 
@@ -462,7 +467,7 @@ EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize,
   // Compute the start address of the frame of each thread in the warp.
   uintptr_t FrameStartAddress =
       (uintptr_t) data_sharing_push_stack_common(PushSize);
-  FrameStartAddress += (uintptr_t) (GetLaneId() * DataSize);
+  FrameStartAddress += (uintptr_t) (getLaneId() * DataSize);
   return (void *)FrameStartAddress;
 }
 
@@ -477,7 +482,7 @@ EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
   __threadfence_block();
 
   if (GetThreadIdInBlock() % WARPSIZE == 0) {
-    unsigned WID = GetWarpId();
+    unsigned WID = getWarpId();
 
     // Current slot
     __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];

diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h b/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h
index 3388b04616f4..1052392155a7 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h
@@ -128,12 +128,12 @@
 
 #if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING
 #include <stdio.h>
-#include "support.h"
+#include "target_impl.h"
 
 template <typename... Arguments>
 NOINLINE static void log(const char *fmt, Arguments... parameters) {
-  printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(),
-         (int)GetWarpId(), (int)GetLaneId(), parameters...);
+  printf(fmt, (int)blockIdx.x, (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE),
+         (int)(threadIdx.x & 0x1F), parameters...);
 }
 
 #endif
@@ -144,8 +144,9 @@ template <typename... Arguments>
 NOINLINE static void check(bool cond, const char *fmt,
                            Arguments... parameters) {
   if (!cond)
-    printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(),
-           (int)GetWarpId(), (int)GetLaneId(), parameters...);
+    printf(fmt, (int)blockIdx.x, (int)threadIdx.x,
+           (int)(threadIdx.x / WARPSIZE), (int)(threadIdx.x & 0x1F),
+           parameters...);
   assert(cond);
 }
 

diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
index e86b1d1fdbf3..20a22f425324 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
@@ -364,7 +364,7 @@ EXTERN void omp_set_lock(omp_lock_t *lock) {
     for (;;) {
       now = clock();
       clock_t cycles = now > start ? now - start : now + (0xffffffff - start);
-      if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) {
+      if (cycles >= __OMP_SPIN * blockIdx.x) {
         break;
       }
     }

diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index 6c98f0104efc..986150402f16 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -385,5 +385,6 @@ INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId);
 ////////////////////////////////////////////////////////////////////////////////
 
 #include "omptarget-nvptxi.h"
+#include "supporti.h"
 
 #endif

diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/support.h b/openmp/libomptarget/deviceRTLs/nvptx/src/support.h
index 1adabaf67ea7..de685b894509 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/support.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/support.h
@@ -10,12 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef OMPTARGET_SUPPORT_H
-#define OMPTARGET_SUPPORT_H
-
-#include "interface.h"
 #include "target_impl.h"
-
 ////////////////////////////////////////////////////////////////////////////////
 // Execution Parameters
 ////////////////////////////////////////////////////////////////////////////////
@@ -31,70 +26,58 @@ enum RuntimeMode {
   RuntimeMask = 0x02u,
 };
 
-DEVICE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode);
-DEVICE bool isGenericMode();
-DEVICE bool isSPMDMode();
-DEVICE bool isRuntimeUninitialized();
-DEVICE bool isRuntimeInitialized();
-
-////////////////////////////////////////////////////////////////////////////////
-// Execution Modes based on location parameter fields
-////////////////////////////////////////////////////////////////////////////////
-
-DEVICE bool checkSPMDMode(kmp_Ident *loc);
-
-DEVICE bool checkGenericMode(kmp_Ident *loc);
-
-DEVICE bool checkRuntimeUninitialized(kmp_Ident *loc);
-
-DEVICE bool checkRuntimeInitialized(kmp_Ident *loc);
+INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode);
+INLINE bool isGenericMode();
+INLINE bool isSPMDMode();
+INLINE bool isRuntimeUninitialized();
+INLINE bool isRuntimeInitialized();
 
 ////////////////////////////////////////////////////////////////////////////////
 // get info from machine
 ////////////////////////////////////////////////////////////////////////////////
 
 // get low level ids of resources
-DEVICE int GetThreadIdInBlock();
-DEVICE int GetBlockIdInKernel();
-DEVICE int GetNumberOfBlocksInKernel();
-DEVICE int GetNumberOfThreadsInBlock();
-DEVICE unsigned GetWarpId();
-DEVICE unsigned GetLaneId();
+INLINE int GetThreadIdInBlock();
+INLINE int GetBlockIdInKernel();
+INLINE int GetNumberOfBlocksInKernel();
+INLINE int GetNumberOfThreadsInBlock();
+INLINE unsigned GetWarpId();
+INLINE unsigned GetLaneId();
 
 // get global ids to locate tread/team info (constant regardless of OMP)
-DEVICE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode);
-DEVICE int GetMasterThreadID();
-DEVICE int GetNumberOfWorkersInTeam();
+INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode);
+INLINE int GetMasterThreadID();
+INLINE int GetNumberOfWorkersInTeam();
 
 // get OpenMP thread and team ids
-DEVICE int GetOmpThreadId(int threadId,
+INLINE int GetOmpThreadId(int threadId,
                           bool isSPMDExecutionMode);    // omp_thread_num
-DEVICE int GetOmpTeamId();                              // omp_team_num
+INLINE int GetOmpTeamId();                              // omp_team_num
 
 // get OpenMP number of threads and team
-DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads
-DEVICE int GetNumberOfOmpTeams();                           // omp_num_teams
+INLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads
+INLINE int GetNumberOfOmpTeams();                           // omp_num_teams
 
 // get OpenMP number of procs
-DEVICE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode);
-DEVICE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);
+INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode);
+INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);
 
 // masters
-DEVICE int IsTeamMaster(int ompThreadId);
+INLINE int IsTeamMaster(int ompThreadId);
 
 // Parallel level
-DEVICE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
-DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
+INLINE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
+INLINE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
 
 ////////////////////////////////////////////////////////////////////////////////
 // Memory
 ////////////////////////////////////////////////////////////////////////////////
 
 // safe alloc and free
-DEVICE void *SafeMalloc(size_t size, const char *msg); // check if success
-DEVICE void *SafeFree(void *ptr, const char *msg);
+INLINE void *SafeMalloc(size_t size, const char *msg); // check if success
+INLINE void *SafeFree(void *ptr, const char *msg);
 // pad to a alignment (power of 2 only)
-DEVICE unsigned long PadBytes(unsigned long size, unsigned long alignment);
+INLINE unsigned long PadBytes(unsigned long size, unsigned long alignment);
 #define ADD_BYTES(_addr, _bytes)                                               \
   ((void *)((char *)((void *)(_addr)) + (_bytes)))
 #define SUB_BYTES(_addr, _bytes)                                               \
@@ -103,8 +86,6 @@ DEVICE unsigned long PadBytes(unsigned long size, unsigned long alignment);
 ////////////////////////////////////////////////////////////////////////////////
 // Teams Reduction Scratchpad Helpers
 ////////////////////////////////////////////////////////////////////////////////
-DEVICE unsigned int *GetTeamsReductionTimestamp();
-DEVICE char *GetTeamsReductionScratchpad();
-DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr);
-
-#endif
+INLINE unsigned int *GetTeamsReductionTimestamp();
+INLINE char *GetTeamsReductionScratchpad();
+INLINE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr);

diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
similarity index 96%
rename from openmp/libomptarget/deviceRTLs/nvptx/src/support.cu
rename to openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
index 27675975041a..6fa857899905 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -1,4 +1,4 @@
-//===--------- support.cu - NVPTX OpenMP support functions ------- CUDA -*-===//
+//===--------- supporti.h - NVPTX OpenMP support functions ------- CUDA -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -10,14 +10,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "support.h"
-#include "debug.h"
-#include "omptarget-nvptx.h"
-
 ////////////////////////////////////////////////////////////////////////////////
 // Execution Parameters
 ////////////////////////////////////////////////////////////////////////////////
 
+#include "target_impl.h"
+
 INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) {
   execution_param = EMode;
   execution_param |= RMode;
@@ -106,9 +104,9 @@ INLINE int GetNumberOfBlocksInKernel() { return gridDim.x; }
 
 INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; }
 
-INLINE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
+INLINE unsigned GetWarpId() { return threadIdx.x / WARPSIZE; }
 
-INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
+INLINE unsigned GetLaneId() { return threadIdx.x & (WARPSIZE - 1); }
 
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -124,9 +122,7 @@ INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
 //      If NumThreads is 1024, master id is 992.
 //
 // Called in Generic Execution Mode only.
-INLINE int GetMasterThreadID() {
-  return (GetNumberOfThreadsInBlock() - 1) & ~(WARPSIZE - 1);
-}
+INLINE int GetMasterThreadID() { return (blockDim.x - 1) & ~(WARPSIZE - 1); }
 
 // The last warp is reserved for the master; other warps are workers.
 // Called in Generic Execution Mode only.

diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
index e0bacab3c643..95fe2ad3d3d5 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -15,9 +15,8 @@
 #include <cuda.h>
 #include "nvptx_interface.h"
 
-#define DEVICE __device__
-#define INLINE __forceinline__ DEVICE
-#define NOINLINE __noinline__ DEVICE
+#define INLINE __forceinline__ __device__
+#define NOINLINE __noinline__ __device__
 
 ////////////////////////////////////////////////////////////////////////////////
 // Kernel options

diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/unity.cu b/openmp/libomptarget/deviceRTLs/nvptx/unity.cu
deleted file mode 100644
index f5f92f3919e7..000000000000
--- a/openmp/libomptarget/deviceRTLs/nvptx/unity.cu
+++ /dev/null
@@ -1,26 +0,0 @@
-//===------ unity.cu - Unity build of NVPTX deviceRTL ------------ CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Support compilers, specifically NVCC, which have not implemented link time
-// optimisation. This removes the runtime cost of moving inline functions into
-// source files in exchange for preventing efficient incremental builds.
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/cancel.cu"
-#include "src/critical.cu"
-#include "src/data_sharing.cu"
-#include "src/libcall.cu"
-#include "src/loop.cu"
-#include "src/omp_data.cu"
-#include "src/omptarget-nvptx.cu"
-#include "src/parallel.cu"
-#include "src/reduction.cu"
-#include "src/support.cu"
-#include "src/sync.cu"
-#include "src/task.cu"


        


More information about the Openmp-commits mailing list