[Openmp-commits] [openmp] 47e95e8 - [libomptarget] Build cuda plugin without cuda installed locally
Jon Chesterfield via Openmp-commits
openmp-commits at lists.llvm.org
Fri Jan 22 16:15:25 PST 2021
Author: Jon Chesterfield
Date: 2021-01-23T00:15:04Z
New Revision: 47e95e87a3e4f738635ff965616d4e2d96bf838a
URL: https://github.com/llvm/llvm-project/commit/47e95e87a3e4f738635ff965616d4e2d96bf838a
DIFF: https://github.com/llvm/llvm-project/commit/47e95e87a3e4f738635ff965616d4e2d96bf838a.diff
LOG: [libomptarget] Build cuda plugin without cuda installed locally
[libomptarget] Build cuda plugin without cuda installed locally
Compiles a new file, `plugins/cuda/dynamic_cuda/cuda.cpp`, to an object file that exposes the same symbols that the plugin presently uses from libcuda. The object file contains dlopen of libcuda and cached dlsym calls. Also provides a cuda.h containing the subset that is used.
This lets the cmake file choose between the system cuda and a dlopen shim, with no changes to rtl.cpp.
The corresponding change to amdgpu is postponed until after a refactor of the plugin to reduce the size of the hsa.h stub required.
Reviewed By: jdoerfert
Differential Revision: https://reviews.llvm.org/D95155
Added:
openmp/libomptarget/include/dlwrap.h
openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
Modified:
openmp/libomptarget/plugins/cuda/CMakeLists.txt
Removed:
################################################################################
diff --git a/openmp/libomptarget/include/dlwrap.h b/openmp/libomptarget/include/dlwrap.h
new file mode 100644
index 000000000000..9e078b34ca57
--- /dev/null
+++ b/openmp/libomptarget/include/dlwrap.h
@@ -0,0 +1,277 @@
+//===------- dlwrap.h - Convenience wrapper around dlopen/dlsym -- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The openmp plugins depend on extern libraries. These can be used via:
+// - bitcode file statically linked
+// - (relocatable) object file statically linked
+// - static library
+// - dynamic library, linked at build time
+// - dynamic library, loaded at application run time by dlopen
+//
+// This file factors out most boilerplate for using a dlopened library.
+// - Function symbols are generated that are statically linked against
+// - The dlopen can be done implicitly when initializing the library
+// - dlsym lookups are done once and cached
+// - The abstraction is very thin to permit varied uses of the library
+//
+// Given int foo(char, double, void*);, writing DLWRAP(foo, 3) will expand to:
+// int foo(char x0, double x1, void* x2) {
+// constexpr size_t index = id();
+// void * dlsymResult = *pointer(index);
+// return ((int (*)(char, double, void*))dlsymResult)(x0, x1, x2);
+// }
+//
+// Multiple calls to DLWRAP(symbol_name, arity), combined with bespoke
+// initialization code that uses the thin abstraction:
+// namespace dlwrap {
+// static size_t size();
+// static const char *symbol(size_t);
+// static void **pointer(size_t);
+// }
+// will compile to an object file that exposes only the symbols that the
+// dynamic library would, with the right function types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef DLWRAP_H_INCLUDED
+#define DLWRAP_H_INCLUDED
+
+#include <array>
+#include <cstddef>
+#include <tuple>
+#include <type_traits>
+
+// Where symbol is a function, these expand to some book keeping and an
+// implementation of that function
+#define DLWRAP(SYMBOL, ARITY) DLWRAP_IMPL(SYMBOL, ARITY)
+#define DLWRAP_INTERNAL(SYMBOL, ARITY) DLWRAP_INTERNAL_IMPL(SYMBOL, ARITY)
+
+// For example, given a prototype:
+// int foo(char, double);
+//
+// DLWRAP(foo, 2) expands to:
+//
+// namespace dlwrap {
+// struct foo_Trait : public dlwrap::trait<decltype(&foo)> {
+// using T = dlwrap::trait<decltype(&foo)>;
+// static T::FunctionType get() {
+// constexpr size_t Index = getIndex();
+// void *P = *dlwrap::pointer(Index);
+// return reinterpret_cast<T::FunctionType>(P);
+// }
+// };
+// }
+// int foo(char x0, double x1) { return dlwrap::foo_Trait::get()(x0, x1); }
+//
+// DLWRAP_INTERNAL is similar, except the function it expands to is:
+// static int dlwrap_foo(char x0, double x1) { ... }
+// so that the function pointer call can be wrapped in library-specific code
+
+// DLWRAP_FINALIZE() expands to definitions of:
+#define DLWRAP_FINALIZE() DLWRAP_FINALIZE_IMPL()
+namespace dlwrap {
+static size_t size();
+static const char *symbol(size_t); // get symbol name in [0, size())
+static void **pointer(size_t); // get pointer to function pointer in [0, size())
+} // namespace dlwrap
+
+// Implementation details follow.
+
+namespace dlwrap {
+
+// Return type, parameter types and arity of a function pointer type
+template <typename F> struct trait;
+template <typename R, typename... Ts> struct trait<R (*)(Ts...)> {
+  using ReturnType = R;
+  using FunctionType = R (*)(Ts...);
+  static constexpr size_t nargs = sizeof...(Ts);
+  // arg<i>::type is the type of the i-th parameter, counting from zero
+  template <size_t i> struct arg {
+    using type = typename std::tuple_element<i, std::tuple<Ts...>>::type;
+  };
+};
+
+namespace type {
+// Book keeping is by type specialization
+
+template <size_t S> struct count {
+ static constexpr size_t N = count<S - 1>::N;
+};
+
+template <> struct count<0> { static constexpr size_t N = 0; };
+
+// Get a constexpr size_t ID, starts at zero
+#define DLWRAP_ID() (dlwrap::type::count<__LINE__>::N)
+
+// Increment value returned by DLWRAP_ID
+#define DLWRAP_INC() \
+ template <> struct dlwrap::type::count<__LINE__> { \
+ static constexpr size_t N = 1 + dlwrap::type::count<__LINE__ - 1>::N; \
+ }
+
+template <size_t N> struct symbol;
+#define DLWRAP_SYMBOL(SYMBOL, ID) \
+ template <> struct dlwrap::type::symbol<ID> { \
+ static constexpr const char *call() { return #SYMBOL; } \
+ }
+} // namespace type
+
+template <size_t N, size_t... Is>
+constexpr std::array<const char *, N> static getSymbolArray(
+ std::index_sequence<Is...>) {
+ return {{dlwrap::type::symbol<Is>::call()...}};
+}
+
+} // namespace dlwrap
+
+#define DLWRAP_INSTANTIATE(SYM_USE, SYM_DEF, ARITY) \
+ DLWRAP_INSTANTIATE_##ARITY(SYM_USE, SYM_DEF, \
+ dlwrap::trait<decltype(&SYM_USE)>)
+
+#define DLWRAP_FINALIZE_IMPL() \
+ static size_t dlwrap::size() { return DLWRAP_ID(); } \
+ static const char *dlwrap::symbol(size_t i) { \
+ static constexpr const std::array<const char *, DLWRAP_ID()> \
+ dlwrap_symbols = getSymbolArray<DLWRAP_ID()>( \
+ std::make_index_sequence<DLWRAP_ID()>()); \
+ return dlwrap_symbols[i]; \
+ } \
+ static void **dlwrap::pointer(size_t i) { \
+ static std::array<void *, DLWRAP_ID()> dlwrap_pointers; \
+ return &dlwrap_pointers.data()[i]; \
+ }
+
+#define DLWRAP_COMMON(SYMBOL, ARITY) \
+ DLWRAP_INC(); \
+ DLWRAP_SYMBOL(SYMBOL, DLWRAP_ID() - 1); \
+ namespace dlwrap { \
+ struct SYMBOL##_Trait : public dlwrap::trait<decltype(&SYMBOL)> { \
+ using T = dlwrap::trait<decltype(&SYMBOL)>; \
+ static T::FunctionType get() { \
+ constexpr size_t Index = DLWRAP_ID() - 1; \
+ void *P = *dlwrap::pointer(Index); \
+ return reinterpret_cast<T::FunctionType>(P); \
+ } \
+ }; \
+ }
+
+#define DLWRAP_IMPL(SYMBOL, ARITY) \
+ DLWRAP_COMMON(SYMBOL, ARITY); \
+ DLWRAP_INSTANTIATE(SYMBOL, SYMBOL, ARITY)
+
+#define DLWRAP_INTERNAL_IMPL(SYMBOL, ARITY) \
+ DLWRAP_COMMON(SYMBOL, ARITY); \
+ static DLWRAP_INSTANTIATE(SYMBOL, dlwrap_##SYMBOL, ARITY)
+
+#define DLWRAP_INSTANTIATE_0(SYM_USE, SYM_DEF, T) \
+ T::ReturnType SYM_DEF() { return dlwrap::SYM_USE##_Trait::get()(); }
+#define DLWRAP_INSTANTIATE_1(SYM_USE, SYM_DEF, T) \
+ T::ReturnType SYM_DEF(typename T::template arg<0>::type x0) { \
+ return dlwrap::SYM_USE##_Trait::get()(x0); \
+ }
+#define DLWRAP_INSTANTIATE_2(SYM_USE, SYM_DEF, T) \
+ T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+ typename T::template arg<1>::type x1) { \
+ return dlwrap::SYM_USE##_Trait::get()(x0, x1); \
+ }
+#define DLWRAP_INSTANTIATE_3(SYM_USE, SYM_DEF, T) \
+ T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+ typename T::template arg<1>::type x1, \
+ typename T::template arg<2>::type x2) { \
+ return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2); \
+ }
+#define DLWRAP_INSTANTIATE_4(SYM_USE, SYM_DEF, T) \
+ T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+ typename T::template arg<1>::type x1, \
+ typename T::template arg<2>::type x2, \
+ typename T::template arg<3>::type x3) { \
+ return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3); \
+ }
+#define DLWRAP_INSTANTIATE_5(SYM_USE, SYM_DEF, T) \
+ T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+ typename T::template arg<1>::type x1, \
+ typename T::template arg<2>::type x2, \
+ typename T::template arg<3>::type x3, \
+ typename T::template arg<4>::type x4) { \
+ return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4); \
+ }
+#define DLWRAP_INSTANTIATE_6(SYM_USE, SYM_DEF, T) \
+ T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+ typename T::template arg<1>::type x1, \
+ typename T::template arg<2>::type x2, \
+ typename T::template arg<3>::type x3, \
+ typename T::template arg<4>::type x4, \
+ typename T::template arg<5>::type x5) { \
+ return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5); \
+ }
+
+#define DLWRAP_INSTANTIATE_7(SYM_USE, SYM_DEF, T) \
+ T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+ typename T::template arg<1>::type x1, \
+ typename T::template arg<2>::type x2, \
+ typename T::template arg<3>::type x3, \
+ typename T::template arg<4>::type x4, \
+ typename T::template arg<5>::type x5, \
+ typename T::template arg<6>::type x6) { \
+ return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6); \
+ }
+
+#define DLWRAP_INSTANTIATE_8(SYM_USE, SYM_DEF, T) \
+ T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+ typename T::template arg<1>::type x1, \
+ typename T::template arg<2>::type x2, \
+ typename T::template arg<3>::type x3, \
+ typename T::template arg<4>::type x4, \
+ typename T::template arg<5>::type x5, \
+ typename T::template arg<6>::type x6, \
+ typename T::template arg<7>::type x7) { \
+ return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7); \
+ }
+#define DLWRAP_INSTANTIATE_9(SYM_USE, SYM_DEF, T) \
+ T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+ typename T::template arg<1>::type x1, \
+ typename T::template arg<2>::type x2, \
+ typename T::template arg<3>::type x3, \
+ typename T::template arg<4>::type x4, \
+ typename T::template arg<5>::type x5, \
+ typename T::template arg<6>::type x6, \
+ typename T::template arg<7>::type x7, \
+ typename T::template arg<8>::type x8) { \
+ return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8); \
+ }
+#define DLWRAP_INSTANTIATE_10(SYM_USE, SYM_DEF, T) \
+ T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+ typename T::template arg<1>::type x1, \
+ typename T::template arg<2>::type x2, \
+ typename T::template arg<3>::type x3, \
+ typename T::template arg<4>::type x4, \
+ typename T::template arg<5>::type x5, \
+ typename T::template arg<6>::type x6, \
+ typename T::template arg<7>::type x7, \
+ typename T::template arg<8>::type x8, \
+ typename T::template arg<9>::type x9) { \
+ return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8, \
+ x9); \
+ }
+#define DLWRAP_INSTANTIATE_11(SYM_USE, SYM_DEF, T) \
+ T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
+ typename T::template arg<1>::type x1, \
+ typename T::template arg<2>::type x2, \
+ typename T::template arg<3>::type x3, \
+ typename T::template arg<4>::type x4, \
+ typename T::template arg<5>::type x5, \
+ typename T::template arg<6>::type x6, \
+ typename T::template arg<7>::type x7, \
+ typename T::template arg<8>::type x8, \
+ typename T::template arg<9>::type x9, \
+ typename T::template arg<10>::type x10) { \
+ return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8, \
+ x9, x10); \
+ }
+
+#endif
diff --git a/openmp/libomptarget/plugins/cuda/CMakeLists.txt b/openmp/libomptarget/plugins/cuda/CMakeLists.txt
index 93887154bf19..e5b2edfa664e 100644
--- a/openmp/libomptarget/plugins/cuda/CMakeLists.txt
+++ b/openmp/libomptarget/plugins/cuda/CMakeLists.txt
@@ -15,12 +15,6 @@ if (NOT(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE
elseif (NOT LIBOMPTARGET_DEP_LIBELF_FOUND)
libomptarget_say("Not building CUDA offloading plugin: libelf dependency not found.")
return()
-elseif(NOT LIBOMPTARGET_DEP_CUDA_FOUND)
- libomptarget_say("Not building CUDA offloading plugin: CUDA not found in system.")
- return()
-elseif(NOT LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND)
- libomptarget_say("Not building CUDA offloading plugin: CUDA Driver API not found in system.")
- return()
endif()
libomptarget_say("Building CUDA offloading plugin.")
@@ -28,10 +22,22 @@ libomptarget_say("Building CUDA offloading plugin.")
# Define the suffix for the runtime messaging dumps.
add_definitions(-DTARGET_NAME=CUDA)
-include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS})
include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS})
-add_library(omptarget.rtl.cuda SHARED src/rtl.cpp)
+option(LIBOMPTARGET_DLOPEN_LIBCUDA "Build with dlopened libcuda" OFF)
+
+if (LIBOMPTARGET_DEP_CUDA_FOUND AND LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND
+ AND NOT LIBOMPTARGET_DLOPEN_LIBCUDA)
+ libomptarget_say("Building CUDA plugin linked against libcuda")
+ include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS})
+ add_library(omptarget.rtl.cuda SHARED src/rtl.cpp)
+ set (LIBOMPTARGET_DEP_LIBRARIES ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES})
+else()
+ libomptarget_say("Building CUDA plugin for dlopened libcuda")
+ include_directories(dynamic_cuda)
+ add_library(omptarget.rtl.cuda SHARED src/rtl.cpp dynamic_cuda/cuda.cpp)
+ set (LIBOMPTARGET_DEP_LIBRARIES ${CMAKE_DL_LIBS})
+endif()
# Install plugin under the lib destination folder.
install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
@@ -39,7 +45,7 @@ install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}
target_link_libraries(omptarget.rtl.cuda
elf_common
MemoryManager
- ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES}
+ ${LIBOMPTARGET_DEP_LIBRARIES}
${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
"-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
"-Wl,-z,defs")
diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
new file mode 100644
index 000000000000..cc7bc42412f6
--- /dev/null
+++ b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
@@ -0,0 +1,99 @@
+//===--- cuda/dynamic_cuda/cuda.cpp ------------------------------ C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implement subset of cuda api by calling into cuda library via dlopen
+// Does the dlopen/dlsym calls as part of the call to cuInit
+//
+//===----------------------------------------------------------------------===//
+
+#include "cuda.h"
+#include "Debug.h"
+#include "dlwrap.h"
+
+#include <dlfcn.h>
+
+DLWRAP_INTERNAL(cuInit, 1);
+
+DLWRAP(cuCtxGetDevice, 1);
+DLWRAP(cuDeviceGet, 2);
+DLWRAP(cuDeviceGetAttribute, 3);
+DLWRAP(cuDeviceGetCount, 1);
+DLWRAP(cuFuncGetAttribute, 3);
+
+DLWRAP(cuGetErrorString, 2);
+DLWRAP(cuLaunchKernel, 11);
+
+DLWRAP(cuMemAlloc, 2);
+DLWRAP(cuMemcpyDtoDAsync, 4);
+
+DLWRAP(cuMemcpyDtoH, 3);
+DLWRAP(cuMemcpyDtoHAsync, 4);
+DLWRAP(cuMemcpyHtoD, 3);
+DLWRAP(cuMemcpyHtoDAsync, 4);
+
+DLWRAP(cuMemFree, 1);
+DLWRAP(cuModuleGetFunction, 3);
+DLWRAP(cuModuleGetGlobal, 4);
+
+DLWRAP(cuModuleUnload, 1);
+DLWRAP(cuStreamCreate, 2);
+DLWRAP(cuStreamDestroy, 1);
+DLWRAP(cuStreamSynchronize, 1);
+DLWRAP(cuCtxSetCurrent, 1);
+DLWRAP(cuDevicePrimaryCtxRelease, 1);
+DLWRAP(cuDevicePrimaryCtxGetState, 3);
+DLWRAP(cuDevicePrimaryCtxSetFlags, 2);
+DLWRAP(cuDevicePrimaryCtxRetain, 2);
+DLWRAP(cuModuleLoadDataEx, 5);
+
+DLWRAP(cuDeviceCanAccessPeer, 3);
+DLWRAP(cuCtxEnablePeerAccess, 2);
+DLWRAP(cuMemcpyPeerAsync, 6);
+
+DLWRAP_FINALIZE();
+
+#ifndef DYNAMIC_CUDA_PATH
+#define DYNAMIC_CUDA_PATH "libcuda.so"
+#endif
+
+#define TARGET_NAME CUDA
+#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
+
+static bool checkForCUDA() {
+  // Opens DYNAMIC_CUDA_PATH and resolves every symbol registered with dlwrap.
+  // Yields true only when dlopen and each dlsym lookup succeeded.
+  const char *LibPath = DYNAMIC_CUDA_PATH;
+  void *Handle = dlopen(LibPath, RTLD_NOW);
+  if (Handle == nullptr) {
+    DP("Unable to load library '%s': %s!\n", LibPath, dlerror());
+    return false;
+  }
+
+  const size_t NumSymbols = dlwrap::size();
+  for (size_t Idx = 0; Idx != NumSymbols; ++Idx) {
+    const char *Name = dlwrap::symbol(Idx);
+    void *Addr = dlsym(Handle, Name);
+    if (!Addr) {
+      DP("Unable to find '%s' in '%s'!\n", Name, LibPath);
+      return false;
+    }
+    // Cache the address in the slot the generated wrapper reads from.
+    *dlwrap::pointer(Idx) = Addr;
+  }
+
+  return true;
+}
+
+CUresult cuInit(unsigned X) {
+  // Note: rtl.cpp calls this exactly once, from a global constructor, so
+  // repeated or concurrent calls do not need to be handled here
+  const bool Loaded = checkForCUDA();
+  if (Loaded)
+    return dlwrap_cuInit(X);
+  return CUDA_ERROR_INVALID_VALUE;
+}
diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
new file mode 100644
index 000000000000..832c26965144
--- /dev/null
+++ b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
@@ -0,0 +1,104 @@
+//===--- cuda/dynamic_cuda/cuda.h --------------------------------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The parts of the cuda api that are presently in use by the openmp cuda plugin
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef DYNAMIC_CUDA_CUDA_H_INCLUDED
+#define DYNAMIC_CUDA_CUDA_H_INCLUDED
+
+#include <cstddef>
+#include <cstdint>
+
+typedef int CUdevice;
+typedef uintptr_t CUdeviceptr;
+typedef struct CUmod_st *CUmodule;
+typedef struct CUctx_st *CUcontext;
+typedef struct CUfunc_st *CUfunction;
+typedef struct CUstream_st *CUstream;
+
+typedef enum cudaError_enum {
+ CUDA_SUCCESS = 0,
+ CUDA_ERROR_INVALID_VALUE = 1,
+} CUresult;
+
+typedef enum CUstream_flags_enum {
+ CU_STREAM_DEFAULT = 0x0,
+ CU_STREAM_NON_BLOCKING = 0x1,
+} CUstream_flags;
+
+typedef enum CUdevice_attribute_enum {
+ CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,
+ CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,
+ CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,
+} CUdevice_attribute;
+
+typedef enum CUfunction_attribute_enum {
+ CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
+} CUfunction_attribute;
+
+typedef enum CUctx_flags_enum {
+ CU_CTX_SCHED_BLOCKING_SYNC = 0x04,
+ CU_CTX_SCHED_MASK = 0x07,
+} CUctx_flags;
+
+#define cuMemFree cuMemFree_v2
+#define cuMemAlloc cuMemAlloc_v2
+#define cuMemcpyDtoH cuMemcpyDtoH_v2
+#define cuMemcpyHtoD cuMemcpyHtoD_v2
+#define cuStreamDestroy cuStreamDestroy_v2
+#define cuModuleGetGlobal cuModuleGetGlobal_v2
+#define cuMemcpyDtoHAsync cuMemcpyDtoHAsync_v2
+#define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2
+#define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2
+#define cuDevicePrimaryCtxRelease cuDevicePrimaryCtxRelease_v2
+#define cuDevicePrimaryCtxSetFlags cuDevicePrimaryCtxSetFlags_v2
+
+CUresult cuCtxGetDevice(CUdevice *);
+CUresult cuDeviceGet(CUdevice *, int);
+CUresult cuDeviceGetAttribute(int *, CUdevice_attribute, CUdevice);
+CUresult cuDeviceGetCount(int *);
+CUresult cuFuncGetAttribute(int *, CUfunction_attribute, CUfunction);
+
+CUresult cuGetErrorString(CUresult, const char **);
+CUresult cuInit(unsigned);
+CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned,
+ unsigned, unsigned, unsigned, CUstream, void **,
+ void **);
+
+CUresult cuMemAlloc(CUdeviceptr *, size_t);
+CUresult cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream);
+
+CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t);
+CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream);
+CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t);
+CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);
+
+CUresult cuMemFree(CUdeviceptr);
+CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *);
+CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *);
+
+CUresult cuModuleUnload(CUmodule);
+CUresult cuStreamCreate(CUstream *, unsigned);
+CUresult cuStreamDestroy(CUstream);
+CUresult cuStreamSynchronize(CUstream);
+CUresult cuCtxSetCurrent(CUcontext);
+CUresult cuDevicePrimaryCtxRelease(CUdevice);
+CUresult cuDevicePrimaryCtxGetState(CUdevice, unsigned *, int *);
+CUresult cuDevicePrimaryCtxSetFlags(CUdevice, unsigned);
+CUresult cuDevicePrimaryCtxRetain(CUcontext *, CUdevice);
+CUresult cuModuleLoadDataEx(CUmodule *, const void *, unsigned, void *,
+ void **);
+
+CUresult cuDeviceCanAccessPeer(int *, CUdevice, CUdevice);
+CUresult cuCtxEnablePeerAccess(CUcontext, unsigned);
+CUresult cuMemcpyPeerAsync(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext,
+ size_t, CUstream);
+
+#endif
More information about the Openmp-commits
mailing list