[clang] [llvm] [Offload][CUDA] Add initial cuda_runtime.h overlay (PR #94821)
Johannes Doerfert via cfe-commits
cfe-commits at lists.llvm.org
Fri Jun 7 17:11:32 PDT 2024
https://github.com/jdoerfert created https://github.com/llvm/llvm-project/pull/94821
This provides the header overlay for cuda_runtime.h which is found
before any CUDA installation (none is necessary). Some basic APIs are
defined in terms of the omp_target_* ones, but with the pending
LLVM/Offload API redesign the requirements of CUDA should be taken
into account.
Note: Async is not exposed by the existing runtime thus the streams are
ignored. I'll address this in a follow up.
>From 36618e65d94ffa3e83464b7d19ff6cd7d5855abf Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Wed, 5 Jun 2024 16:51:51 -0700
Subject: [PATCH 1/4] [Offload][NFCI] Initialize the KernelArgsTy to default
values
---
offload/include/Shared/APITypes.h | 30 +++++++++++++++++-------------
1 file changed, 17 insertions(+), 13 deletions(-)
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index e8fc27785b6c2..fd315c6b992b9 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -89,22 +89,26 @@ struct __tgt_async_info {
/// This struct contains all of the arguments to a target kernel region launch.
struct KernelArgsTy {
- uint32_t Version; // Version of this struct for ABI compatibility.
- uint32_t NumArgs; // Number of arguments in each input pointer.
- void **ArgBasePtrs; // Base pointer of each argument (e.g. a struct).
- void **ArgPtrs; // Pointer to the argument data.
- int64_t *ArgSizes; // Size of the argument data in bytes.
- int64_t *ArgTypes; // Type of the data (e.g. to / from).
- void **ArgNames; // Name of the data for debugging, possibly null.
- void **ArgMappers; // User-defined mappers, possibly null.
- uint64_t Tripcount; // Tripcount for the teams / distribute loop, 0 otherwise.
+ uint32_t Version = 0; // Version of this struct for ABI compatibility.
+ uint32_t NumArgs = 0; // Number of arguments in each input pointer.
+ void **ArgBasePtrs =
+ nullptr; // Base pointer of each argument (e.g. a struct).
+ void **ArgPtrs = nullptr; // Pointer to the argument data.
+ int64_t *ArgSizes = nullptr; // Size of the argument data in bytes.
+ int64_t *ArgTypes = nullptr; // Type of the data (e.g. to / from).
+ void **ArgNames = nullptr; // Name of the data for debugging, possibly null.
+ void **ArgMappers = nullptr; // User-defined mappers, possibly null.
+ uint64_t Tripcount =
+ 0; // Tripcount for the teams / distribute loop, 0 otherwise.
struct {
uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
uint64_t Unused : 63;
- } Flags;
- uint32_t NumTeams[3]; // The number of teams (for x,y,z dimension).
- uint32_t ThreadLimit[3]; // The number of threads (for x,y,z dimension).
- uint32_t DynCGroupMem; // Amount of dynamic cgroup memory requested.
+ } Flags = {0, 0};
+ uint32_t NumTeams[3] = {0, 0,
+ 0}; // The number of teams (for x,y,z dimension).
+ uint32_t ThreadLimit[3] = {0, 0,
+ 0}; // The number of threads (for x,y,z dimension).
+ uint32_t DynCGroupMem = 0; // Amount of dynamic cgroup memory requested.
};
static_assert(sizeof(KernelArgsTy().Flags) == sizeof(uint64_t),
"Invalid struct size");
>From 3d5c61a78e91ecb379a2bfac71988eaf8e5cd9cd Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Mon, 3 Jun 2024 19:52:12 -0700
Subject: [PATCH 2/4] [Offload][CUDA] Allow CUDA kernels to use LLVM/Offload
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Through the new `-foffload-via-llvm` flag, CUDA kernels can now be
lowered to the LLVM/Offload API. On the Clang side, this is simply done
by using the OpenMP offload toolchain and emitting calls to `llvm*`
functions to orchestrate the kernel launch rather than `cuda*`
functions. These `llvm*` functions are implemented on top of the
existing LLVM/Offload API.
As we are about to redefine the Offload API, this will help us in the
design process as a second offload language.
We do not support any CUDA APIs yet, however, we could:
https://www.osti.gov/servlets/purl/1892137
For proper host execution we need to resurrect/rebase
https://tianshilei.me/wp-content/uploads/2021/12/llpp-2021.pdf
(which was designed for debugging).
```
❯❯❯ cat test.cu
extern "C" {
void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
}
__global__ void square(int *A) { *A = 42; }
int main(int argc, char **argv) {
int DevNo = 0;
int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
*Ptr = 7;
printf("Ptr %p, *Ptr %i\n", Ptr, *Ptr);
square<<<1, 1>>>(Ptr);
printf("Ptr %p, *Ptr %i\n", Ptr, *Ptr);
llvm_omp_target_free_shared(Ptr, DevNo);
}
❯❯❯ clang++ test.cu -O3 -o test123 -foffload-via-llvm --offload-arch=native
❯❯❯ llvm-objdump --offloading test123
test123: file format elf64-x86-64
OFFLOADING IMAGE [0]:
kind elf
arch gfx90a
triple amdgcn-amd-amdhsa
producer openmp
❯❯❯ LIBOMPTARGET_INFO=16 ./test123
Ptr 0x155448ac8000, *Ptr 7
Ptr 0x155448ac8000, *Ptr 42
```
---
clang/include/clang/Basic/LangOptions.def | 1 +
clang/include/clang/Driver/Options.td | 6 ++
clang/lib/CodeGen/CGCUDANV.cpp | 34 ++++++---
clang/lib/Driver/Driver.cpp | 19 +++--
clang/lib/Driver/ToolChains/Clang.cpp | 26 ++++++-
clang/lib/Driver/ToolChains/CommonArgs.cpp | 7 +-
clang/lib/Headers/CMakeLists.txt | 3 +
.../__clang_openmp_device_functions.h | 6 +-
.../Headers/openmp_wrappers/__llvm_offload.h | 31 ++++++++
.../openmp_wrappers/__llvm_offload_device.h | 10 +++
.../openmp_wrappers/__llvm_offload_host.h | 15 ++++
clang/lib/Sema/SemaCUDA.cpp | 3 +
clang/test/Driver/cuda-via-liboffload.cu | 23 ++++++
offload/include/Shared/APITypes.h | 5 +-
offload/include/omptarget.h | 2 +-
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 12 ++-
.../common/src/PluginInterface.cpp | 14 +++-
offload/src/CMakeLists.txt | 1 +
offload/src/KernelLanguage/API.cpp | 76 +++++++++++++++++++
offload/src/exports | 3 +
offload/test/offloading/CUDA/basic_launch.cu | 29 +++++++
.../CUDA/basic_launch_blocks_and_threads.cu | 30 ++++++++
.../offloading/CUDA/basic_launch_multi_arg.cu | 33 ++++++++
23 files changed, 356 insertions(+), 33 deletions(-)
create mode 100644 clang/lib/Headers/openmp_wrappers/__llvm_offload.h
create mode 100644 clang/lib/Headers/openmp_wrappers/__llvm_offload_device.h
create mode 100644 clang/lib/Headers/openmp_wrappers/__llvm_offload_host.h
create mode 100644 clang/test/Driver/cuda-via-liboffload.cu
create mode 100644 offload/src/KernelLanguage/API.cpp
create mode 100644 offload/test/offloading/CUDA/basic_launch.cu
create mode 100644 offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu
create mode 100644 offload/test/offloading/CUDA/basic_launch_multi_arg.cu
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 4061451b2150a..8aff98867202e 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -288,6 +288,7 @@ LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kern
LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP")
LANGOPT(GPUExcludeWrongSideOverloads, 1, 0, "always exclude wrong side overloads in overloading resolution for CUDA/HIP")
LANGOPT(OffloadingNewDriver, 1, 0, "use the new driver for generating offloading code.")
+LANGOPT(OffloadViaLLVM, 1, 0, "target LLVM/Offload as portable offloading runtime.")
LANGOPT(SYCLIsDevice , 1, 0, "Generate code for SYCL device")
LANGOPT(SYCLIsHost , 1, 0, "SYCL host compilation")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 57f37c5023110..a09d75917ff98 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1275,6 +1275,12 @@ def no_offload_compress : Flag<["--"], "no-offload-compress">;
def offload_compression_level_EQ : Joined<["--"], "offload-compression-level=">,
Flags<[HelpHidden]>,
HelpText<"Compression level for offload device binaries (HIP only)">;
+
+defm offload_via_llvm : BoolFOption<"offload-via-llvm",
+ LangOpts<"OffloadViaLLVM">, DefaultFalse,
+ PosFlag<SetTrue, [], [ClangOption, CC1Option], "Use">,
+ NegFlag<SetFalse, [], [ClangOption], "Don't use">,
+ BothFlags<[], [ClangOption], " LLVM/Offload as portable offloading runtime.">>;
}
// CUDA options
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 43dfbbb90dd52..acbc7e9e43635 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -19,6 +19,7 @@
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -36,6 +37,11 @@ constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
class CGNVCUDARuntime : public CGCUDARuntime {
+ /// The prefix used for function calls and section names (CUDA, HIP, LLVM)
+ StringRef Prefix;
+ /// TODO: We should transition the OpenMP section to LLVM/Offload
+ StringRef SectionPrefix;
+
private:
llvm::IntegerType *IntTy, *SizeTy;
llvm::Type *VoidTy;
@@ -191,15 +197,11 @@ class CGNVCUDARuntime : public CGCUDARuntime {
} // end anonymous namespace
std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
- if (CGM.getLangOpts().HIP)
- return ((Twine("hip") + Twine(FuncName)).str());
- return ((Twine("cuda") + Twine(FuncName)).str());
+ return (Prefix + FuncName).str();
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
- if (CGM.getLangOpts().HIP)
- return ((Twine("__hip") + Twine(FuncName)).str());
- return ((Twine("__cuda") + Twine(FuncName)).str());
+ return ("__" + Prefix + FuncName).str();
}
static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
@@ -227,6 +229,14 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
SizeTy = CGM.SizeTy;
VoidTy = CGM.VoidTy;
PtrTy = CGM.UnqualPtrTy;
+
+ if (CGM.getLangOpts().OffloadViaLLVM) {
+ Prefix = "llvm";
+ SectionPrefix = "omp";
+ } else if (CGM.getLangOpts().HIP)
+ SectionPrefix = Prefix = "hip";
+ else
+ SectionPrefix = Prefix = "cuda";
}
llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
@@ -305,7 +315,8 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
}
if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
CudaFeature::CUDA_USES_NEW_LAUNCH) ||
- (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
+ (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI) ||
+ (CGF.getLangOpts().OffloadViaLLVM))
emitDeviceStubBodyNew(CGF, Args);
else
emitDeviceStubBodyLegacy(CGF, Args);
@@ -1129,8 +1140,9 @@ void CGNVCUDARuntime::transformManagedVars() {
// registered. The linker will provide a pointer to this section so we can
// register the symbols with the linked device image.
void CGNVCUDARuntime::createOffloadingEntries() {
- StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
- : "cuda_offloading_entries";
+ SmallVector<char, 32> Out;
+ StringRef Section = (SectionPrefix + "_offloading_entries").toStringRef(Out);
+
llvm::Module &M = CGM.getModule();
for (KernelInfo &I : EmittedKernels)
llvm::offloading::emitOffloadingEntry(
@@ -1199,7 +1211,9 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
}
return nullptr;
}
- if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
+ if (CGM.getLangOpts().OffloadViaLLVM)
+ createOffloadingEntries();
+ else if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
createOffloadingEntries();
else
return makeModuleCtorFunction();
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index f5ea73a04ae5c..815149a49d018 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -792,11 +792,13 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
}) ||
C.getInputArgs().hasArg(options::OPT_hip_link) ||
C.getInputArgs().hasArg(options::OPT_hipstdpar);
+ bool UseLLVMOffload = C.getInputArgs().hasArg(
+ options::OPT_foffload_via_llvm, options::OPT_fno_offload_via_llvm, false);
if (IsCuda && IsHIP) {
Diag(clang::diag::err_drv_mix_cuda_hip);
return;
}
- if (IsCuda) {
+ if (IsCuda && !UseLLVMOffload) {
const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
const llvm::Triple &HostTriple = HostTC->getTriple();
auto OFK = Action::OFK_Cuda;
@@ -818,7 +820,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
CudaInstallation.WarnIfUnsupportedVersion();
}
C.addOffloadDeviceToolChain(CudaTC.get(), OFK);
- } else if (IsHIP) {
+ } else if (IsHIP && !UseLLVMOffload) {
if (auto *OMPTargetArg =
C.getInputArgs().getLastArg(options::OPT_fopenmp_targets_EQ)) {
Diag(clang::diag::err_drv_unsupported_opt_for_language_mode)
@@ -842,10 +844,11 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
// We need to generate an OpenMP toolchain if the user specified targets with
// the -fopenmp-targets option or used --offload-arch with OpenMP enabled.
bool IsOpenMPOffloading =
- C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
- options::OPT_fno_openmp, false) &&
- (C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ) ||
- C.getInputArgs().hasArg(options::OPT_offload_arch_EQ));
+ ((IsCuda || IsHIP) && UseLLVMOffload) ||
+ (C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
+ options::OPT_fno_openmp, false) &&
+ (C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ) ||
+ C.getInputArgs().hasArg(options::OPT_offload_arch_EQ)));
if (IsOpenMPOffloading) {
// We expect that -fopenmp-targets is always used in conjunction with the
// option -fopenmp specifying a valid runtime with offloading support, i.e.
@@ -873,7 +876,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
for (StringRef T : OpenMPTargets->getValues())
OpenMPTriples.insert(T);
} else if (C.getInputArgs().hasArg(options::OPT_offload_arch_EQ) &&
- !IsHIP && !IsCuda) {
+ ((!IsHIP && !IsCuda) || UseLLVMOffload)) {
const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
auto AMDTriple = getHIPOffloadTargetTriple(*this, C.getInputArgs());
auto NVPTXTriple = getNVIDIAOffloadTargetTriple(*this, C.getInputArgs(),
@@ -4138,6 +4141,8 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
bool UseNewOffloadingDriver =
C.isOffloadingHostKind(Action::OFK_OpenMP) ||
+ Args.hasFlag(options::OPT_foffload_via_llvm,
+ options::OPT_fno_offload_via_llvm, false) ||
Args.hasFlag(options::OPT_offload_new_driver,
options::OPT_no_offload_new_driver, false);
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 4e1c52462e584..07b7e60caa2b4 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1125,6 +1125,18 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
CmdArgs.push_back("__clang_openmp_device_functions.h");
}
+ if (Args.hasArg(options::OPT_foffload_via_llvm)) {
+ CmdArgs.push_back("-include");
+ SmallString<128> P(D.ResourceDir);
+ llvm::sys::path::append(P, "include");
+ llvm::sys::path::append(P, "openmp_wrappers");
+ if (JA.isDeviceOffloading(Action::OFK_OpenMP))
+ llvm::sys::path::append(P, "__llvm_offload_device.h");
+ else
+ llvm::sys::path::append(P, "__llvm_offload_host.h");
+ CmdArgs.push_back(Args.MakeArgString(P));
+ }
+
// Add -i* options, and automatically translate to
// -include-pch/-include-pth for transparent PCH support. It's
// wonky, but we include looking for .gch so we can support seamless
@@ -6672,11 +6684,17 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
Args.addOptOutFlag(CmdArgs, options::OPT_fopenmp_extensions,
options::OPT_fno_openmp_extensions);
}
-
- // Forward the new driver to change offloading code generation.
- if (Args.hasFlag(options::OPT_offload_new_driver,
- options::OPT_no_offload_new_driver, false))
+ // Forward the offload runtime change to code generation, liboffload implies
+ // new driver. Otherwise, check if we should forward the new driver to change
+ // offloading code generation.
+ if (Args.hasFlag(options::OPT_foffload_via_llvm,
+ options::OPT_fno_offload_via_llvm, false)) {
+ CmdArgs.push_back("--offload-new-driver");
+ CmdArgs.push_back("-foffload-via-llvm");
+ } else if (Args.hasFlag(options::OPT_offload_new_driver,
+ options::OPT_no_offload_new_driver, false)) {
CmdArgs.push_back("--offload-new-driver");
+ }
SanitizeArgs.addArgs(TC, Args, CmdArgs, InputType);
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 71e993119436a..74a69f65f7ad5 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1144,8 +1144,13 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs,
bool ForceStaticHostRuntime, bool IsOffloadingHost,
bool GompNeedsRT) {
if (!Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
- options::OPT_fno_openmp, false))
+ options::OPT_fno_openmp, false)) {
+ // We need libomptarget (liboffload) if it's the chosen offloading runtime.
+ if (Args.hasFlag(options::OPT_foffload_via_llvm,
+ options::OPT_fno_offload_via_llvm, false))
+ CmdArgs.push_back("-lomptarget");
return false;
+ }
Driver::OpenMPRuntimeKind RTKind = TC.getDriver().getOpenMPRuntime(Args);
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index d3090e488306f..215dfe207c228 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -323,6 +323,9 @@ set(openmp_wrapper_files
openmp_wrappers/__clang_openmp_device_functions.h
openmp_wrappers/complex_cmath.h
openmp_wrappers/new
+ openmp_wrappers/__llvm_offload.h
+ openmp_wrappers/__llvm_offload_host.h
+ openmp_wrappers/__llvm_offload_device.h
)
set(llvm_libc_wrapper_files
diff --git a/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h b/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h
index d5b6846b03488..889d607557ecb 100644
--- a/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h
+++ b/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h
@@ -10,14 +10,11 @@
#ifndef __CLANG_OPENMP_DEVICE_FUNCTIONS_H__
#define __CLANG_OPENMP_DEVICE_FUNCTIONS_H__
-#ifndef _OPENMP
-#error "This file is for OpenMP compilation only."
-#endif
-
#ifdef __cplusplus
extern "C" {
#endif
+#ifdef __NVPTX__
#pragma omp begin declare variant match( \
device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
@@ -34,6 +31,7 @@ extern "C" {
#undef __CUDA__
#pragma omp end declare variant
+#endif
#ifdef __AMDGCN__
#pragma omp begin declare variant match(device = {arch(amdgcn)})
diff --git a/clang/lib/Headers/openmp_wrappers/__llvm_offload.h b/clang/lib/Headers/openmp_wrappers/__llvm_offload.h
new file mode 100644
index 0000000000000..d78e3b41f99a5
--- /dev/null
+++ b/clang/lib/Headers/openmp_wrappers/__llvm_offload.h
@@ -0,0 +1,31 @@
+/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#include <stdlib.h>
+
+#define __host__ __attribute__((host))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+#define __managed__ __attribute__((managed))
+
+extern "C" {
+
+typedef struct dim3 {
+ dim3() {}
+ dim3(unsigned x) : x(x) {}
+ unsigned x = 0, y = 0, z = 0;
+} dim3;
+
+// TODO: For some reason the CUDA device compilation requires this declaration
+// to be present but it should not.
+unsigned __llvmPushCallConfiguration(dim3 gridDim, dim3 blockDim,
+ size_t sharedMem = 0, void *stream = 0);
+}
diff --git a/clang/lib/Headers/openmp_wrappers/__llvm_offload_device.h b/clang/lib/Headers/openmp_wrappers/__llvm_offload_device.h
new file mode 100644
index 0000000000000..1a813b331515b
--- /dev/null
+++ b/clang/lib/Headers/openmp_wrappers/__llvm_offload_device.h
@@ -0,0 +1,10 @@
+/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#include "__llvm_offload.h"
diff --git a/clang/lib/Headers/openmp_wrappers/__llvm_offload_host.h b/clang/lib/Headers/openmp_wrappers/__llvm_offload_host.h
new file mode 100644
index 0000000000000..160289d169b55
--- /dev/null
+++ b/clang/lib/Headers/openmp_wrappers/__llvm_offload_host.h
@@ -0,0 +1,15 @@
+/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#include "__llvm_offload.h"
+
+extern "C" {
+unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+ void **args, size_t sharedMem = 0, void *stream = 0);
+}
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
index 80ea43dc5316e..b507e19556363 100644
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -1059,6 +1059,9 @@ void SemaCUDA::inheritTargetAttrs(FunctionDecl *FD,
}
std::string SemaCUDA::getConfigureFuncName() const {
+ if (getLangOpts().OffloadViaLLVM)
+ return "__llvmPushCallConfiguration";
+
if (getLangOpts().HIP)
return getLangOpts().HIPUseNewLaunchAPI ? "__hipPushCallConfiguration"
: "hipConfigureCall";
diff --git a/clang/test/Driver/cuda-via-liboffload.cu b/clang/test/Driver/cuda-via-liboffload.cu
new file mode 100644
index 0000000000000..68dc963e906b2
--- /dev/null
+++ b/clang/test/Driver/cuda-via-liboffload.cu
@@ -0,0 +1,23 @@
+// RUN: %clang -### -target x86_64-linux-gnu -foffload-via-llvm -ccc-print-bindings \
+// RUN: --offload-arch=sm_35 --offload-arch=sm_70 %s 2>&1 \
+// RUN: | FileCheck -check-prefix BINDINGS %s
+
+// BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[HOST_BC:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[PTX_SM_35:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX_SM_35]]"], output: "[[CUBIN_SM_35:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[PTX_SM_70:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX_SM_70:.+]]"], output: "[[CUBIN_SM_70:.+]]"
+// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[CUBIN_SM_35]]", "[[CUBIN_SM_70]]"], output: "[[BINARY:.+]]"
+// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[BINARY]]"], output: "[[HOST_OBJ:.+]]"
+// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out"
+
+// RUN: %clang -### -target x86_64-linux-gnu -foffload-via-llvm -ccc-print-bindings \
+// RUN: --offload-arch=sm_35 --offload-arch=sm_70 %s 2>&1 \
+// RUN: | FileCheck -check-prefix BINDINGS-DEVICE %s
+
+// BINDINGS-DEVICE: # "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[PTX:.+]]"
+// BINDINGS-DEVICE: # "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX]]"], output: "[[CUBIN:.+]]"
+
+// RUN: %clang -### -target x86_64-linux-gnu -ccc-print-bindings --offload-link -foffload-via-llvm %s 2>&1 | FileCheck -check-prefix DEVICE-LINK %s
+
+// DEVICE-LINK: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[INPUT:.+]]"], output: "a.out"
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index fd315c6b992b9..2c41a6e2ba43c 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -102,8 +102,9 @@ struct KernelArgsTy {
0; // Tripcount for the teams / distribute loop, 0 otherwise.
struct {
uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
- uint64_t Unused : 63;
- } Flags = {0, 0};
+ uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
+ uint64_t Unused : 62;
+ } Flags = {0, 0, 0};
uint32_t NumTeams[3] = {0, 0,
0}; // The number of teams (for x,y,z dimension).
uint32_t ThreadLimit[3] = {0, 0,
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 323dee41630f2..2b6445e9fbe55 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -107,7 +107,7 @@ enum TargetAllocTy : int32_t {
inline KernelArgsTy CTorDTorKernelArgs = {1, 0, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr,
- 0, {0,0}, {1, 0, 0}, {1, 0, 0}, 0};
+ 0, {0,0,0}, {1, 0, 0}, {1, 0, 0}, 0};
struct DeviceTy;
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index c6dd954746e4a..bb9c2291c80a8 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3268,6 +3268,11 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads, uint64_t NumBlocks,
KernelArgsTy &KernelArgs, void *Args,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+ if (KernelArgs.Flags.IsCUDA) {
+ // For CUDA kernels we compute the number of arguments here.
+ KernelArgs.NumArgs = (ArgsSize - ImplicitArgsSize) / sizeof(void *);
+ }
+
const uint32_t KernelArgsSize = KernelArgs.NumArgs * sizeof(void *);
if (ArgsSize < KernelArgsSize)
@@ -3310,9 +3315,14 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// Copy the explicit arguments.
// TODO: We should expose the args memory manager alloc to the common part as
// alternative to copying them twice.
- if (KernelArgs.NumArgs)
+ if (KernelArgs.NumArgs && !KernelArgs.Flags.IsCUDA) {
std::memcpy(AllArgs, *static_cast<void **>(Args),
sizeof(void *) * KernelArgs.NumArgs);
+ } else {
+ for (uint32_t I = 0; I < KernelArgs.NumArgs; ++I)
+ std::memcpy(advanceVoidPtr(AllArgs, sizeof(void *) * I),
+ static_cast<void **>(Args)[I], sizeof(void *));
+ }
AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 913721a15d713..7ecb98a156f7c 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -549,9 +549,17 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
if (!KernelLaunchEnvOrErr)
return KernelLaunchEnvOrErr.takeError();
- void *KernelArgsPtr =
- prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs, Args,
- Ptrs, *KernelLaunchEnvOrErr);
+ void *KernelArgsPtr;
+
+ // Kernel languages don't use indirection.
+ // TODO: In the new API we should have the "source language" encoded.
+ if (KernelArgs.Flags.IsCUDA) {
+ KernelArgsPtr = KernelArgs.ArgPtrs;
+ } else {
+ KernelArgsPtr =
+ prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs,
+ Args, Ptrs, *KernelLaunchEnvOrErr);
+ }
uint32_t NumThreads = getNumThreads(GenericDevice, KernelArgs.ThreadLimit);
uint64_t NumBlocks =
diff --git a/offload/src/CMakeLists.txt b/offload/src/CMakeLists.txt
index efa5cdab33ec9..b442df45deaa5 100644
--- a/offload/src/CMakeLists.txt
+++ b/offload/src/CMakeLists.txt
@@ -22,6 +22,7 @@ add_llvm_library(omptarget
OpenMP/InteropAPI.cpp
OpenMP/OMPT/Callback.cpp
+ KernelLanguage/API.cpp
ADDITIONAL_HEADER_DIRS
${LIBOMPTARGET_INCLUDE_DIR}
diff --git a/offload/src/KernelLanguage/API.cpp b/offload/src/KernelLanguage/API.cpp
new file mode 100644
index 0000000000000..07517ad1fc18a
--- /dev/null
+++ b/offload/src/KernelLanguage/API.cpp
@@ -0,0 +1,76 @@
+//===------ API.cpp - Kernel Language (CUDA/HIP) entry points ----- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "Shared/APITypes.h"
+
+#include <cstdio>
+
+struct dim3 {
+ unsigned x = 0, y = 0, z = 0;
+};
+
+struct __omp_kernel_t {
+ dim3 __grid_size;
+ dim3 __block_size;
+ size_t __shared_memory;
+
+ void *__stream;
+};
+
+static __omp_kernel_t __current_kernel = {};
+#pragma omp threadprivate(__current_kernel);
+
+extern "C" {
+
+// TODO: There is little reason we need to keep these names or the way calls are
+// issued. For now we do to avoid modifying Clang's CUDA codegen. Unclear when
+// we actually need to push/pop configurations.
+unsigned __llvmPushCallConfiguration(dim3 __grid_size, dim3 __block_size,
+ size_t __shared_memory, void *__stream) {
+ __omp_kernel_t &__kernel = __current_kernel;
+ __kernel.__grid_size = __grid_size;
+ __kernel.__block_size = __block_size;
+ __kernel.__shared_memory = __shared_memory;
+ __kernel.__stream = __stream;
+ return 0;
+}
+
+unsigned __llvmPopCallConfiguration(dim3 *__grid_size, dim3 *__block_size,
+ size_t *__shared_memory, void *__stream) {
+ __omp_kernel_t &__kernel = __current_kernel;
+ *__grid_size = __kernel.__grid_size;
+ *__block_size = __kernel.__block_size;
+ *__shared_memory = __kernel.__shared_memory;
+ *((void **)__stream) = __kernel.__stream;
+ return 0;
+}
+
+int __tgt_target_kernel(void *Loc, int64_t DeviceId, int32_t NumTeams,
+ int32_t ThreadLimit, const void *HostPtr,
+ KernelArgsTy *Args);
+
+unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+ void **args, size_t sharedMem, void *stream) {
+ KernelArgsTy Args = {};
+ Args.DynCGroupMem = sharedMem;
+ Args.NumTeams[0] = gridDim.x;
+ Args.NumTeams[1] = gridDim.y;
+ Args.NumTeams[2] = gridDim.z;
+ Args.ThreadLimit[0] = blockDim.x;
+ Args.ThreadLimit[1] = blockDim.y;
+ Args.ThreadLimit[2] = blockDim.z;
+ Args.ArgPtrs = args;
+ Args.Flags.IsCUDA = true;
+ int rv = __tgt_target_kernel(nullptr, 0, gridDim.x,
+ blockDim.x, func, &Args);
+ return rv;
+}
+}
diff --git a/offload/src/exports b/offload/src/exports
index f95544ec8329c..7bdc7d2a531bb 100644
--- a/offload/src/exports
+++ b/offload/src/exports
@@ -71,6 +71,9 @@ VERS1.0 {
__tgt_interop_use;
__tgt_interop_destroy;
ompt_libomptarget_connect;
+ __llvmPushCallConfiguration;
+ __llvmPopCallConfiguration;
+ llvmLaunchKernel;
local:
*;
};
diff --git a/offload/test/offloading/CUDA/basic_launch.cu b/offload/test/offloading/CUDA/basic_launch.cu
new file mode 100644
index 0000000000000..a4955f8565f2d
--- /dev/null
+++ b/offload/test/offloading/CUDA/basic_launch.cu
@@ -0,0 +1,29 @@
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t
+// RUN: %t | %fcheck-generic
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+
+#include <stdio.h>
+
+extern "C" {
+void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
+void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
+}
+
+__global__ void square(int *A) { *A = 42; }
+
+int main(int argc, char **argv) {
+ int DevNo = 0;
+ int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
+ *Ptr = 7;
+ printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+ // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7
+ square<<<1, 1>>>(Ptr);
+ printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+ // CHECK: Ptr [[Ptr]], *Ptr: 42
+ llvm_omp_target_free_shared(Ptr, DevNo);
+}
diff --git a/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu b/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu
new file mode 100644
index 0000000000000..401b9e2d559b1
--- /dev/null
+++ b/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu
@@ -0,0 +1,30 @@
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t
+// RUN: %t | %fcheck-generic
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <stdio.h>
+
+extern "C" {
+void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
+void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
+}
+
+__global__ void square(int *A) {
+ __scoped_atomic_fetch_add(A, 1, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE);
+}
+
+int main(int argc, char **argv) {
+ int DevNo = 0;
+ int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
+ *Ptr = 0;
+ printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+ // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7
+ square<<<7, 6>>>(Ptr);
+ printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+ // CHECK: Ptr [[Ptr]], *Ptr: 42
+ llvm_omp_target_free_shared(Ptr, DevNo);
+}
diff --git a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
new file mode 100644
index 0000000000000..c836292127fea
--- /dev/null
+++ b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
@@ -0,0 +1,33 @@
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t
+// RUN: %t | %fcheck-generic
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <stdio.h>
+
+extern "C" {
+void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
+void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
+}
+
+__global__ void square(int *Dst, int *Src, short Q, short P) {
+ *Dst = (Src[0] + Src[1]) * (Q + P);
+}
+
+int main(int argc, char **argv) {
+ int DevNo = 0;
+ int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
+ int *Src = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(8, DevNo));
+ *Ptr = 7;
+ Src[0] = -2;
+ Src[1] = 8;
+ printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+ // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7
+ square<<<1, 1>>>(Ptr, Src, 3, 4);
+ printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+ // CHECK: Ptr [[Ptr]], *Ptr: 42
+ llvm_omp_target_free_shared(Ptr, DevNo);
+}
>From 038c7d57bfedd17c35e36d0a235542d9e10bb01a Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Fri, 7 Jun 2024 11:45:24 -0700
Subject: [PATCH 3/4] FIXES
---
clang/lib/CodeGen/CGCUDANV.cpp | 43 ++++++++++++++++---
clang/lib/Driver/ToolChains/Clang.cpp | 14 +++---
clang/lib/Headers/CMakeLists.txt | 21 ++++++---
.../__llvm_offload.h | 0
.../__llvm_offload_device.h | 0
.../__llvm_offload_host.h | 0
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 18 +++-----
offload/src/KernelLanguage/API.cpp | 4 +-
offload/test/lit.cfg | 2 +-
.../CUDA/basic_launch_blocks_and_threads.cu | 2 +-
.../offloading/CUDA/basic_launch_multi_arg.cu | 8 +++-
11 files changed, 78 insertions(+), 34 deletions(-)
rename clang/lib/Headers/{openmp_wrappers => llvm_offload_wrappers}/__llvm_offload.h (100%)
rename clang/lib/Headers/{openmp_wrappers => llvm_offload_wrappers}/__llvm_offload_device.h (100%)
rename clang/lib/Headers/{openmp_wrappers => llvm_offload_wrappers}/__llvm_offload_host.h (100%)
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index acbc7e9e43635..8e32aad88a26d 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -15,6 +15,7 @@
#include "CGCXXABI.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
+#include "clang/AST/CharUnits.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
@@ -138,6 +139,9 @@ class CGNVCUDARuntime : public CGCUDARuntime {
return DummyFunc;
}
+ Address prepareKernelArgs(CodeGenFunction &CGF, FunctionArgList &Args);
+ Address prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
+ FunctionArgList &Args);
void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
std::string getDeviceSideName(const NamedDecl *ND) override;
@@ -322,12 +326,30 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
emitDeviceStubBodyLegacy(CGF, Args);
}
-// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
-// array and kernels are launched using cudaLaunchKernel().
-void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
- FunctionArgList &Args) {
- // Build the shadow stack entry at the very start of the function.
+/// CUDA passes the arguments with a level of indirection. For example, a
+/// (void*, short, void*) is passed as {void **, short *, void **} to the launch
+/// function. For the LLVM/offload launch we flatten the arguments into the
+/// struct directly, thus pass {void *, short, void *}
+Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
+ FunctionArgList &Args) {
+ SmallVector<llvm::Type *> ArgTypes;
+ for (auto &Arg : Args)
+ ArgTypes.push_back(CGF.ConvertTypeForMem(Arg->getType()));
+
+ llvm::StructType *ST = llvm::StructType::create(ArgTypes);
+ Address KernelArgs = CGF.CreateTempAllocaWithoutCast(
+ ST, CharUnits::fromQuantity(16), "kernel_args");
+
+ for (unsigned i = 0; i < Args.size(); ++i) {
+ auto *ArgVal = CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(Args[i]));
+ CGF.Builder.CreateStore(ArgVal, CGF.Builder.CreateStructGEP(KernelArgs, i));
+ }
+ return KernelArgs;
+}
+
+Address CGNVCUDARuntime::prepareKernelArgs(CodeGenFunction &CGF,
+ FunctionArgList &Args) {
// Calculate amount of space we will need for all arguments. If we have no
// args, allocate a single pointer so we still have a valid pointer to the
// argument array that we can pass to runtime, even if it will be unused.
@@ -342,6 +364,17 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
VoidVarPtr, CGF.Builder.CreateConstGEP1_32(
PtrTy, KernelArgs.emitRawPointer(CGF), i));
}
+ return KernelArgs;
+}
+
+// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
+// array and kernels are launched using cudaLaunchKernel().
+void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
+ FunctionArgList &Args) {
+ // Build the shadow stack entry at the very start of the function.
+ Address KernelArgs = CGF.getLangOpts().OffloadViaLLVM
+ ? prepareKernelArgsLLVMOffload(CGF, Args)
+ : prepareKernelArgs(CGF, Args);
llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 07b7e60caa2b4..e9589a691c8dc 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1126,15 +1126,19 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
}
if (Args.hasArg(options::OPT_foffload_via_llvm)) {
- CmdArgs.push_back("-include");
+ // Add llvm_wrappers/* to our system include path. This lets us wrap
+ // standard library headers and other headers.
SmallString<128> P(D.ResourceDir);
llvm::sys::path::append(P, "include");
- llvm::sys::path::append(P, "openmp_wrappers");
+ llvm::sys::path::append(P, "llvm_offload_wrappers");
+ CmdArgs.push_back("-internal-isystem");
+ CmdArgs.push_back(Args.MakeArgString(P));
+
+ CmdArgs.push_back("-include");
if (JA.isDeviceOffloading(Action::OFK_OpenMP))
- llvm::sys::path::append(P, "__llvm_offload_device.h");
+ CmdArgs.push_back("__llvm_offload_device.h");
else
- llvm::sys::path::append(P, "__llvm_offload_host.h");
- CmdArgs.push_back(Args.MakeArgString(P));
+ CmdArgs.push_back("__llvm_offload_host.h");
}
// Add -i* options, and automatically translate to
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 215dfe207c228..9e0eb0f4cde89 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -323,9 +323,12 @@ set(openmp_wrapper_files
openmp_wrappers/__clang_openmp_device_functions.h
openmp_wrappers/complex_cmath.h
openmp_wrappers/new
- openmp_wrappers/__llvm_offload.h
- openmp_wrappers/__llvm_offload_host.h
- openmp_wrappers/__llvm_offload_device.h
+)
+
+set(llvm_offload_wrapper_files
+ llvm_offload_wrappers/__llvm_offload.h
+ llvm_offload_wrappers/__llvm_offload_host.h
+ llvm_offload_wrappers/__llvm_offload_device.h
)
set(llvm_libc_wrapper_files
@@ -378,7 +381,7 @@ endfunction(clang_generate_header)
# Copy header files from the source directory to the build directory
foreach( f ${files} ${cuda_wrapper_files} ${cuda_wrapper_bits_files}
${ppc_wrapper_files} ${openmp_wrapper_files} ${zos_wrapper_files} ${hlsl_files}
- ${llvm_libc_wrapper_files})
+ ${llvm_libc_wrapper_files} ${llvm_offload_wrapper_files})
copy_header_to_output_dir(${CMAKE_CURRENT_SOURCE_DIR} ${f})
endforeach( f )
@@ -504,6 +507,7 @@ add_header_target("hlsl-resource-headers" ${hlsl_files})
add_header_target("opencl-resource-headers" ${opencl_files})
add_header_target("llvm-libc-resource-headers" ${llvm_libc_wrapper_files})
add_header_target("openmp-resource-headers" ${openmp_wrapper_files})
+add_header_target("llvm-offload-resource-headers" ${llvm_offload_wrapper_files})
add_header_target("windows-resource-headers" ${windows_only_files})
add_header_target("utility-resource-headers" ${utility_files})
@@ -545,6 +549,11 @@ install(
DESTINATION ${header_install_dir}/openmp_wrappers
COMPONENT clang-resource-headers)
+install(
+ FILES ${llvm_offload_wrapper_files}
+ DESTINATION ${header_install_dir}/llvm_offload_wrappers
+ COMPONENT clang-resource-headers)
+
install(
FILES ${zos_wrapper_files}
DESTINATION ${header_install_dir}/zos_wrappers
@@ -707,8 +716,8 @@ install(
COMPONENT openmp-resource-headers)
install(
- FILES ${openmp_wrapper_files}
- DESTINATION ${header_install_dir}/openmp_wrappers
+ FILES ${llvm_offload_wrapper_files}
+ DESTINATION ${header_install_dir}/llvm_offload_wrappers
EXCLUDE_FROM_ALL
COMPONENT openmp-resource-headers)
diff --git a/clang/lib/Headers/openmp_wrappers/__llvm_offload.h b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h
similarity index 100%
rename from clang/lib/Headers/openmp_wrappers/__llvm_offload.h
rename to clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h
diff --git a/clang/lib/Headers/openmp_wrappers/__llvm_offload_device.h b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h
similarity index 100%
rename from clang/lib/Headers/openmp_wrappers/__llvm_offload_device.h
rename to clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h
diff --git a/clang/lib/Headers/openmp_wrappers/__llvm_offload_host.h b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h
similarity index 100%
rename from clang/lib/Headers/openmp_wrappers/__llvm_offload_host.h
rename to clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index bb9c2291c80a8..4843007e9c2ed 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3268,13 +3268,13 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads, uint64_t NumBlocks,
KernelArgsTy &KernelArgs, void *Args,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+ uint32_t KernelArgsSize = KernelArgs.NumArgs * sizeof(void *);
+
if (KernelArgs.Flags.IsCUDA) {
- // For CUDA kernels we compute the number of arguments here.
- KernelArgs.NumArgs = (ArgsSize - ImplicitArgsSize) / sizeof(void *);
+ // For CUDA kernels we compute the kernel argument size explicitly.
+ KernelArgsSize = ArgsSize - ImplicitArgsSize;
}
- const uint32_t KernelArgsSize = KernelArgs.NumArgs * sizeof(void *);
-
if (ArgsSize < KernelArgsSize)
return Plugin::error("Mismatch of kernel arguments size");
@@ -3315,14 +3315,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// Copy the explicit arguments.
// TODO: We should expose the args memory manager alloc to the common part as
// alternative to copying them twice.
- if (KernelArgs.NumArgs && !KernelArgs.Flags.IsCUDA) {
- std::memcpy(AllArgs, *static_cast<void **>(Args),
- sizeof(void *) * KernelArgs.NumArgs);
- } else {
- for (uint32_t I = 0; I < KernelArgs.NumArgs; ++I)
- std::memcpy(advanceVoidPtr(AllArgs, sizeof(void *) * I),
- static_cast<void **>(Args)[I], sizeof(void *));
- }
+ if (KernelArgsSize)
+ std::memcpy(AllArgs, *static_cast<void **>(Args), KernelArgsSize);
AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);
diff --git a/offload/src/KernelLanguage/API.cpp b/offload/src/KernelLanguage/API.cpp
index 07517ad1fc18a..e3c75ab67c4cd 100644
--- a/offload/src/KernelLanguage/API.cpp
+++ b/offload/src/KernelLanguage/API.cpp
@@ -58,7 +58,7 @@ int __tgt_target_kernel(void *Loc, int64_t DeviceId, int32_t NumTeams,
KernelArgsTy *Args);
unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
- void **args, size_t sharedMem, void *stream) {
+ void *args, size_t sharedMem, void *stream) {
KernelArgsTy Args = {};
Args.DynCGroupMem = sharedMem;
Args.NumTeams[0] = gridDim.x;
@@ -67,7 +67,7 @@ unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
Args.ThreadLimit[0] = blockDim.x;
Args.ThreadLimit[1] = blockDim.y;
Args.ThreadLimit[2] = blockDim.z;
- Args.ArgPtrs = args;
+ Args.ArgPtrs = &args;
Args.Flags.IsCUDA = true;
int rv = __tgt_target_kernel(nullptr, 0, gridDim.x,
blockDim.x, func, &Args);
diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg
index 6c590603079c4..9053151e44a78 100644
--- a/offload/test/lit.cfg
+++ b/offload/test/lit.cfg
@@ -66,7 +66,7 @@ def evaluate_bool_env(env):
config.name = 'libomptarget :: ' + config.libomptarget_current_target
# suffixes: A list of file extensions to treat as test files.
-config.suffixes = ['.c', '.cpp', '.cc', '.f90']
+config.suffixes = ['.c', '.cpp', '.cc', '.f90', '.cu']
# excludes: A list of directories to exclude from the testuites.
config.excludes = ['Inputs']
diff --git a/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu b/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu
index 401b9e2d559b1..7f5cc054dbdec 100644
--- a/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu
+++ b/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu
@@ -22,7 +22,7 @@ int main(int argc, char **argv) {
int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
*Ptr = 0;
printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
- // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7
+ // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0
square<<<7, 6>>>(Ptr);
printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
// CHECK: Ptr [[Ptr]], *Ptr: 42
diff --git a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
index c836292127fea..5976ad728fd9f 100644
--- a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
+++ b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
@@ -13,8 +13,10 @@ void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
}
-__global__ void square(int *Dst, int *Src, short Q, short P) {
+__global__ void square(int *Dst, short Q, int *Src, short P) {
*Dst = (Src[0] + Src[1]) * (Q + P);
+ Src[0] = Q;
+ Src[1] = P;
}
int main(int argc, char **argv) {
@@ -25,9 +27,11 @@ int main(int argc, char **argv) {
Src[0] = -2;
Src[1] = 8;
printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+ printf("%i : %i\n", Src[0], Src[1]);
// CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7
- square<<<1, 1>>>(Ptr, Src, 3, 4);
+ square<<<1, 1>>>(Ptr, 3, Src, 4);
printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+ printf("%i : %i\n", Src[0], Src[1]);
// CHECK: Ptr [[Ptr]], *Ptr: 42
llvm_omp_target_free_shared(Ptr, DevNo);
}
>From 6a6809a50ac35e12416eb0602bdb18189c3054f3 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert <johannes at jdoerfert.de>
Date: Fri, 7 Jun 2024 17:06:02 -0700
Subject: [PATCH 4/4] [Offload][CUDA] Add initial cuda_runtime.h overlay
This provides the header overlay for cuda_runtime.h which is found
before any CUDA installation (none is necessary). Some basic APIs are
defined in terms of the omp_target_* ones, but with the API redesign
the requirements of CUDA should be taken into account.
---
clang/lib/Headers/CMakeLists.txt | 1 +
.../llvm_offload_wrappers/cuda_runtime.h | 131 ++++++++++++++++++
.../offloading/CUDA/basic_api_malloc_free.cu | 41 ++++++
.../test/offloading/CUDA/basic_api_memcpy.cu | 46 ++++++
.../test/offloading/CUDA/basic_api_memset.cu | 43 ++++++
5 files changed, 262 insertions(+)
create mode 100644 clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h
create mode 100644 offload/test/offloading/CUDA/basic_api_malloc_free.cu
create mode 100644 offload/test/offloading/CUDA/basic_api_memcpy.cu
create mode 100644 offload/test/offloading/CUDA/basic_api_memset.cu
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 9e0eb0f4cde89..251e5b0ba2381 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -329,6 +329,7 @@ set(llvm_offload_wrapper_files
llvm_offload_wrappers/__llvm_offload.h
llvm_offload_wrappers/__llvm_offload_host.h
llvm_offload_wrappers/__llvm_offload_device.h
+ llvm_offload_wrappers/cuda_runtime.h
)
set(llvm_libc_wrapper_files
diff --git a/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h b/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h
new file mode 100644
index 0000000000000..8718e462a82d3
--- /dev/null
+++ b/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h
@@ -0,0 +1,131 @@
+/*===- cuda_runtime.h - LLVM/Offload wrappers for CUDA runtime API --------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CUDA_RUNTIME_API__
+#define __CUDA_RUNTIME_API__
+
+#include <cstddef>
+#include <optional>
+
+extern "C" {
+int omp_get_initial_device(void);
+void omp_target_free(void *Ptr, int Device);
+void *omp_target_alloc(size_t Size, int Device);
+int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
+ size_t DstOffset, size_t SrcOffset, int DstDevice,
+ int SrcDevice);
+void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum);
+}
+
+// TODO: There are many fields missing in this enumeration.
+typedef enum cudaError {
+ cudaSuccess = 0,
+ cudaErrorInvalidValue = 1,
+ cudaErrorMemoryAllocation = 2,
+ cudaErrorNoDevice = 100,
+ cudaErrorInvalidDevice = 101,
+ cudaErrorOTHER = -1,
+} cudaError_t;
+
+enum cudaMemcpyKind {
+ cudaMemcpyHostToHost = 0,
+ cudaMemcpyHostToDevice = 1,
+ cudaMemcpyDeviceToHost = 2,
+ cudaMemcpyDeviceToDevice = 3,
+ cudaMemcpyDefault = 4
+};
+
+typedef void *cudaStream_t;
+
+static thread_local cudaError_t __cudaomp_last_error = cudaSuccess;
+
+// Returns the last error that has been produced and resets it to cudaSuccess.
+inline cudaError_t cudaGetLastError() {
+ cudaError_t TempError = __cudaomp_last_error;
+ __cudaomp_last_error = cudaSuccess;
+ return TempError;
+}
+
+// Returns the last error that has been produced without resetting it.
+inline cudaError_t cudaPeekAtLastError() { return __cudaomp_last_error; }
+
+inline cudaError_t __cudaMalloc(void **devPtr, size_t size) {
+ int DeviceNum = 0;
+ *devPtr = omp_target_alloc(size, DeviceNum);
+ if (*devPtr == NULL)
+ return __cudaomp_last_error = cudaErrorMemoryAllocation;
+
+ return __cudaomp_last_error = cudaSuccess;
+}
+
+template <class T> cudaError_t cudaMalloc(T **devPtr, size_t size) {
+ return __cudaMalloc((void **)devPtr, size);
+}
+
+inline cudaError_t __cudaFree(void *devPtr) {
+ int DeviceNum = 0;
+ omp_target_free(devPtr, DeviceNum);
+ return __cudaomp_last_error = cudaSuccess;
+}
+
+template <class T> inline cudaError_t cudaFree(T *ptr) {
+ return __cudaFree((void *)ptr);
+}
+
+inline cudaError_t __cudaMemcpy(void *dst, const void *src, size_t count,
+ cudaMemcpyKind kind) {
+  // get the host device number (which is the initial device)
+ int HostDeviceNum = omp_get_initial_device();
+
+ // use the default device for gpu
+ int GPUDeviceNum = 0;
+
+ // default to copy from host to device
+ int DstDeviceNum = GPUDeviceNum;
+ int SrcDeviceNum = HostDeviceNum;
+
+ if (kind == cudaMemcpyDeviceToHost)
+ std::swap(DstDeviceNum, SrcDeviceNum);
+
+ // omp_target_memcpy returns 0 on success and non-zero on failure
+ if (omp_target_memcpy(dst, src, count, 0, 0, DstDeviceNum, SrcDeviceNum))
+ return __cudaomp_last_error = cudaErrorInvalidValue;
+ return __cudaomp_last_error = cudaSuccess;
+}
+
+template <class T>
+inline cudaError_t cudaMemcpy(T *dst, const T *src, size_t count,
+ cudaMemcpyKind kind) {
+ return __cudaMemcpy((void *)dst, (const void *)src, count, kind);
+}
+
+inline cudaError_t __cudaMemset(void *devPtr, int value, size_t count,
+ cudaStream_t stream = 0) {
+ int DeviceNum = 0;
+ if (!omp_target_memset(devPtr, value, count, DeviceNum))
+ return __cudaomp_last_error = cudaErrorInvalidValue;
+ return __cudaomp_last_error = cudaSuccess;
+}
+
+template <class T>
+inline cudaError_t cudaMemset(T *devPtr, int value, size_t count) {
+ return __cudaMemset((void *)devPtr, value, count);
+}
+
+inline cudaError_t cudaDeviceSynchronize() {
+ // TODO: not implemented, not async yet.
+ return __cudaomp_last_error = cudaSuccess;
+}
+
+inline cudaError_t cudaDeviceReset(void) {
+ // TODO: not implemented.
+ return __cudaomp_last_error = cudaSuccess;
+}
+
+#endif
diff --git a/offload/test/offloading/CUDA/basic_api_malloc_free.cu b/offload/test/offloading/CUDA/basic_api_malloc_free.cu
new file mode 100644
index 0000000000000..86dc5f8fef68e
--- /dev/null
+++ b/offload/test/offloading/CUDA/basic_api_malloc_free.cu
@@ -0,0 +1,41 @@
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t
+// RUN: %t | %fcheck-generic
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+extern "C" {
+void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
+void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
+}
+
+__global__ void kernel(int *A, int *DevPtr, int N) {
+ for (int i = 0; i < N; ++i)
+ DevPtr[i] = 1;
+ for (int i = 0; i < N; ++i)
+ *A += DevPtr[i];
+}
+
+int main(int argc, char **argv) {
+ int DevNo = 0;
+ int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
+ int *DevPtr;
+ auto Err = cudaMalloc(&DevPtr, 42 * sizeof(int));
+ if (Err != cudaSuccess)
+ return -1;
+ *Ptr = 0;
+ printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+ // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0
+ kernel<<<1, 1>>>(Ptr, DevPtr, 42);
+ printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+ // CHECK: Ptr [[Ptr]], *Ptr: 42
+ Err = cudaFree(DevPtr);
+ if (Err != cudaSuccess)
+ return -1;
+ llvm_omp_target_free_shared(Ptr, DevNo);
+}
diff --git a/offload/test/offloading/CUDA/basic_api_memcpy.cu b/offload/test/offloading/CUDA/basic_api_memcpy.cu
new file mode 100644
index 0000000000000..d5c0929abe43c
--- /dev/null
+++ b/offload/test/offloading/CUDA/basic_api_memcpy.cu
@@ -0,0 +1,46 @@
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t
+// RUN: %t | %fcheck-generic
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+__global__ void kernel(int *DevPtr, int N) {
+ for (int i = 0; i < N; ++i)
+ DevPtr[i]--;
+}
+
+int main(int argc, char **argv) {
+ int DevNo = 0;
+ int Res = 0;
+ int *DevPtr;
+ auto Err = cudaMalloc(&DevPtr, 42 * sizeof(int));
+ if (Err != cudaSuccess)
+ return -1;
+ int HstPtr[42];
+ for (int i = 0; i < 42; ++i) {
+ HstPtr[i] = 2;
+ }
+ Err = cudaMemcpy(DevPtr, HstPtr, 42 * sizeof(int), cudaMemcpyHostToDevice);
+ if (Err != cudaSuccess)
+ return -1;
+ printf("Res: %i\n", Res);
+ // CHECK: Res: 0
+ kernel<<<1, 1>>>(DevPtr, 42);
+ Err = cudaMemcpy(HstPtr, DevPtr, 42 * sizeof(int), cudaMemcpyDeviceToHost);
+ if (Err != cudaSuccess)
+ return -1;
+ for (int i = 0; i < 42; ++i) {
+ printf("%i : %i\n", i, HstPtr[i]);
+ Res += HstPtr[i];
+ }
+ printf("Res: %i\n", Res);
+ // CHECK: Res: 42
+ Err = cudaFree(DevPtr);
+ if (Err != cudaSuccess)
+ return -1;
+}
diff --git a/offload/test/offloading/CUDA/basic_api_memset.cu b/offload/test/offloading/CUDA/basic_api_memset.cu
new file mode 100644
index 0000000000000..36dcc729a3a9e
--- /dev/null
+++ b/offload/test/offloading/CUDA/basic_api_memset.cu
@@ -0,0 +1,43 @@
+// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t
+// RUN: %t | %fcheck-generic
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+extern "C" {
+void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
+void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
+}
+
+__global__ void kernel(int *A, int *DevPtr, int N) {
+ for (int i = 0; i < N; ++i)
+ *A += DevPtr[i];
+ *A *= -1;
+}
+
+int main(int argc, char **argv) {
+ int DevNo = 0;
+ int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
+ int *DevPtr;
+ auto Err = cudaMalloc(&DevPtr, 42 * sizeof(int));
+ if (Err != cudaSuccess)
+ return -1;
+ Err = cudaMemset(DevPtr, -1, 42 * sizeof(int));
+ if (Err != cudaSuccess)
+ return -1;
+ *Ptr = 0;
+ printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+ // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0
+ kernel<<<1, 1>>>(Ptr, DevPtr, 42);
+ printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+ // CHECK: Ptr [[Ptr]], *Ptr: 42
+ Err = cudaFree(DevPtr);
+ if (Err != cudaSuccess)
+ return -1;
+ llvm_omp_target_free_shared(Ptr, DevNo);
+}
More information about the cfe-commits
mailing list