[clang] [llvm] [Offload][CUDA] Add initial cuda_runtime.h overlay (PR #94821)
via cfe-commits
cfe-commits at lists.llvm.org
Fri Jun 7 17:12:03 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang-driver
Author: Johannes Doerfert (jdoerfert)
<details>
<summary>Changes</summary>
This provides the header overlay for cuda_runtime.h which is found
before any CUDA installation (none is necessary). Some basic APIs are
defined in terms of the omp_target_* ones, but with the pending
LLVM/Offload API redesign the requirements of CUDA should be taken
into account.
Note: Async is not exposed by the existing runtime; thus, the streams are
ignored. I'll address this in a follow-up.
---
Patch is 45.18 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/94821.diff
28 Files Affected:
- (modified) clang/include/clang/Basic/LangOptions.def (+1)
- (modified) clang/include/clang/Driver/Options.td (+6)
- (modified) clang/lib/CodeGen/CGCUDANV.cpp (+62-15)
- (modified) clang/lib/Driver/Driver.cpp (+12-7)
- (modified) clang/lib/Driver/ToolChains/Clang.cpp (+26-4)
- (modified) clang/lib/Driver/ToolChains/CommonArgs.cpp (+6-1)
- (modified) clang/lib/Headers/CMakeLists.txt (+16-3)
- (added) clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h (+31)
- (added) clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h (+10)
- (added) clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h (+15)
- (added) clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h (+131)
- (modified) clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h (+2-4)
- (modified) clang/lib/Sema/SemaCUDA.cpp (+3)
- (added) clang/test/Driver/cuda-via-liboffload.cu (+23)
- (modified) offload/include/Shared/APITypes.h (+19-14)
- (modified) offload/include/omptarget.h (+1-1)
- (modified) offload/plugins-nextgen/amdgpu/src/rtl.cpp (+8-4)
- (modified) offload/plugins-nextgen/common/src/PluginInterface.cpp (+11-3)
- (modified) offload/src/CMakeLists.txt (+1)
- (added) offload/src/KernelLanguage/API.cpp (+76)
- (modified) offload/src/exports (+3)
- (modified) offload/test/lit.cfg (+1-1)
- (added) offload/test/offloading/CUDA/basic_api_malloc_free.cu (+41)
- (added) offload/test/offloading/CUDA/basic_api_memcpy.cu (+46)
- (added) offload/test/offloading/CUDA/basic_api_memset.cu (+43)
- (added) offload/test/offloading/CUDA/basic_launch.cu (+29)
- (added) offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu (+30)
- (added) offload/test/offloading/CUDA/basic_launch_multi_arg.cu (+37)
``````````diff
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 4061451b2150a..8aff98867202e 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -288,6 +288,7 @@ LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kern
LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP")
LANGOPT(GPUExcludeWrongSideOverloads, 1, 0, "always exclude wrong side overloads in overloading resolution for CUDA/HIP")
LANGOPT(OffloadingNewDriver, 1, 0, "use the new driver for generating offloading code.")
+LANGOPT(OffloadViaLLVM, 1, 0, "target LLVM/Offload as portable offloading runtime.")
LANGOPT(SYCLIsDevice , 1, 0, "Generate code for SYCL device")
LANGOPT(SYCLIsHost , 1, 0, "SYCL host compilation")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 57f37c5023110..a09d75917ff98 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1275,6 +1275,12 @@ def no_offload_compress : Flag<["--"], "no-offload-compress">;
def offload_compression_level_EQ : Joined<["--"], "offload-compression-level=">,
Flags<[HelpHidden]>,
HelpText<"Compression level for offload device binaries (HIP only)">;
+
+defm offload_via_llvm : BoolFOption<"offload-via-llvm",
+ LangOpts<"OffloadViaLLVM">, DefaultFalse,
+ PosFlag<SetTrue, [], [ClangOption, CC1Option], "Use">,
+ NegFlag<SetFalse, [], [ClangOption], "Don't use">,
+ BothFlags<[], [ClangOption], " LLVM/Offload as portable offloading runtime.">>;
}
// CUDA options
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 43dfbbb90dd52..8e32aad88a26d 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -15,10 +15,12 @@
#include "CGCXXABI.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
+#include "clang/AST/CharUnits.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -36,6 +38,11 @@ constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
class CGNVCUDARuntime : public CGCUDARuntime {
+ /// The prefix used for function calls and section names (CUDA, HIP, LLVM)
+ StringRef Prefix;
+ /// TODO: We should transition the OpenMP section to LLVM/Offload
+ StringRef SectionPrefix;
+
private:
llvm::IntegerType *IntTy, *SizeTy;
llvm::Type *VoidTy;
@@ -132,6 +139,9 @@ class CGNVCUDARuntime : public CGCUDARuntime {
return DummyFunc;
}
+ Address prepareKernelArgs(CodeGenFunction &CGF, FunctionArgList &Args);
+ Address prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
+ FunctionArgList &Args);
void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
std::string getDeviceSideName(const NamedDecl *ND) override;
@@ -191,15 +201,11 @@ class CGNVCUDARuntime : public CGCUDARuntime {
} // end anonymous namespace
std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
- if (CGM.getLangOpts().HIP)
- return ((Twine("hip") + Twine(FuncName)).str());
- return ((Twine("cuda") + Twine(FuncName)).str());
+ return (Prefix + FuncName).str();
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
- if (CGM.getLangOpts().HIP)
- return ((Twine("__hip") + Twine(FuncName)).str());
- return ((Twine("__cuda") + Twine(FuncName)).str());
+ return ("__" + Prefix + FuncName).str();
}
static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
@@ -227,6 +233,14 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
SizeTy = CGM.SizeTy;
VoidTy = CGM.VoidTy;
PtrTy = CGM.UnqualPtrTy;
+
+ if (CGM.getLangOpts().OffloadViaLLVM) {
+ Prefix = "llvm";
+ SectionPrefix = "omp";
+ } else if (CGM.getLangOpts().HIP)
+ SectionPrefix = Prefix = "hip";
+ else
+ SectionPrefix = Prefix = "cuda";
}
llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
@@ -305,18 +319,37 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
}
if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
CudaFeature::CUDA_USES_NEW_LAUNCH) ||
- (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
+ (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI) ||
+ (CGF.getLangOpts().OffloadViaLLVM))
emitDeviceStubBodyNew(CGF, Args);
else
emitDeviceStubBodyLegacy(CGF, Args);
}
-// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
-// array and kernels are launched using cudaLaunchKernel().
-void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
- FunctionArgList &Args) {
- // Build the shadow stack entry at the very start of the function.
+/// CUDA passes the arguments with a level of indirection. For example, a
+/// (void*, short, void*) is passed as {void **, short *, void **} to the launch
+/// function. For the LLVM/offload launch we flatten the arguments into the
+/// struct directly, thus pass {void *, short, void *}
+Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
+ FunctionArgList &Args) {
+ SmallVector<llvm::Type *> ArgTypes;
+ for (auto &Arg : Args)
+ ArgTypes.push_back(CGF.ConvertTypeForMem(Arg->getType()));
+
+ llvm::StructType *ST = llvm::StructType::create(ArgTypes);
+ Address KernelArgs = CGF.CreateTempAllocaWithoutCast(
+ ST, CharUnits::fromQuantity(16), "kernel_args");
+
+ for (unsigned i = 0; i < Args.size(); ++i) {
+ auto *ArgVal = CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(Args[i]));
+ CGF.Builder.CreateStore(ArgVal, CGF.Builder.CreateStructGEP(KernelArgs, i));
+ }
+ return KernelArgs;
+}
+
+Address CGNVCUDARuntime::prepareKernelArgs(CodeGenFunction &CGF,
+ FunctionArgList &Args) {
// Calculate amount of space we will need for all arguments. If we have no
// args, allocate a single pointer so we still have a valid pointer to the
// argument array that we can pass to runtime, even if it will be unused.
@@ -331,6 +364,17 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
VoidVarPtr, CGF.Builder.CreateConstGEP1_32(
PtrTy, KernelArgs.emitRawPointer(CGF), i));
}
+ return KernelArgs;
+}
+
+// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
+// array and kernels are launched using cudaLaunchKernel().
+void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
+ FunctionArgList &Args) {
+ // Build the shadow stack entry at the very start of the function.
+ Address KernelArgs = CGF.getLangOpts().OffloadViaLLVM
+ ? prepareKernelArgsLLVMOffload(CGF, Args)
+ : prepareKernelArgs(CGF, Args);
llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
@@ -1129,8 +1173,9 @@ void CGNVCUDARuntime::transformManagedVars() {
// registered. The linker will provide a pointer to this section so we can
// register the symbols with the linked device image.
void CGNVCUDARuntime::createOffloadingEntries() {
- StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
- : "cuda_offloading_entries";
+ SmallVector<char, 32> Out;
+ StringRef Section = (SectionPrefix + "_offloading_entries").toStringRef(Out);
+
llvm::Module &M = CGM.getModule();
for (KernelInfo &I : EmittedKernels)
llvm::offloading::emitOffloadingEntry(
@@ -1199,7 +1244,9 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
}
return nullptr;
}
- if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
+ if (CGM.getLangOpts().OffloadViaLLVM)
+ createOffloadingEntries();
+ else if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
createOffloadingEntries();
else
return makeModuleCtorFunction();
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index f5ea73a04ae5c..815149a49d018 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -792,11 +792,13 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
}) ||
C.getInputArgs().hasArg(options::OPT_hip_link) ||
C.getInputArgs().hasArg(options::OPT_hipstdpar);
+ bool UseLLVMOffload = C.getInputArgs().hasArg(
+ options::OPT_foffload_via_llvm, options::OPT_fno_offload_via_llvm, false);
if (IsCuda && IsHIP) {
Diag(clang::diag::err_drv_mix_cuda_hip);
return;
}
- if (IsCuda) {
+ if (IsCuda && !UseLLVMOffload) {
const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
const llvm::Triple &HostTriple = HostTC->getTriple();
auto OFK = Action::OFK_Cuda;
@@ -818,7 +820,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
CudaInstallation.WarnIfUnsupportedVersion();
}
C.addOffloadDeviceToolChain(CudaTC.get(), OFK);
- } else if (IsHIP) {
+ } else if (IsHIP && !UseLLVMOffload) {
if (auto *OMPTargetArg =
C.getInputArgs().getLastArg(options::OPT_fopenmp_targets_EQ)) {
Diag(clang::diag::err_drv_unsupported_opt_for_language_mode)
@@ -842,10 +844,11 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
// We need to generate an OpenMP toolchain if the user specified targets with
// the -fopenmp-targets option or used --offload-arch with OpenMP enabled.
bool IsOpenMPOffloading =
- C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
- options::OPT_fno_openmp, false) &&
- (C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ) ||
- C.getInputArgs().hasArg(options::OPT_offload_arch_EQ));
+ ((IsCuda || IsHIP) && UseLLVMOffload) ||
+ (C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
+ options::OPT_fno_openmp, false) &&
+ (C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ) ||
+ C.getInputArgs().hasArg(options::OPT_offload_arch_EQ)));
if (IsOpenMPOffloading) {
// We expect that -fopenmp-targets is always used in conjunction with the
// option -fopenmp specifying a valid runtime with offloading support, i.e.
@@ -873,7 +876,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
for (StringRef T : OpenMPTargets->getValues())
OpenMPTriples.insert(T);
} else if (C.getInputArgs().hasArg(options::OPT_offload_arch_EQ) &&
- !IsHIP && !IsCuda) {
+ ((!IsHIP && !IsCuda) || UseLLVMOffload)) {
const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
auto AMDTriple = getHIPOffloadTargetTriple(*this, C.getInputArgs());
auto NVPTXTriple = getNVIDIAOffloadTargetTriple(*this, C.getInputArgs(),
@@ -4138,6 +4141,8 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
bool UseNewOffloadingDriver =
C.isOffloadingHostKind(Action::OFK_OpenMP) ||
+ Args.hasFlag(options::OPT_foffload_via_llvm,
+ options::OPT_fno_offload_via_llvm, false) ||
Args.hasFlag(options::OPT_offload_new_driver,
options::OPT_no_offload_new_driver, false);
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 4e1c52462e584..e9589a691c8dc 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1125,6 +1125,22 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
CmdArgs.push_back("__clang_openmp_device_functions.h");
}
+ if (Args.hasArg(options::OPT_foffload_via_llvm)) {
+ // Add llvm_wrappers/* to our system include path. This lets us wrap
+ // standard library headers and other headers.
+ SmallString<128> P(D.ResourceDir);
+ llvm::sys::path::append(P, "include");
+ llvm::sys::path::append(P, "llvm_offload_wrappers");
+ CmdArgs.push_back("-internal-isystem");
+ CmdArgs.push_back(Args.MakeArgString(P));
+
+ CmdArgs.push_back("-include");
+ if (JA.isDeviceOffloading(Action::OFK_OpenMP))
+ CmdArgs.push_back("__llvm_offload_device.h");
+ else
+ CmdArgs.push_back("__llvm_offload_host.h");
+ }
+
// Add -i* options, and automatically translate to
// -include-pch/-include-pth for transparent PCH support. It's
// wonky, but we include looking for .gch so we can support seamless
@@ -6672,11 +6688,17 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
Args.addOptOutFlag(CmdArgs, options::OPT_fopenmp_extensions,
options::OPT_fno_openmp_extensions);
}
-
- // Forward the new driver to change offloading code generation.
- if (Args.hasFlag(options::OPT_offload_new_driver,
- options::OPT_no_offload_new_driver, false))
+ // Forward the offload runtime change to code generation, liboffload implies
+ // new driver. Otherwise, check if we should forward the new driver to change
+ // offloading code generation.
+ if (Args.hasFlag(options::OPT_foffload_via_llvm,
+ options::OPT_fno_offload_via_llvm, false)) {
CmdArgs.push_back("--offload-new-driver");
+ CmdArgs.push_back("-foffload-via-llvm");
+ } else if (Args.hasFlag(options::OPT_offload_new_driver,
+ options::OPT_no_offload_new_driver, false)) {
+ CmdArgs.push_back("--offload-new-driver");
+ }
SanitizeArgs.addArgs(TC, Args, CmdArgs, InputType);
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 71e993119436a..74a69f65f7ad5 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1144,8 +1144,13 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs,
bool ForceStaticHostRuntime, bool IsOffloadingHost,
bool GompNeedsRT) {
if (!Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
- options::OPT_fno_openmp, false))
+ options::OPT_fno_openmp, false)) {
+    // We need libomptarget (liboffload) if it's the chosen offloading runtime.
+ if (Args.hasFlag(options::OPT_foffload_via_llvm,
+ options::OPT_fno_offload_via_llvm, false))
+ CmdArgs.push_back("-lomptarget");
return false;
+ }
Driver::OpenMPRuntimeKind RTKind = TC.getDriver().getOpenMPRuntime(Args);
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index d3090e488306f..251e5b0ba2381 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -325,6 +325,13 @@ set(openmp_wrapper_files
openmp_wrappers/new
)
+set(llvm_offload_wrapper_files
+ llvm_offload_wrappers/__llvm_offload.h
+ llvm_offload_wrappers/__llvm_offload_host.h
+ llvm_offload_wrappers/__llvm_offload_device.h
+ llvm_offload_wrappers/cuda_runtime.h
+)
+
set(llvm_libc_wrapper_files
llvm_libc_wrappers/assert.h
llvm_libc_wrappers/stdio.h
@@ -375,7 +382,7 @@ endfunction(clang_generate_header)
# Copy header files from the source directory to the build directory
foreach( f ${files} ${cuda_wrapper_files} ${cuda_wrapper_bits_files}
${ppc_wrapper_files} ${openmp_wrapper_files} ${zos_wrapper_files} ${hlsl_files}
- ${llvm_libc_wrapper_files})
+ ${llvm_libc_wrapper_files} ${llvm_offload_wrapper_files})
copy_header_to_output_dir(${CMAKE_CURRENT_SOURCE_DIR} ${f})
endforeach( f )
@@ -501,6 +508,7 @@ add_header_target("hlsl-resource-headers" ${hlsl_files})
add_header_target("opencl-resource-headers" ${opencl_files})
add_header_target("llvm-libc-resource-headers" ${llvm_libc_wrapper_files})
add_header_target("openmp-resource-headers" ${openmp_wrapper_files})
+add_header_target("llvm-offload-resource-headers" ${llvm_libc_wrapper_files})
add_header_target("windows-resource-headers" ${windows_only_files})
add_header_target("utility-resource-headers" ${utility_files})
@@ -542,6 +550,11 @@ install(
DESTINATION ${header_install_dir}/openmp_wrappers
COMPONENT clang-resource-headers)
+install(
+ FILES ${llvm_offload_wrapper_files}
+ DESTINATION ${header_install_dir}/llvm_offload_wrappers
+ COMPONENT clang-resource-headers)
+
install(
FILES ${zos_wrapper_files}
DESTINATION ${header_install_dir}/zos_wrappers
@@ -704,8 +717,8 @@ install(
COMPONENT openmp-resource-headers)
install(
- FILES ${openmp_wrapper_files}
- DESTINATION ${header_install_dir}/openmp_wrappers
+ FILES ${llvm_offload_wrapper_files}
+ DESTINATION ${header_install_dir}/llvm_offload_wrappers
EXCLUDE_FROM_ALL
COMPONENT openmp-resource-headers)
diff --git a/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h
new file mode 100644
index 0000000000000..d78e3b41f99a5
--- /dev/null
+++ b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h
@@ -0,0 +1,31 @@
+/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#include <stdlib.h>
+
+#define __host__ __attribute__((host))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+#define __managed__ __attribute__((managed))
+
+extern "C" {
+
+typedef struct dim3 {
+ dim3() {}
+ dim3(unsigned x) : x(x) {}
+ unsigned x = 0, y = 0, z = 0;
+} dim3;
+
+// TODO: For some reason the CUDA device compilation requires this declaration
+// to be present but it should not.
+unsigned __llvmPushCallConfiguration(dim3 gridDim, dim3 blockDim,
+ size_t sharedMem = 0, void *stream = 0);
+}
diff --git a/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h
new file mode 100644
index 0000000000000..1a813b331515b
--- /dev/null
+++ b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h
@@ -0,0 +1,10 @@
+/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#include "__llvm_offload.h"
diff --git a/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h
new file mode 100644
index 0000000000000..160289d169b55
--- /dev/null
+++ b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h
@@ -0,0 +1,15 @@
+/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#include "__llvm_offload.h"
+
+extern "C" {
+unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+ void **args, size_t sharedMem = 0, void *stream = 0);
+}
diff --git a/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h b/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h
new file mode 100644
index 0000000000000..8718e462a82d3
--- /dev/null
+++ b/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h
@@ -0,0 +1,131 @@
+/*===- __cuda_...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/94821
More information about the cfe-commits
mailing list