[clang] 9d899d8 - [HIP] Support `-fgpu-default-stream`
Yaxun Liu via cfe-commits
cfe-commits at lists.llvm.org
Wed Feb 23 19:30:45 PST 2022
Author: Yaxun (Sam) Liu
Date: 2022-02-23T22:28:29-05:00
New Revision: 9d899d8f01872e91b9909a6ee5937a796a399276
URL: https://github.com/llvm/llvm-project/commit/9d899d8f01872e91b9909a6ee5937a796a399276
DIFF: https://github.com/llvm/llvm-project/commit/9d899d8f01872e91b9909a6ee5937a796a399276.diff
LOG: [HIP] Support `-fgpu-default-stream`
Introduce -fgpu-default-stream={legacy|per-thread} option to
support per-thread default stream for HIP runtime.
When -fgpu-default-stream=per-thread, HIP kernels are
launched through hipLaunchKernel_spt instead of
hipLaunchKernel. Also HIP_API_PER_THREAD_DEFAULT_STREAM=1
is defined by the preprocessor to enable other per-thread stream
API's.
Reviewed by: Artem Belevich
Differential Revision: https://reviews.llvm.org/D120298
Added:
Modified:
clang/include/clang/Basic/LangOptions.h
clang/include/clang/Driver/Options.td
clang/lib/CodeGen/CGCUDANV.cpp
clang/lib/Driver/ToolChains/Clang.cpp
clang/lib/Frontend/InitPreprocessor.cpp
clang/test/CodeGenCUDA/Inputs/cuda.h
clang/test/CodeGenCUDA/kernel-call.cu
clang/test/Driver/hip-options.hip
clang/test/Preprocessor/predefined-macros.c
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index 2e334e375950e..6aa24d2facc2a 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -309,6 +309,13 @@ class LangOptions : public LangOptionsBase {
ExtendTo64
};
+ enum class GPUDefaultStreamKind {
+ /// Legacy default stream
+ Legacy,
+ /// Per-thread default stream
+ PerThread,
+ };
+
public:
/// The used language standard.
LangStandard::Kind LangStd;
@@ -402,6 +409,9 @@ class LangOptions : public LangOptionsBase {
/// input is a header file (i.e. -x c-header).
bool IsHeaderFile = false;
+ /// The default stream kind used for HIP kernel launching.
+ GPUDefaultStreamKind GPUDefaultStream;
+
LangOptions();
// Define accessors/mutators for language options of enumeration type.
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index f9d8e32169635..8dd16ca990a14 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -959,6 +959,13 @@ defm cuda_short_ptr : BoolFOption<"cuda-short-ptr",
TargetOpts<"NVPTXUseShortPointers">, DefaultFalse,
PosFlag<SetTrue, [CC1Option], "Use 32-bit pointers for accessing const/local/shared address spaces">,
NegFlag<SetFalse>>;
+def fgpu_default_stream_EQ : Joined<["-"], "fgpu-default-stream=">,
+ HelpText<"Specify default stream. Valid values are 'legacy' and 'per-thread'. The default value is 'legacy'. (HIP only)">,
+ Flags<[CC1Option]>,
+ Values<"legacy,per-thread">,
+ NormalizedValuesScope<"LangOptions::GPUDefaultStreamKind">,
+ NormalizedValues<["Legacy", "PerThread"]>,
+ MarshallingInfoEnum<LangOpts<"GPUDefaultStream">, "Legacy">;
def rocm_path_EQ : Joined<["--"], "rocm-path=">, Group<i_Group>,
HelpText<"ROCm installation path, used for finding and automatically linking required bitcode libraries.">;
def hip_path_EQ : Joined<["--"], "hip-path=">, Group<i_Group>,
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 293bdf99d272f..b832c686b8b69 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -332,15 +332,22 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
// Lookup cudaLaunchKernel/hipLaunchKernel function.
+ // HIP kernel launching API name depends on -fgpu-default-stream option. For
+ // the default value 'legacy', it is hipLaunchKernel. For 'per-thread',
+ // it is hipLaunchKernel_spt.
// cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
// void **args, size_t sharedMem,
// cudaStream_t stream);
- // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
- // void **args, size_t sharedMem,
- // hipStream_t stream);
+ // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
+ // dim3 blockDim, void **args,
+ // size_t sharedMem, hipStream_t stream);
TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
- auto LaunchKernelName = addPrefixToName("LaunchKernel");
+ std::string KernelLaunchAPI = "LaunchKernel";
+ if (CGF.getLangOpts().HIP && CGF.getLangOpts().GPUDefaultStream ==
+ LangOptions::GPUDefaultStreamKind::PerThread)
+ KernelLaunchAPI = KernelLaunchAPI + "_spt";
+ auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
IdentifierInfo &cudaLaunchKernelII =
CGM.getContext().Idents.get(LaunchKernelName);
FunctionDecl *cudaLaunchKernelFD = nullptr;
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 8df56a2df5b12..341e108ed65da 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -6915,8 +6915,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back(Args.MakeArgString(Twine("-cuid=") + Twine(CUID)));
}
- if (IsHIP)
+ if (IsHIP) {
CmdArgs.push_back("-fcuda-allow-variadic-functions");
+ Args.AddLastArg(CmdArgs, options::OPT_fgpu_default_stream_EQ);
+ }
if (IsCudaDevice || IsHIPDevice) {
StringRef InlineThresh =
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index ff507e2c00aaa..6b7c743d4004f 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -538,6 +538,9 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
Builder.defineMacro("__HIP_MEMORY_SCOPE_SYSTEM", "5");
if (LangOpts.CUDAIsDevice)
Builder.defineMacro("__HIP_DEVICE_COMPILE__");
+ if (LangOpts.GPUDefaultStream ==
+ LangOptions::GPUDefaultStreamKind::PerThread)
+ Builder.defineMacro("HIP_API_PER_THREAD_DEFAULT_STREAM");
}
}
diff --git a/clang/test/CodeGenCUDA/Inputs/cuda.h b/clang/test/CodeGenCUDA/Inputs/cuda.h
index af395b3b97bb6..25f64ccefe937 100644
--- a/clang/test/CodeGenCUDA/Inputs/cuda.h
+++ b/clang/test/CodeGenCUDA/Inputs/cuda.h
@@ -35,11 +35,18 @@ int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize,
size_t sharedSize = 0,
hipStream_t stream = 0);
+#ifndef HIP_API_PER_THREAD_DEFAULT_STREAM
extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim,
dim3 blockDim, void **args,
size_t sharedMem,
hipStream_t stream);
#else
+extern "C" hipError_t hipLaunchKernel_spt(const void *func, dim3 gridDim,
+ dim3 blockDim, void **args,
+ size_t sharedMem,
+ hipStream_t stream);
+#endif //HIP_API_PER_THREAD_DEFAULT_STREAM
+#else
typedef struct cudaStream *cudaStream_t;
typedef enum cudaError {} cudaError_t;
extern "C" int cudaConfigureCall(dim3 gridSize, dim3 blockSize,
diff --git a/clang/test/CodeGenCUDA/kernel-call.cu b/clang/test/CodeGenCUDA/kernel-call.cu
index b76f2c1883576..40407f1c29a38 100644
--- a/clang/test/CodeGenCUDA/kernel-call.cu
+++ b/clang/test/CodeGenCUDA/kernel-call.cu
@@ -5,7 +5,13 @@
// RUN: %clang_cc1 -x hip -emit-llvm %s -o - \
// RUN: | FileCheck %s --check-prefixes=HIP-OLD,CHECK
// RUN: %clang_cc1 -fhip-new-launch-api -x hip -emit-llvm %s -o - \
-// RUN: | FileCheck %s --check-prefixes=HIP-NEW,CHECK
+// RUN: | FileCheck %s --check-prefixes=HIP-NEW,LEGACY,CHECK
+// RUN: %clang_cc1 -fhip-new-launch-api -x hip -emit-llvm %s -o - \
+// RUN: -fgpu-default-stream=legacy \
+// RUN: | FileCheck %s --check-prefixes=HIP-NEW,LEGACY,CHECK
+// RUN: %clang_cc1 -fhip-new-launch-api -x hip -emit-llvm %s -o - \
+// RUN: -fgpu-default-stream=per-thread -DHIP_API_PER_THREAD_DEFAULT_STREAM \
+// RUN: | FileCheck %s --check-prefixes=HIP-NEW,PTH,CHECK
#include "Inputs/cuda.h"
@@ -13,7 +19,8 @@
// HIP-OLD: call{{.*}}hipSetupArgument
// HIP-OLD: call{{.*}}hipLaunchByPtr
// HIP-NEW: call{{.*}}__hipPopCallConfiguration
-// HIP-NEW: call{{.*}}hipLaunchKernel
+// LEGACY: call{{.*}}hipLaunchKernel
+// PTH: call{{.*}}hipLaunchKernel_spt
// CUDA-OLD: call{{.*}}cudaSetupArgument
// CUDA-OLD: call{{.*}}cudaLaunch
// CUDA-NEW: call{{.*}}__cudaPopCallConfiguration
diff --git a/clang/test/Driver/hip-options.hip b/clang/test/Driver/hip-options.hip
index da82410a4fcfd..1169444a0dd7c 100644
--- a/clang/test/Driver/hip-options.hip
+++ b/clang/test/Driver/hip-options.hip
@@ -14,6 +14,14 @@
// DEVINIT: clang{{.*}}" "-cc1" {{.*}}"-fgpu-allow-device-init"
// DEVINIT: clang{{.*}}" "-cc1" {{.*}}"-fgpu-allow-device-init"
+// Check -fgpu-default-stream=per-thread.
+// RUN: %clang -### -nogpuinc -nogpulib -fgpu-default-stream=per-thread \
+// RUN: %s -save-temps 2>&1 | FileCheck -check-prefix=PTH %s
+// PTH: clang{{.*}}" "-cc1" {{.*}}"-E" {{.*}}"-fgpu-default-stream=per-thread"
+// PTH: clang{{.*}}" "-cc1" {{.*}}"-fgpu-default-stream=per-thread" {{.*}}"-x" "hip-cpp-output"
+// PTH: clang{{.*}}" "-cc1" {{.*}}"-E" {{.*}}"-fgpu-default-stream=per-thread"
+// PTH: clang{{.*}}" "-cc1" {{.*}}"-fgpu-default-stream=per-thread" {{.*}}"-x" "hip-cpp-output"
+
// RUN: %clang -### -x hip -target x86_64-pc-windows-msvc -fms-extensions \
// RUN: -mllvm -amdgpu-early-inline-all=true %s 2>&1 | \
// RUN: FileCheck -check-prefix=MLLVM %s
diff --git a/clang/test/Preprocessor/predefined-macros.c b/clang/test/Preprocessor/predefined-macros.c
index 0b67cbe233ca2..897145516c52c 100644
--- a/clang/test/Preprocessor/predefined-macros.c
+++ b/clang/test/Preprocessor/predefined-macros.c
@@ -247,6 +247,7 @@
// CHECK-HIP-NEG-NOT: #define __CUDA_ARCH__
// CHECK-HIP-NEG-NOT: #define __HIP_DEVICE_COMPILE__ 1
// CHECK-HIP-NEG-NOT: #define __CLANG_RDC__ 1
+// CHECK-HIP-NEG-NOT: #define HIP_API_PER_THREAD_DEFAULT_STREAM
// RUN: %clang_cc1 %s -E -dM -o - -x hip -triple amdgcn-amd-amdhsa \
// RUN: -fcuda-is-device \
@@ -265,6 +266,7 @@
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-HIP-DEV-NEG
// CHECK-HIP-DEV-NEG-NOT: #define __CUDA_ARCH__
// CHECK-HIP-DEV-NEG-NOT: #define __CLANG_RDC__ 1
+// CHECK-HIP-DEV-NEG-NOT: #define HIP_API_PER_THREAD_DEFAULT_STREAM
// RUN: %clang_cc1 %s -E -dM -o - -x cuda -triple x86_64-unknown-linux-gnu \
// RUN: -fgpu-rdc | FileCheck %s --check-prefix=CHECK-RDC
@@ -277,3 +279,11 @@
// RUN: -fgpu-rdc -fcuda-is-device \
// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-RDC
// CHECK-RDC: #define __CLANG_RDC__ 1
+
+// RUN: %clang_cc1 %s -E -dM -o - -x hip -triple x86_64-unknown-linux-gnu \
+// RUN: -fgpu-default-stream=per-thread \
+// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-PTH
+// RUN: %clang_cc1 %s -E -dM -o - -x hip -triple amdgcn-amd-amdhsa \
+// RUN: -fcuda-is-device -fgpu-default-stream=per-thread \
+// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-PTH
+// CHECK-PTH: #define HIP_API_PER_THREAD_DEFAULT_STREAM 1
More information about the cfe-commits
mailing list