[Openmp-commits] [openmp] 9b6ea5e - [OpenMP] Improve omp offload profiler (#68016)
via Openmp-commits
openmp-commits at lists.llvm.org
Fri Dec 22 11:58:15 PST 2023
Author: Felipe Cabarcas
Date: 2023-12-22T14:58:11-05:00
New Revision: 9b6ea5e8f8df3c043fba0a2896ab16d682af01cc
URL: https://github.com/llvm/llvm-project/commit/9b6ea5e8f8df3c043fba0a2896ab16d682af01cc
DIFF: https://github.com/llvm/llvm-project/commit/9b6ea5e8f8df3c043fba0a2896ab16d682af01cc.diff
LOG: [OpenMP] Improve omp offload profiler (#68016)
Summary:
Adding information to the LIBOMPTARGET profiler runtime kernel and API
calls.
Key changes:
* Adding information to runtime calls for a better understanding of how
the application
is executing, for example the number of teams requested by the user and
the size of memory transfers.
* Profile timer was changed from 'us' to 'ns', since 'us' was too
coarse-grained
to register some important details, such as the duration of key kernels.
* Removed profiling of calls that are neither API nor runtime calls, to
reduce the complexity of the profile for application
developers.
---------
Co-authored-by: Felipe Cabarcas <cabarcas at leia.crpl.cis.udel.edu>
Co-authored-by: fel-cab <fel-cab at github.com>
Added:
Modified:
openmp/libomptarget/include/Shared/Profile.h
openmp/libomptarget/src/OpenMP/API.cpp
openmp/libomptarget/src/interface.cpp
openmp/libomptarget/src/omptarget.cpp
Removed:
################################################################################
diff --git a/openmp/libomptarget/include/Shared/Profile.h b/openmp/libomptarget/include/Shared/Profile.h
index 19ca0cf2275186..7e580988a39baf 100644
--- a/openmp/libomptarget/include/Shared/Profile.h
+++ b/openmp/libomptarget/include/Shared/Profile.h
@@ -97,4 +97,16 @@ class Profiler {
std::string RTM = RegionTypeMsg; \
llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM)
+/// Time spent in the current scope, assigned to the regionType
+/// with details from runtime
+#define TIMESCOPE_WITH_DETAILS_AND_IDENT(RegionTypeMsg, Details, IDENT) \
+ SourceInfo SI(IDENT); \
+ std::string ProfileLocation = SI.getProfileLocation(); \
+ llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + Details)
+
+/// Time spent in the current scope, assigned to the function name and source
+/// with details
+#define TIMESCOPE_WITH_DETAILS(Details) \
+ llvm::TimeTraceScope TimeScope(__FUNCTION__, Details)
+
#endif // OMPTARGET_SHARED_PROFILE_H
diff --git a/openmp/libomptarget/src/OpenMP/API.cpp b/openmp/libomptarget/src/OpenMP/API.cpp
index 1769404faf8884..a7b6eac8bcd658 100644
--- a/openmp/libomptarget/src/OpenMP/API.cpp
+++ b/openmp/libomptarget/src/OpenMP/API.cpp
@@ -83,6 +83,8 @@ EXTERN int omp_get_initial_device(void) {
}
EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
+ TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) +
+ ";size=" + std::to_string(Size));
return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
}
@@ -99,6 +101,7 @@ EXTERN void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum) {
}
EXTERN void omp_target_free(void *Ptr, int DeviceNum) {
+ TIMESCOPE();
return targetFreeExplicit(Ptr, DeviceNum, TARGET_ALLOC_DEFAULT, __func__);
}
@@ -161,7 +164,9 @@ EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) {
EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
size_t DstOffset, size_t SrcOffset, int DstDevice,
int SrcDevice) {
- TIMESCOPE();
+ TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
+ ";src_dev=" + std::to_string(SrcDevice) +
+ ";size=" + std::to_string(Length));
DP("Call to omp_target_memcpy, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
"src offset %zu, length %zu\n",
@@ -400,7 +405,9 @@ EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
size_t DstOffset, size_t SrcOffset,
int DstDevice, int SrcDevice,
int DepObjCount, omp_depend_t *DepObjList) {
- TIMESCOPE();
+ TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
+ ";src_dev=" + std::to_string(SrcDevice) +
+ ";size=" + std::to_string(Length));
DP("Call to omp_target_memcpy_async, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
"src offset %zu, length %zu\n",
@@ -429,7 +436,6 @@ omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
const size_t *DstOffsets, const size_t *SrcOffsets,
const size_t *DstDimensions, const size_t *SrcDimensions,
int DstDevice, int SrcDevice) {
- TIMESCOPE();
DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
"src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
@@ -488,7 +494,10 @@ EXTERN int omp_target_memcpy_rect_async(
const size_t *Volume, const size_t *DstOffsets, const size_t *SrcOffsets,
const size_t *DstDimensions, const size_t *SrcDimensions, int DstDevice,
int SrcDevice, int DepObjCount, omp_depend_t *DepObjList) {
- TIMESCOPE();
+ TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) +
+ ";src_dev=" + std::to_string(SrcDevice) +
+ ";size=" + std::to_string(ElementSize) +
+ ";num_dims=" + std::to_string(NumDims));
DP("Call to omp_target_memcpy_rect_async, dst device %d, src device %d, "
"dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
"src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp
index d9e87640161f26..61d9db17f51006 100644
--- a/openmp/libomptarget/src/interface.cpp
+++ b/openmp/libomptarget/src/interface.cpp
@@ -33,14 +33,12 @@ using namespace llvm::omp::target::ompt;
////////////////////////////////////////////////////////////////////////////////
/// adds requires flags
EXTERN void __tgt_register_requires(int64_t Flags) {
- TIMESCOPE();
PM->addRequirements(Flags);
}
////////////////////////////////////////////////////////////////////////////////
/// adds a target shared library to the target execution image
EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) {
- TIMESCOPE();
if (PM->delayRegisterLib(Desc))
return;
@@ -54,7 +52,6 @@ EXTERN void __tgt_init_all_rtls() { PM->initAllPlugins(); }
////////////////////////////////////////////////////////////////////////////////
/// unloads a target shared library
EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) {
- TIMESCOPE();
PM->unregisterLib(Desc);
}
@@ -68,7 +65,8 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
"TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
- TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc);
+ TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy",
+ "NumArgs=" + std::to_string(ArgNum), Loc);
DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
RegionName, DeviceId, ArgNum);
@@ -240,9 +238,6 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
KernelArgsTy *KernelArgs) {
static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
"Target AsyncInfoTy must be convertible to AsyncInfoTy.");
-
- TIMESCOPE_WITH_IDENT(Loc);
-
DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
"\n",
DeviceId, DPxPTR(HostPtr));
@@ -267,6 +262,11 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
assert(KernelArgs->ThreadLimit[0] == static_cast<uint32_t>(ThreadLimit) &&
!KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
"OpenMP interface should not use multiple dimensions");
+ TIMESCOPE_WITH_DETAILS_AND_IDENT(
+ "Runtime: target exe",
+ "NumTeams=" + std::to_string(NumTeams) +
+ ";NumArgs=" + std::to_string(KernelArgs->NumArgs),
+ Loc);
if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs,
@@ -297,13 +297,14 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
int Rc = OFFLOAD_SUCCESS;
Rc = target(Loc, *DeviceOrErr, HostPtr, *KernelArgs, AsyncInfo);
+ { // required to show synchronization
+ TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: syncronize", "", Loc);
+ if (Rc == OFFLOAD_SUCCESS)
+ Rc = AsyncInfo.synchronize();
- if (Rc == OFFLOAD_SUCCESS)
- Rc = AsyncInfo.synchronize();
-
- handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
- assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
-
+ handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
+ assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
+ }
return OMP_TGT_SUCCESS;
}
@@ -402,7 +403,6 @@ EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId,
// Get the current number of components for a user-defined mapper.
EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
- TIMESCOPE();
auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle;
int64_t Size = MapperComponentsPtr->Components.size();
DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n",
@@ -414,7 +414,6 @@ EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
EXTERN void __tgt_push_mapper_component(void *RtMapperHandle, void *Base,
void *Begin, int64_t Size, int64_t Type,
void *Name) {
- TIMESCOPE();
DP("__tgt_push_mapper_component(Handle=" DPxMOD
") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
", Type=0x%" PRIx64 ", Name=%s).\n",
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index e724b2f6db8b5f..a7d55d7ebd5391 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -392,7 +392,6 @@ static int32_t getParentIndex(int64_t Type) {
void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
const char *Name) {
- TIMESCOPE();
DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);
if (Size <= 0) {
@@ -419,7 +418,6 @@ void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
const char *Name) {
- TIMESCOPE();
DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum,
DPxPTR(DevicePtr));
@@ -444,7 +442,6 @@ void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
const char *Name) {
- TIMESCOPE();
DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size);
if (Size <= 0) {
@@ -471,7 +468,6 @@ void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
}
void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
- TIMESCOPE();
DP("Call to %s for device %d unlocking\n", Name, DeviceNum);
auto DeviceOrErr = PM->getDevice(DeviceNum);
@@ -531,14 +527,14 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
int64_t *ArgTypes, map_var_info_t *ArgNames,
void **ArgMappers, AsyncInfoTy &AsyncInfo,
bool FromMapper) {
- TIMESCOPE_WITH_IDENT(Loc);
// process each input.
for (int32_t I = 0; I < ArgNum; ++I) {
// Ignore private variables and arrays - there is no mapping for them.
if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
(ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
continue;
-
+ TIMESCOPE_WITH_DETAILS_AND_IDENT(
+ "HostToDev", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
if (ArgMappers && ArgMappers[I]) {
// Instead of executing the regular path of targetDataBegin, call the
// targetDataMapper variant which will call targetDataBegin again
@@ -913,7 +909,8 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
!TPR.Flags.IsHostPointer && DataSize != 0) {
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
-
+ TIMESCOPE_WITH_DETAILS_AND_IDENT(
+ "DevToHost", "Size=" + std::to_string(DataSize) + "B", Loc);
// Wait for any previous transfer if an event is present.
if (void *Event = TPR.getEntry()->getEvent()) {
if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
@@ -1403,7 +1400,6 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
SmallVector<ptrdiff_t> &TgtOffsets,
PrivateArgumentManagerTy &PrivateArgumentManager,
AsyncInfoTy &AsyncInfo) {
- TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", Loc);
auto DeviceOrErr = PM->getDevice(DeviceId);
if (!DeviceOrErr)
@@ -1537,7 +1533,7 @@ static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
map_var_info_t *ArgNames, void **ArgMappers,
PrivateArgumentManagerTy &PrivateArgumentManager,
AsyncInfoTy &AsyncInfo) {
- TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", Loc);
+
auto DeviceOrErr = PM->getDevice(DeviceId);
if (!DeviceOrErr)
FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
@@ -1639,7 +1635,12 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
{
assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
- TIMESCOPE_WITH_NAME_AND_IDENT("Initiate Kernel Launch", Loc);
+ TIMESCOPE_WITH_DETAILS_AND_IDENT(
+ "Kernel Target",
+ "NumArguments=" + std::to_string(KernelArgs.NumArgs) +
+ ";NumTeams=" + std::to_string(KernelArgs.NumTeams[0]) +
+ ";TripCount=" + std::to_string(KernelArgs.Tripcount),
+ Loc);
#ifdef OMPT_SUPPORT
assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&
More information about the Openmp-commits
mailing list