[Openmp-commits] [openmp] 0b9350f - Add mean_anyway to hpc config
Ron Lieberman via Openmp-commits
openmp-commits at lists.llvm.org
Tue Nov 29 13:12:33 PST 2022
Author: Ron Lieberman
Date: 2022-11-29T15:11:57-06:00
New Revision: 0b9350f3da7daf1d740bbbfab79d01613fcd29f4
URL: https://github.com/llvm/llvm-project/commit/0b9350f3da7daf1d740bbbfab79d01613fcd29f4
DIFF: https://github.com/llvm/llvm-project/commit/0b9350f3da7daf1d740bbbfab79d01613fcd29f4.diff
LOG: Add mean_anyway to hpc config
Added:
Modified:
openmp/libomptarget/DeviceRTL/src/Mapping.cpp
openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp
openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h
openmp/libomptarget/plugins/amdgpu/impl/internal.h
openmp/libomptarget/plugins/amdgpu/impl/system.cpp
openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
openmp/runtime/cmake/LibompHandleFlags.cmake
openmp/runtime/cmake/config-ix.cmake
Removed:
################################################################################
diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
index 512577c06f9e..6dd935e1128a 100644
--- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -16,7 +16,7 @@
#include "Utils.h"
#pragma omp begin declare target device_type(nohost)
-
+extern const uint16_t __oclc_ABI_version;
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
using namespace _OMP;
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp
index 69f2a716a8fd..f4a4ceaa92a8 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp
@@ -11,6 +11,7 @@
// identifier) and contains more up to date values for the enum checked here.
// rtl.cpp uses the system elf.h.
#include "llvm/BinaryFormat/ELF.h"
+using namespace llvm::ELF;
const char *get_elf_mach_gfx_name(uint32_t EFlags) {
using namespace llvm::ELF;
@@ -78,3 +79,8 @@ const char *get_elf_mach_gfx_name(uint32_t EFlags) {
return "--unknown gfx";
}
}
+
+const uint16_t implicitArgsSize(uint16_t Version) {
+ return Version < ELFABIVERSION_AMDGPU_HSA_V5 ? IMPLICITARGS::COV4_SIZE
+ : IMPLICITARGS::COV5_SIZE;
+}
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h
index 177963e1b8b5..a5404bd3d793 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h
@@ -12,4 +12,49 @@
const char *get_elf_mach_gfx_name(uint32_t EFlags);
+enum IMPLICITARGS : uint16_t {
+ COV4_SIZE = 56,
+ COV4_HOSTCALL_PTR_OFFSET = 24,
+ HOSTCALL_PTR_SIZE = 8,
+
+ COV5_SIZE = 256,
+
+ COV5_BLOCK_COUNT_X_OFFSET = 0,
+ COV5_BLOCK_COUNT_X_SIZE = 4,
+
+ COV5_BLOCK_COUNT_Y_OFFSET = 4,
+ COV5_BLOCK_COUNT_Y_SIZE = 4,
+
+ COV5_BLOCK_COUNT_Z_OFFSET = 8,
+ COV5_BLOCK_COUNT_Z_SIZE = 4,
+
+ COV5_GROUP_SIZE_X_OFFSET = 12,
+ COV5_GROUP_SIZE_X_SIZE = 2,
+
+ COV5_GROUP_SIZE_Y_OFFSET = 14,
+ COV5_GROUP_SIZE_Y_SIZE = 2,
+
+ COV5_GROUP_SIZE_Z_OFFSET = 16,
+ COV5_GROUP_SIZE_Z_SIZE = 2,
+
+ COV5_REMAINDER_X_OFFSET = 18,
+ COV5_REMAINDER_X_SIZE = 2,
+
+ COV5_REMAINDER_Y_OFFSET = 20,
+ COV5_REMAINDER_Y_SIZE = 2,
+
+ COV5_REMAINDER_Z_OFFSET = 22,
+ COV5_REMAINDER_Z_SIZE = 2,
+
+ COV5_GRID_DIMS_OFFSET = 64,
+ COV5_GRID_DIMS_SIZE = 2,
+
+ COV5_HOSTCALL_PTR_OFFSET = 80,
+
+ COV5_HEAPV1_PTR_OFFSET = 96,
+ COV5_HEAPV1_PTR_SIZE = 8
+};
+
+const uint16_t implicitArgsSize(uint16_t Version);
+
#endif
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h
index 63b60b24a557..dc94b0ed01f2 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h
+++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h
@@ -33,17 +33,6 @@
#define MAX_NUM_KERNELS (1024 * 16)
-typedef struct impl_implicit_args_s {
- uint64_t offset_x;
- uint64_t offset_y;
- uint64_t offset_z;
- uint64_t hostcall_ptr;
- uint64_t unused0;
- uint64_t unused1;
- uint64_t unused2;
-} impl_implicit_args_t;
-static_assert(sizeof(impl_implicit_args_t) == 56, "");
-
// ---------------------- Kernel Start -------------
typedef struct atl_kernel_info_s {
uint64_t kernel_object;
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
index e8dba47b6cde..0170cd4440f5 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
@@ -67,6 +67,17 @@ class KernelArgMD {
HiddenMultiGridSyncArg,
HiddenHostcallBuffer,
HiddenHeapV1,
+ HiddenBlockCountX,
+ HiddenBlockCountY,
+ HiddenBlockCountZ,
+ HiddenGroupSizeX,
+ HiddenGroupSizeY,
+ HiddenGroupSizeZ,
+ HiddenRemainderX,
+ HiddenRemainderY,
+ HiddenRemainderZ,
+ HiddenGridDims,
+ HiddenQueuePtr,
Unknown
};
@@ -102,7 +113,19 @@ static const std::map<std::string, KernelArgMD::ValueKind> ArgValueKind = {
{"hidden_multigrid_sync_arg",
KernelArgMD::ValueKind::HiddenMultiGridSyncArg},
{"hidden_hostcall_buffer", KernelArgMD::ValueKind::HiddenHostcallBuffer},
- {"hidden_heap_v1", KernelArgMD::ValueKind::HiddenHeapV1}};
+ {"hidden_heap_v1", KernelArgMD::ValueKind::HiddenHeapV1},
+ {"hidden_block_count_x", KernelArgMD::ValueKind::HiddenBlockCountX},
+ {"hidden_block_count_y", KernelArgMD::ValueKind::HiddenBlockCountY},
+ {"hidden_block_count_z", KernelArgMD::ValueKind::HiddenBlockCountZ},
+ {"hidden_group_size_x", KernelArgMD::ValueKind::HiddenGroupSizeX},
+ {"hidden_group_size_y", KernelArgMD::ValueKind::HiddenGroupSizeY},
+ {"hidden_group_size_z", KernelArgMD::ValueKind::HiddenGroupSizeZ},
+ {"hidden_remainder_x", KernelArgMD::ValueKind::HiddenRemainderX},
+ {"hidden_remainder_y", KernelArgMD::ValueKind::HiddenRemainderY},
+ {"hidden_remainder_z", KernelArgMD::ValueKind::HiddenRemainderZ},
+ {"hidden_grid_dims", KernelArgMD::ValueKind::HiddenGridDims},
+ {"hidden_queue_ptr", KernelArgMD::ValueKind::HiddenQueuePtr},
+};
namespace core {
@@ -164,6 +187,17 @@ static bool isImplicit(KernelArgMD::ValueKind value_kind) {
case KernelArgMD::ValueKind::HiddenMultiGridSyncArg:
case KernelArgMD::ValueKind::HiddenHostcallBuffer:
case KernelArgMD::ValueKind::HiddenHeapV1:
+ case KernelArgMD::ValueKind::HiddenBlockCountX:
+ case KernelArgMD::ValueKind::HiddenBlockCountY:
+ case KernelArgMD::ValueKind::HiddenBlockCountZ:
+ case KernelArgMD::ValueKind::HiddenGroupSizeX:
+ case KernelArgMD::ValueKind::HiddenGroupSizeY:
+ case KernelArgMD::ValueKind::HiddenGroupSizeZ:
+ case KernelArgMD::ValueKind::HiddenRemainderX:
+ case KernelArgMD::ValueKind::HiddenRemainderY:
+ case KernelArgMD::ValueKind::HiddenRemainderZ:
+ case KernelArgMD::ValueKind::HiddenGridDims:
+ case KernelArgMD::ValueKind::HiddenQueuePtr:
return true;
default:
return false;
@@ -473,8 +507,7 @@ static hsa_status_t get_code_object_custom_metadata(
size_t new_offset = lcArg.offset_;
size_t padding = new_offset - offset;
offset = new_offset;
- DP("Arg[%lu] \"%s\" (%u, %u)\n", i, lcArg.name_.c_str(), lcArg.size_,
- lcArg.offset_);
+
offset += lcArg.size_;
// check if the arg is a hidden/implicit arg
@@ -482,9 +515,13 @@ static hsa_status_t get_code_object_custom_metadata(
if (!isImplicit(lcArg.valueKind_)) {
info.explicit_argument_count++;
kernel_explicit_args_size += lcArg.size_;
+ DP("Explicit Kernel Arg[%lu] \"%s\" (%u, %u)\n", i,
+ lcArg.name_.c_str(), lcArg.size_, lcArg.offset_);
} else {
info.implicit_argument_count++;
hasHiddenArgs = true;
+ DP("Implicit Kernel Arg[%lu] \"%s\" (%u, %u)\n", i,
+ lcArg.name_.c_str(), lcArg.size_, lcArg.offset_);
}
kernel_explicit_args_size += padding;
}
@@ -492,7 +529,7 @@ static hsa_status_t get_code_object_custom_metadata(
// TODO: Probably don't want this arithmetic
info.kernel_segment_size =
- (hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size);
+ (!hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size);
DP("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(),
kernel_segment_size, info.kernel_segment_size);
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index b0e29cb6e4e9..38879c8e6eb8 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -124,9 +124,10 @@ struct KernelArgPool {
uint32_t KernargSegmentSize;
void *KernargRegion = nullptr;
std::queue<int> FreeKernargSegments;
+ uint16_t CodeObjectVersion;
uint32_t kernargSizeIncludingImplicit() {
- return KernargSegmentSize + sizeof(impl_implicit_args_t);
+ return KernargSegmentSize + implicitArgsSize(CodeObjectVersion);
}
~KernelArgPool() {
@@ -143,8 +144,10 @@ struct KernelArgPool {
KernelArgPool(const KernelArgPool &) = delete;
KernelArgPool(KernelArgPool &&) = delete;
- KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
- : KernargSegmentSize(KernargSegmentSize) {
+ KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool,
+ uint16_t CodeObjectVersion)
+ : KernargSegmentSize(KernargSegmentSize),
+ CodeObjectVersion(CodeObjectVersion) {
// impl uses one pool per kernel for all gpus, with a fixed upper size
// preserving that exact scheme here, including the queue<int>
@@ -228,16 +231,16 @@ struct KernelTy {
KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
int32_t DeviceId, void *CallStackAddr, const char *Name,
uint32_t KernargSegmentSize,
- hsa_amd_memory_pool_t &KernArgMemoryPool)
+ hsa_amd_memory_pool_t &KernArgMemoryPool, uint16_t CodeObjectVersion)
: ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
std::string N(Name);
if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
- KernelArgPoolMap.insert(
- std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
- KernargSegmentSize, KernArgMemoryPool))));
+ KernelArgPoolMap.insert(std::make_pair(
+ N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
+ KernargSegmentSize, KernArgMemoryPool, CodeObjectVersion))));
}
}
};
@@ -474,6 +477,7 @@ class RTLDeviceInfoTy : HSALifetime {
std::vector<int> WarpSize;
std::vector<std::string> GPUName;
std::vector<std::string> TargetID;
+ uint16_t CodeObjectVersion;
// OpenMP properties
std::vector<int> NumTeams;
@@ -487,6 +491,7 @@ class RTLDeviceInfoTy : HSALifetime {
// Resource pools
SignalPoolT FreeSignalPool;
+ std::vector<void *> PreallocatedDeviceHeap;
bool HostcallRequired = false;
@@ -861,7 +866,6 @@ class RTLDeviceInfoTy : HSALifetime {
"Unexpected device id!");
FuncGblEntries[DeviceId].emplace_back();
FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
- // KernelArgPoolMap.clear();
E.Entries.clear();
E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
}
@@ -1032,6 +1036,7 @@ class RTLDeviceInfoTy : HSALifetime {
SymbolInfoTable.resize(NumberOfDevices);
DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
+ PreallocatedDeviceHeap.resize(NumberOfDevices);
Err = setupDevicePools(HSAAgents);
if (Err != HSA_STATUS_SUCCESS) {
@@ -1361,6 +1366,27 @@ static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) {
return PacketId;
}
+const uint16_t getCodeObjectVersionFromELF(__tgt_device_image *Image) {
+ char *ImageBegin = (char *)Image->ImageStart;
+ size_t ImageSize = (char *)Image->ImageEnd - ImageBegin;
+
+ StringRef Buffer = StringRef(ImageBegin, ImageSize);
+ auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
+ /*InitContent=*/false);
+ if (!ElfOrErr) {
+ REPORT("Failed to load ELF: %s\n", toString(ElfOrErr.takeError()).c_str());
+ return 1;
+ }
+
+ if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get())) {
+ auto Header = ELFObj->getELFFile().getHeader();
+ uint16_t Version = (uint8_t)(Header.e_ident[EI_ABIVERSION]);
+ DP("ELFABIVERSION Version: %u\n", Version);
+ return Version;
+ }
+ return 0;
+}
+
int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams,
int32_t ThreadLimit, uint64_t LoopTripcount) {
@@ -1438,6 +1464,7 @@ int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
}
uint64_t PacketId = acquireAvailablePacketId(Queue);
+ uint16_t CodeObjectVersion = DeviceInfo().CodeObjectVersion;
const uint32_t Mask = Queue->size - 1; // size is a power of 2
hsa_kernel_dispatch_packet_t *Packet =
(hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask);
@@ -2160,6 +2187,40 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
return Res;
}
+static void preAllocateHeapMemoryForCov5() {
+ void *DevPtr;
+ for (int I = 0; I < DeviceInfo().NumberOfDevices; I++) {
+ DevPtr = nullptr;
+ size_t PreAllocSize = 131072; // 128KB per device
+
+ hsa_amd_memory_pool_t MemoryPool =
+ DeviceInfo().DeviceCoarseGrainedMemoryPools[I];
+ hsa_status_t Err =
+ hsa_amd_memory_pool_allocate(MemoryPool, PreAllocSize, 0, &DevPtr);
+ if (Err != HSA_STATUS_SUCCESS) {
+ DP("Error allocating preallocated heap device memory: %s\n",
+ get_error_string(Err));
+ }
+
+ Err = hsa_amd_agents_allow_access(1, &DeviceInfo().HSAAgents[I], NULL,
+ DevPtr);
+ if (Err != HSA_STATUS_SUCCESS) {
+ DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
+ get_error_string(Err));
+ }
+
+ uint64_t Rounded =
+ sizeof(uint32_t) * ((PreAllocSize + 3) / sizeof(uint32_t));
+ Err = hsa_amd_memory_fill(DevPtr, 0, Rounded / sizeof(uint32_t));
+ if (Err != HSA_STATUS_SUCCESS) {
+ DP("Error zero-initializing preallocated heap device memory:%s\n",
+ get_error_string(Err));
+ }
+
+ DeviceInfo().PreallocatedDeviceHeap[I] = DevPtr;
+ }
+}
+
__tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
__tgt_device_image *Image) {
// This function loads the device image onto gpu[DeviceId] and does other
@@ -2194,6 +2255,12 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
if (!elfMachineIdIsAmdgcn(Image))
return NULL;
+ DeviceInfo().CodeObjectVersion = getCodeObjectVersionFromELF(Image);
+ if (DeviceInfo().CodeObjectVersion >=
+ llvm::ELF::ELFABIVERSION_AMDGPU_HSA_V5) {
+ preAllocateHeapMemoryForCov5();
+ }
+
{
auto Env =
DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices,
@@ -2517,7 +2584,8 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
CallStackAddr, E->name, KernargSegmentSize,
- DeviceInfo().KernArgPool));
+ DeviceInfo().KernArgPool,
+ DeviceInfo().CodeObjectVersion));
__tgt_offload_entry Entry = *E;
Entry.addr = (void *)&KernelsList.back();
DeviceInfo().addOffloadEntry(DeviceId, Entry);
diff --git a/openmp/runtime/cmake/LibompHandleFlags.cmake b/openmp/runtime/cmake/LibompHandleFlags.cmake
index a6adbe3f2f54..684eae9f0b25 100644
--- a/openmp/runtime/cmake/LibompHandleFlags.cmake
+++ b/openmp/runtime/cmake/LibompHandleFlags.cmake
@@ -100,7 +100,7 @@ function(libomp_get_ldflags ldflags)
libomp_append(ldflags_local "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}${LIBOMP_VERSION_MAJOR}.${LIBOMP_VERSION_MINOR}"
IF_DEFINED CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG)
libomp_append(ldflags_local -Wl,--as-needed LIBOMP_HAVE_AS_NEEDED_FLAG)
- libomp_append(ldflags_local "-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt" LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
+ libomp_append(ldflags_local "-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt -Wl,--undefined-version" LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
libomp_append(ldflags_local "-Wl,--undefined-version" LIBOMP_HAVE_UNDEFINED_VERSION_FLAG) # FIXME issue #58858
libomp_append(ldflags_local -static-libgcc LIBOMP_HAVE_STATIC_LIBGCC_FLAG)
libomp_append(ldflags_local -Wl,-z,noexecstack LIBOMP_HAVE_Z_NOEXECSTACK_FLAG)
diff --git a/openmp/runtime/cmake/config-ix.cmake b/openmp/runtime/cmake/config-ix.cmake
index 1e02d5a8b5cf..d1346121edf5 100644
--- a/openmp/runtime/cmake/config-ix.cmake
+++ b/openmp/runtime/cmake/config-ix.cmake
@@ -131,7 +131,7 @@ if(WIN32)
elseif(NOT APPLE)
libomp_check_linker_flag(-Wl,-x LIBOMP_HAVE_X_FLAG)
libomp_check_linker_flag(-Wl,--as-needed LIBOMP_HAVE_AS_NEEDED_FLAG)
- libomp_check_linker_flag("-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt" LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
+ libomp_check_linker_flag("-Wl,--version-script=${LIBOMP_SRC_DIR}/exports_so.txt -Wl,--undefined-version" LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
libomp_check_linker_flag("-Wl,--undefined-version" LIBOMP_HAVE_UNDEFINED_VERSION_FLAG) # FIXME issue #58858
libomp_check_linker_flag(-static-libgcc LIBOMP_HAVE_STATIC_LIBGCC_FLAG)
libomp_check_linker_flag(-Wl,-z,noexecstack LIBOMP_HAVE_Z_NOEXECSTACK_FLAG)
More information about the Openmp-commits
mailing list