[llvm-branch-commits] [openmp] ee1c408 - Revert "[OpenMP][libomptarget] Enable automatic unified shared memory executi… (#75999)"

via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Mon Jan 8 12:37:30 PST 2024


Author: carlobertolli
Date: 2024-01-08T14:37:25-06:00
New Revision: ee1c408206ac48c612f9f4b8cc526d9167097090

URL: https://github.com/llvm/llvm-project/commit/ee1c408206ac48c612f9f4b8cc526d9167097090
DIFF: https://github.com/llvm/llvm-project/commit/ee1c408206ac48c612f9f4b8cc526d9167097090.diff

LOG: Revert "[OpenMP][libomptarget] Enable automatic unified shared memory executi… (#75999)"

This reverts commit 22a73e7c4616e0405db85598c049a7ca70cca7cc.

Added: 
    

Modified: 
    openmp/libomptarget/include/Shared/PluginAPI.h
    openmp/libomptarget/include/Shared/PluginAPI.inc
    openmp/libomptarget/include/Shared/Requirements.h
    openmp/libomptarget/include/device.h
    openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
    openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
    openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
    openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
    openmp/libomptarget/src/OpenMP/Mapping.cpp
    openmp/libomptarget/src/PluginManager.cpp
    openmp/libomptarget/src/device.cpp

Removed: 
    openmp/libomptarget/test/mapping/auto_zero_copy.cpp


################################################################################
diff  --git a/openmp/libomptarget/include/Shared/PluginAPI.h b/openmp/libomptarget/include/Shared/PluginAPI.h
index aece53d7ee1caa..c6aacf4ce2124b 100644
--- a/openmp/libomptarget/include/Shared/PluginAPI.h
+++ b/openmp/libomptarget/include/Shared/PluginAPI.h
@@ -219,9 +219,6 @@ int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
                                            void *VAddr, bool isRecord,
                                            bool SaveOutput,
                                            uint64_t &ReqPtrArgOffset);
-
-// Returns true if the device \p DeviceId suggests to use auto zero-copy.
-int32_t __tgt_rtl_use_auto_zero_copy(int32_t DeviceId);
 }
 
 #endif // OMPTARGET_SHARED_PLUGIN_API_H

diff  --git a/openmp/libomptarget/include/Shared/PluginAPI.inc b/openmp/libomptarget/include/Shared/PluginAPI.inc
index b842c6eef1d4fc..25ebe7d437f9d1 100644
--- a/openmp/libomptarget/include/Shared/PluginAPI.inc
+++ b/openmp/libomptarget/include/Shared/PluginAPI.inc
@@ -47,4 +47,3 @@ PLUGIN_API_HANDLE(data_notify_mapped, false);
 PLUGIN_API_HANDLE(data_notify_unmapped, false);
 PLUGIN_API_HANDLE(set_device_offset, false);
 PLUGIN_API_HANDLE(initialize_record_replay, false);
-PLUGIN_API_HANDLE(use_auto_zero_copy, false);

diff  --git a/openmp/libomptarget/include/Shared/Requirements.h b/openmp/libomptarget/include/Shared/Requirements.h
index b16a1650f0c403..19d6b8ffca495f 100644
--- a/openmp/libomptarget/include/Shared/Requirements.h
+++ b/openmp/libomptarget/include/Shared/Requirements.h
@@ -33,12 +33,7 @@ enum OpenMPOffloadingRequiresDirFlags : int64_t {
   /// unified_shared_memory clause.
   OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
   /// dynamic_allocators clause.
-  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
-  /// Auto zero-copy extension:
-  /// when running on an APU, the GPU plugin may decide to
-  /// run in zero-copy even though the user did not program
-  /// their application with unified_shared_memory requirement.
-  OMPX_REQ_AUTO_ZERO_COPY = 0x020
+  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010
 };
 
 class RequirementCollection {
@@ -70,14 +65,6 @@ class RequirementCollection {
       return;
     }
 
-    // Auto zero-copy is only valid when no other requirement has been set
-    // and it is computed at device initialization time, after the requirement
-    // flag has already been set to OMP_REQ_NONE.
-    if (SetFlags == OMP_REQ_NONE && NewFlags == OMPX_REQ_AUTO_ZERO_COPY) {
-      SetFlags = NewFlags;
-      return;
-    }
-
     // If multiple compilation units are present enforce
     // consistency across all of them for require clauses:
     //  - reverse_offload

diff  --git a/openmp/libomptarget/include/device.h b/openmp/libomptarget/include/device.h
index 8b4396ac468d78..d28d3c508faf56 100644
--- a/openmp/libomptarget/include/device.h
+++ b/openmp/libomptarget/include/device.h
@@ -164,9 +164,6 @@ struct DeviceTy {
   /// Print all offload entries to stderr.
   void dumpOffloadEntries();
 
-  /// Ask the device whether the runtime should use auto zero-copy.
-  bool useAutoZeroCopy();
-
 private:
   /// Deinitialize the device (and plugin).
   void deinit();

diff  --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index b5f0baee23dc2c..b67642e9e1bcb3 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1848,9 +1848,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
         OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
         OMPX_UseMultipleSdmaEngines(
             "LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false),
-        HSAXnackEnv("HSA_XNACK", false), AMDGPUStreamManager(*this, Agent),
-        AMDGPUEventManager(*this), AMDGPUSignalManager(*this), Agent(Agent),
-        HostDevice(HostDevice) {}
+        AMDGPUStreamManager(*this, Agent), AMDGPUEventManager(*this),
+        AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice) {}
 
   ~AMDGPUDeviceTy() {}
 
@@ -1941,10 +1940,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     if (auto Err = AMDGPUSignalManager.init(OMPX_InitialNumSignals))
       return Err;
 
-    // detect if device is an APU.
-    if (auto Err = checkIfAPU())
-      return Err;
-
     return Plugin::success();
   }
 
@@ -2636,14 +2631,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     return Plugin::success();
   }
 
-  /// Returns true if auto zero-copy the best configuration for the current
-  /// arch.
-  bool useAutoZeroCopyImpl() override {
-    // XNACK can be enabled with with kernel boot parameter or with
-    // environment variable.
-    return (IsAPU && (HSAXnackEnv || utils::isXnackEnabledViaKernelParam()));
-  }
-
   /// Getters and setters for stack and heap sizes.
   Error getDeviceStackSize(uint64_t &Value) override {
     Value = StackSize;
@@ -2741,30 +2728,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     return Err;
   }
 
-  /// Detect if current architecture is an APU.
-  Error checkIfAPU() {
-    std::string StrGfxName(ComputeUnitKind);
-    std::transform(std::begin(StrGfxName), std::end(StrGfxName),
-                   std::begin(StrGfxName),
-                   [](char c) { return std::tolower(c); });
-    if (StrGfxName == "gfx940") {
-      IsAPU = true;
-      return Plugin::success();
-    }
-    if (StrGfxName == "gfx942") {
-      // can be MI300A or MI300X
-      uint32_t ChipID = 0;
-      if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_CHIP_ID, ChipID))
-        return Err;
-
-      if (!(ChipID & 0x1)) {
-        IsAPU = true;
-        return Plugin::success();
-      }
-    }
-    return Plugin::success();
-  }
-
   /// Envar for controlling the number of HSA queues per device. High number of
   /// queues may degrade performance.
   UInt32Envar OMPX_NumQueues;
@@ -2801,9 +2764,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// Use ROCm 5.7 interface for multiple SDMA engines
   BoolEnvar OMPX_UseMultipleSdmaEngines;
 
-  /// Value of HSA_XNACK environment variable.
-  BoolEnvar HSAXnackEnv;
-
   /// Stream manager for AMDGPU streams.
   AMDGPUStreamManagerTy AMDGPUStreamManager;
 
@@ -2834,9 +2794,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// The current size of the stack that will be used in cases where it could
   /// not be statically determined.
   uint64_t StackSize = 16 * 1024 /* 16 KB */;
-
-  /// Is the plugin associated with an APU?
-  bool IsAPU{false};
 };
 
 Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {

diff  --git a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
index c5a58f82441458..58a3b5df00fac6 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@@ -116,34 +116,6 @@ inline bool isImageCompatibleWithEnv(StringRef ImageArch, uint32_t ImageFlags,
   return true;
 }
 
-inline bool isXnackEnabledViaKernelParam() {
-
-  ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrError =
-      MemoryBuffer::getFileAsStream("/proc/cmdline");
-
-  if (std::error_code ErrorCode = FileOrError.getError()) {
-    FAILURE_MESSAGE("Cannot open /proc/cmdline : %s\n",
-                    ErrorCode.message().c_str());
-    return false;
-  }
-
-  StringRef FileContent = (FileOrError.get())->getBuffer();
-
-  StringRef RefString("amdgpu.noretry=");
-  int SizeOfRefString = RefString.size();
-
-  size_t Pos = FileContent.find_insensitive(RefString);
-  // Is noretry defined?
-  if (Pos != StringRef::npos) {
-    bool NoRetryValue = FileContent[Pos + SizeOfRefString] - '0';
-    // is noretry set to 0
-    if (!NoRetryValue)
-      return true;
-  }
-
-  return false;
-}
-
 struct KernelMetaDataTy {
   uint64_t KernelObject;
   uint32_t GroupSegmentList;

diff  --git a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
index abe85f43c2e726..b85dc146d86d2f 100644
--- a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
@@ -872,11 +872,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
 
   virtual Error getDeviceStackSize(uint64_t &V) = 0;
 
-  /// Returns true if current plugin architecture is an APU
-  /// and unified_shared_memory was not requested by the program.
-  bool useAutoZeroCopy();
-  virtual bool useAutoZeroCopyImpl() { return false; }
-
 private:
   /// Register offload entry for global variable.
   Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage,

diff  --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
index e82c2f7bef14f0..9490e58fc669cd 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1561,8 +1561,6 @@ Error GenericDeviceTy::syncEvent(void *EventPtr) {
   return syncEventImpl(EventPtr);
 }
 
-bool GenericDeviceTy::useAutoZeroCopy() { return useAutoZeroCopyImpl(); }
-
 Error GenericPluginTy::init() {
   auto NumDevicesOrErr = initImpl();
   if (!NumDevicesOrErr)
@@ -2075,14 +2073,6 @@ int32_t __tgt_rtl_set_device_offset(int32_t DeviceIdOffset) {
   return OFFLOAD_SUCCESS;
 }
 
-int32_t __tgt_rtl_use_auto_zero_copy(int32_t DeviceId) {
-  // Automatic zero-copy only applies to programs that did
-  // not request unified_shared_memory and are deployed on an
-  // APU with XNACK enabled.
-  if (Plugin::get().getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY)
-    return false;
-  return Plugin::get().getDevice(DeviceId).useAutoZeroCopy();
-}
 #ifdef __cplusplus
 }
 #endif

diff  --git a/openmp/libomptarget/src/OpenMP/Mapping.cpp b/openmp/libomptarget/src/OpenMP/Mapping.cpp
index 87ab70dec2a2d8..a5c24810e0af95 100644
--- a/openmp/libomptarget/src/OpenMP/Mapping.cpp
+++ b/openmp/libomptarget/src/OpenMP/Mapping.cpp
@@ -252,9 +252,8 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer(
       MESSAGE("device mapping required by 'present' map type modifier does not "
               "exist for host address " DPxMOD " (%" PRId64 " bytes)",
               DPxPTR(HstPtrBegin), Size);
-  } else if ((PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY &&
-              !HasCloseModifier) ||
-             (PM->getRequirements() & OMPX_REQ_AUTO_ZERO_COPY)) {
+  } else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY &&
+             !HasCloseModifier) {
     // If unified shared memory is active, implicitly mapped variables that are
     // not privatized use host address. Any explicitly mapped variables also use
     // host address where correctness is not impeded. In all other cases maps
@@ -262,10 +261,6 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer(
     // In addition to the mapping rules above, the close map modifier forces the
     // mapping of the variable to the device.
     if (Size) {
-      INFO(OMP_INFOTYPE_MAPPING_CHANGED, Device.DeviceID,
-           "Return HstPtrBegin " DPxMOD " Size=%" PRId64 " for unified shared "
-           "memory\n",
-           DPxPTR((uintptr_t)HstPtrBegin), Size);
       DP("Return HstPtrBegin " DPxMOD " Size=%" PRId64 " for unified shared "
          "memory\n",
          DPxPTR((uintptr_t)HstPtrBegin), Size);
@@ -420,8 +415,7 @@ TargetPointerResultTy MappingInfoTy::getTgtPtrBegin(
          LR.TPR.getEntry()->dynRefCountToStr().c_str(), DynRefCountAction,
          LR.TPR.getEntry()->holdRefCountToStr().c_str(), HoldRefCountAction);
     LR.TPR.TargetPointer = (void *)TP;
-  } else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY ||
-             PM->getRequirements() & OMPX_REQ_AUTO_ZERO_COPY) {
+  } else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY) {
     // If the value isn't found in the mapping and unified shared memory
     // is on then it means we have stumbled upon a value which we need to
     // use directly from the host.

diff  --git a/openmp/libomptarget/src/PluginManager.cpp b/openmp/libomptarget/src/PluginManager.cpp
index 82b0ecdcd647a8..da2e08180eead8 100644
--- a/openmp/libomptarget/src/PluginManager.cpp
+++ b/openmp/libomptarget/src/PluginManager.cpp
@@ -144,33 +144,19 @@ void PluginAdaptorTy::initDevices(PluginManager &PM) {
 
   int32_t NumPD = getNumberOfPluginDevices();
   ExclusiveDevicesAccessor->reserve(DeviceOffset + NumPD);
-  // Auto zero-copy is a per-device property. We need to ensure
-  // that all devices are suggesting to use it.
-  bool UseAutoZeroCopy = true;
-  if (NumPD == 0)
-    UseAutoZeroCopy = false;
   for (int32_t PDevI = 0, UserDevId = DeviceOffset; PDevI < NumPD; PDevI++) {
     auto Device = std::make_unique<DeviceTy>(this, UserDevId, PDevI);
-
     if (auto Err = Device->init()) {
       DP("Skip plugin known device %d: %s\n", PDevI,
          toString(std::move(Err)).c_str());
       continue;
     }
-    UseAutoZeroCopy = UseAutoZeroCopy && Device->useAutoZeroCopy();
 
     ExclusiveDevicesAccessor->push_back(std::move(Device));
     ++NumberOfUserDevices;
     ++UserDevId;
   }
 
-  // Auto Zero-Copy can only be currently triggered when the system is an
-  // homogeneous APU architecture without attached discrete GPUs.
-  // If all devices suggest to use it, change requirment flags to trigger
-  // zero-copy behavior when mapping memory.
-  if (UseAutoZeroCopy)
-    PM.addRequirements(OMPX_REQ_AUTO_ZERO_COPY);
-
   DP("Plugin adaptor " DPxMOD " has index %d, exposes %d out of %d devices!\n",
      DPxPTR(LibraryHandler.get()), DeviceOffset, NumberOfUserDevices,
      NumberOfPluginDevices);

diff  --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
index 919c4b55c03655..dbad13b92bcc14 100644
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -340,9 +340,3 @@ void DeviceTy::dumpOffloadEntries() {
     fprintf(stderr, "  %11s: %s\n", Kind, It.second->getNameAsCStr());
   }
 }
-
-bool DeviceTy::useAutoZeroCopy() {
-  if (RTL->use_auto_zero_copy)
-    return RTL->use_auto_zero_copy(RTLDeviceID);
-  return false;
-}

diff  --git a/openmp/libomptarget/test/mapping/auto_zero_copy.cpp b/openmp/libomptarget/test/mapping/auto_zero_copy.cpp
deleted file mode 100644
index 80bb3d24a0c6e4..00000000000000
--- a/openmp/libomptarget/test/mapping/auto_zero_copy.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-
-// RUN: %libomptarget-compilexx-generic
-// RUN: env HSA_XNACK=1 LIBOMPTARGET_INFO=30 %libomptarget-run-generic 2>&1 \
-// RUN: | %fcheck-generic -check-prefix=INFO_ZERO -check-prefix=CHECK
-
-// RUN: %libomptarget-compilexx-generic
-// RUN: env HSA_XNACK=1 LIBOMPTARGET_INFO=30 USE_USM=1 %libomptarget-run-generic 2>&1 \
-// RUN: | %fcheck-generic -check-prefix=INFO_ZERO -check-prefix=CHECK
-
-// RUN: %libomptarget-compilexx-generic
-// RUN: env HSA_XNACK=0 LIBOMPTARGET_INFO=30 %libomptarget-run-generic 2>&1 \
-// RUN: | %fcheck-generic -check-prefix=INFO_COPY -check-prefix=CHECK
-
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: nvptx64-nvidia-cuda
-// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
-// UNSUPPORTED: x86_64-pc-linux-gnu
-// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
-
-#include <cstdio>
-
-#if (USE_USM == 1)
-#pragma omp requires unified_shared_memory
-#endif
-
-int main() {
-  int n = 1024;
-
-  // test various mapping types
-  int *a = new int[n];
-  int k = 3;
-  int b[n];
-
-  for (int i = 0; i < n; i++)
-    b[i] = i;
-
-  // INFO_ZERO: Return HstPtrBegin 0x{{.*}} Size=4096 for unified shared memory
-  // INFO_ZERO: Return HstPtrBegin 0x{{.*}} Size=4096 for unified shared memory
-
-  // INFO_COPY: Creating new map entry with HstPtrBase=0x{{.*}}, HstPtrBegin=0x{{.*}}, TgtAllocBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096,
-  // INFO_COPY: Creating new map entry with HstPtrBase=0x{{.*}}, HstPtrBegin=0x{{.*}}, TgtAllocBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096,
-  // INFO_COPY: Mapping exists with HstPtrBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096, DynRefCount=1 (update suppressed)
-  // INFO_COPY: Mapping exists with HstPtrBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096, DynRefCount=1 (update suppressed)
-#pragma omp target teams distribute parallel for map(tofrom : a[ : n])         \
-    map(to : b[ : n])
-  for (int i = 0; i < n; i++)
-    a[i] = i + b[i] + k;
-
-  int err = 0;
-  for (int i = 0; i < n; i++)
-    if (a[i] != i + b[i] + k)
-      err++;
-
-  // CHECK: PASS
-  if (err == 0)
-    printf("PASS\n");
-  return err;
-}


        


More information about the llvm-branch-commits mailing list