[Openmp-commits] [openmp] [OpenMP] Enable automatic unified shared memory on MI300A. (PR #77512)

via Openmp-commits openmp-commits at lists.llvm.org
Tue Jan 16 08:22:19 PST 2024


https://github.com/carlobertolli updated https://github.com/llvm/llvm-project/pull/77512

>From c4e765d0e1305cc961c7ee6de0db8d02387cbee5 Mon Sep 17 00:00:00 2001
From: Carlo Bertolli <carlo.bertolli at amd.com>
Date: Tue, 9 Jan 2024 11:53:42 -0600
Subject: [PATCH] [OpenMP] Enable automatic unified shared memory on MI300A.

This patch enables applications that did not request OpenMP unified_shared_memory to run with the same zero-copy behavior,
where mapped memory does not result in extra memory allocations and memory copies, but CPU-allocated memory is accessed from the device.
The name for this behavior is "automatic zero-copy" and it relies on detecting: that the runtime is running on a MI300A,
that the user did not select unified_shared_memory in their program, and that XNACK (unified memory support) is enabled in the current GPU configuration.
If all these conditions are met, then automatic zero-copy is triggered.

This patch also introduces an environment variable OMPX_APU_MAPS that, if set, triggers automatic zero-copy
also on non APU GPUs (e.g., on discrete GPUs).
This patch is still missing support for global variables, which will be provided in a subsequent patch.

Co-authored-by: Thorsten Blass <thorsten.blass at amd.com>
Co-authored-by: Carlo Bertolli <carlo.bertolli at amd.com>
---
 .../libomptarget/include/Shared/PluginAPI.h   |   3 +
 .../libomptarget/include/Shared/PluginAPI.inc |   1 +
 .../include/Shared/Requirements.h             |  15 ++-
 openmp/libomptarget/include/device.h          |   3 +
 .../amdgpu/dynamic_hsa/hsa_ext_amd.h          |   1 +
 .../plugins-nextgen/amdgpu/src/rtl.cpp        | 122 ++++++++++++++----
 .../common/include/PluginInterface.h          |   5 +
 .../common/src/PluginInterface.cpp            |  10 ++
 openmp/libomptarget/src/OpenMP/Mapping.cpp    |  13 +-
 openmp/libomptarget/src/PluginManager.cpp     |  11 ++
 openmp/libomptarget/src/device.cpp            |   6 +
 .../test/mapping/auto_zero_copy.cpp           |  55 ++++++++
 12 files changed, 217 insertions(+), 28 deletions(-)
 create mode 100644 openmp/libomptarget/test/mapping/auto_zero_copy.cpp

diff --git a/openmp/libomptarget/include/Shared/PluginAPI.h b/openmp/libomptarget/include/Shared/PluginAPI.h
index c6aacf4ce2124b..aece53d7ee1caa 100644
--- a/openmp/libomptarget/include/Shared/PluginAPI.h
+++ b/openmp/libomptarget/include/Shared/PluginAPI.h
@@ -219,6 +219,9 @@ int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
                                            void *VAddr, bool isRecord,
                                            bool SaveOutput,
                                            uint64_t &ReqPtrArgOffset);
+
+// Returns true if the device \p DeviceId suggests to use auto zero-copy.
+int32_t __tgt_rtl_use_auto_zero_copy(int32_t DeviceId);
 }
 
 #endif // OMPTARGET_SHARED_PLUGIN_API_H
diff --git a/openmp/libomptarget/include/Shared/PluginAPI.inc b/openmp/libomptarget/include/Shared/PluginAPI.inc
index 25ebe7d437f9d1..b842c6eef1d4fc 100644
--- a/openmp/libomptarget/include/Shared/PluginAPI.inc
+++ b/openmp/libomptarget/include/Shared/PluginAPI.inc
@@ -47,3 +47,4 @@ PLUGIN_API_HANDLE(data_notify_mapped, false);
 PLUGIN_API_HANDLE(data_notify_unmapped, false);
 PLUGIN_API_HANDLE(set_device_offset, false);
 PLUGIN_API_HANDLE(initialize_record_replay, false);
+PLUGIN_API_HANDLE(use_auto_zero_copy, false);
diff --git a/openmp/libomptarget/include/Shared/Requirements.h b/openmp/libomptarget/include/Shared/Requirements.h
index 19d6b8ffca495f..b16a1650f0c403 100644
--- a/openmp/libomptarget/include/Shared/Requirements.h
+++ b/openmp/libomptarget/include/Shared/Requirements.h
@@ -33,7 +33,12 @@ enum OpenMPOffloadingRequiresDirFlags : int64_t {
   /// unified_shared_memory clause.
   OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
   /// dynamic_allocators clause.
-  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010
+  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
+  /// Auto zero-copy extension:
+  /// when running on an APU, the GPU plugin may decide to
+  /// run in zero-copy even though the user did not program
+  /// their application with unified_shared_memory requirement.
+  OMPX_REQ_AUTO_ZERO_COPY = 0x020
 };
 
 class RequirementCollection {
@@ -65,6 +70,14 @@ class RequirementCollection {
       return;
     }
 
+    // Auto zero-copy is only valid when no other requirement has been set
+    // and it is computed at device initialization time, after the requirement
+    // flag has already been set to OMP_REQ_NONE.
+    if (SetFlags == OMP_REQ_NONE && NewFlags == OMPX_REQ_AUTO_ZERO_COPY) {
+      SetFlags = NewFlags;
+      return;
+    }
+
     // If multiple compilation units are present enforce
     // consistency across all of them for require clauses:
     //  - reverse_offload
diff --git a/openmp/libomptarget/include/device.h b/openmp/libomptarget/include/device.h
index e94f48891dab36..3023fba6cc64db 100644
--- a/openmp/libomptarget/include/device.h
+++ b/openmp/libomptarget/include/device.h
@@ -164,6 +164,9 @@ struct DeviceTy {
   /// Print all offload entries to stderr.
   void dumpOffloadEntries();
 
+  /// Ask the device whether the runtime should use auto zero-copy.
+  bool useAutoZeroCopy();
+
 private:
   /// Deinitialize the device (and plugin).
   void deinit();
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
index 9c59d3bf824de3..3117763e35896d 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -63,6 +63,7 @@ typedef enum {
 } hsa_amd_memory_pool_access_t;
 
 typedef enum hsa_amd_agent_info_s {
+  HSA_AMD_AGENT_INFO_CHIP_ID = 0xA000,
   HSA_AMD_AGENT_INFO_CACHELINE_SIZE = 0xA001,
   HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT = 0xA002,
   HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY = 0xA003,
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index b67642e9e1bcb3..5bf4fbca51b936 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -183,6 +183,29 @@ Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
 #endif
 }
 
+Expected<std::string> getTargetTripleAndFeatures(hsa_agent_t Agent) {
+  std::string Target;
+  auto Err = utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
+    uint32_t Length;
+    hsa_status_t Status;
+    Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length);
+    if (Status != HSA_STATUS_SUCCESS)
+      return Status;
+
+    llvm::SmallVector<char> ISAName(Length);
+    Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, ISAName.begin());
+    if (Status != HSA_STATUS_SUCCESS)
+      return Status;
+
+    llvm::StringRef TripleTarget(ISAName.begin(), Length);
+    if (TripleTarget.consume_front("amdgcn-amd-amdhsa"))
+      Target = TripleTarget.ltrim('-').rtrim('\0').str();
+    return HSA_STATUS_SUCCESS;
+  });
+  if (Err)
+    return Err;
+  return Target;
+}
 } // namespace utils
 
 /// Utility class representing generic resource references to AMDGPU resources.
@@ -1848,8 +1871,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
         OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
         OMPX_UseMultipleSdmaEngines(
             "LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false),
-        AMDGPUStreamManager(*this, Agent), AMDGPUEventManager(*this),
-        AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice) {}
+        OMPX_ApuMaps("OMPX_APU_MAPS", false), AMDGPUStreamManager(*this, Agent),
+        AMDGPUEventManager(*this), AMDGPUSignalManager(*this), Agent(Agent),
+        HostDevice(HostDevice) {}
 
   ~AMDGPUDeviceTy() {}
 
@@ -1940,6 +1964,19 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     if (auto Err = AMDGPUSignalManager.init(OMPX_InitialNumSignals))
       return Err;
 
+    // Detect if XNACK is enabled
+    auto TargeTripleAndFeaturesOrError =
+        utils::getTargetTripleAndFeatures(Agent);
+    if (!TargeTripleAndFeaturesOrError)
+      return TargeTripleAndFeaturesOrError.takeError();
+    StringRef TargeTripleAndFeatures(*TargeTripleAndFeaturesOrError);
+    if (TargeTripleAndFeatures.contains("xnack+"))
+      IsXnackEnabled = true;
+
+    // detect if device is an APU.
+    if (auto Err = checkIfAPU())
+      return Err;
+
     return Plugin::success();
   }
 
@@ -2631,6 +2668,21 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     return Plugin::success();
   }
 
+  /// Returns true if auto zero-copy the best configuration for the current
+  /// arch.
+  /// On AMDGPUs, automatic zero-copy is turned on
+  /// when running on an APU with XNACK (unified memory) support
+  /// enabled. On discrete GPUs, automatic zero-copy is triggered
+  /// if the user sets the environment variable OMPX_APU_MAPS=1
+  /// and if XNACK is enabled. The rationale is that zero-copy
+  /// is the best configuration (performance, memory footprint) on APUs,
+  /// while it is often not the best on discrete GPUs.
+  /// XNACK can be enabled with a kernel boot parameter or with
+  /// the HSA_XNACK environment variable.
+  bool useAutoZeroCopyImpl() override {
+    return ((IsAPU || OMPX_ApuMaps) && IsXnackEnabled);
+  }
+
   /// Getters and setters for stack and heap sizes.
   Error getDeviceStackSize(uint64_t &Value) override {
     Value = StackSize;
@@ -2728,6 +2780,34 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     return Err;
   }
 
+  /// Detect if current architecture is an APU.
+  Error checkIfAPU() {
+    llvm::StringRef StrGfxName(ComputeUnitKind);
+    IsAPU = llvm::StringSwitch<bool>(StrGfxName)
+                .Case("gfx940", true)
+                .Default(false);
+    if (IsAPU)
+      return Plugin::success();
+
+    bool MayBeAPU = llvm::StringSwitch<bool>(StrGfxName)
+                        .Case("gfx942", true)
+                        .Default(false);
+    if (!MayBeAPU)
+      return Plugin::success();
+    else {
+      // can be MI300A or MI300X
+      uint32_t ChipID = 0;
+      if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_CHIP_ID, ChipID))
+        return Err;
+
+      if (!(ChipID & 0x1)) {
+        IsAPU = true;
+        return Plugin::success();
+      }
+    }
+    return Plugin::success();
+  }
+
   /// Envar for controlling the number of HSA queues per device. High number of
   /// queues may degrade performance.
   UInt32Envar OMPX_NumQueues;
@@ -2764,6 +2844,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// Use ROCm 5.7 interface for multiple SDMA engines
   BoolEnvar OMPX_UseMultipleSdmaEngines;
 
+  /// Value of OMPX_APU_MAPS env var used to force
+  /// automatic zero-copy behavior on non-APU GPUs.
+  BoolEnvar OMPX_ApuMaps;
+
   /// Stream manager for AMDGPU streams.
   AMDGPUStreamManagerTy AMDGPUStreamManager;
 
@@ -2794,6 +2878,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// The current size of the stack that will be used in cases where it could
   /// not be statically determined.
   uint64_t StackSize = 16 * 1024 /* 16 KB */;
+
+  /// Is the plugin associated with an APU?
+  bool IsAPU = false;
+
+  /// True is the system is configured with XNACK-Enabled.
+  /// False otherwise.
+  bool IsXnackEnabled = false;
 };
 
 Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
@@ -3039,30 +3130,13 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
     std::optional<StringRef> Processor = ElfOrErr->tryGetCPUName();
 
     for (hsa_agent_t Agent : KernelAgents) {
-      std::string Target;
-      auto Err = utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
-        uint32_t Length;
-        hsa_status_t Status;
-        Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length);
-        if (Status != HSA_STATUS_SUCCESS)
-          return Status;
-
-        llvm::SmallVector<char> ISAName(Length);
-        Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, ISAName.begin());
-        if (Status != HSA_STATUS_SUCCESS)
-          return Status;
-
-        llvm::StringRef TripleTarget(ISAName.begin(), Length);
-        if (TripleTarget.consume_front("amdgcn-amd-amdhsa"))
-          Target = TripleTarget.ltrim('-').rtrim('\0').str();
-        return HSA_STATUS_SUCCESS;
-      });
-      if (Err)
-        return std::move(Err);
-
+      auto TargeTripleAndFeaturesOrError =
+          utils::getTargetTripleAndFeatures(Agent);
+      if (!TargeTripleAndFeaturesOrError)
+        return TargeTripleAndFeaturesOrError.takeError();
       if (!utils::isImageCompatibleWithEnv(Processor ? *Processor : "",
                                            ElfOrErr->getPlatformFlags(),
-                                           Target))
+                                           *TargeTripleAndFeaturesOrError))
         return false;
     }
     return true;
diff --git a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
index b85dc146d86d2f..abe85f43c2e726 100644
--- a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
@@ -872,6 +872,11 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
 
   virtual Error getDeviceStackSize(uint64_t &V) = 0;
 
+  /// Returns true if current plugin architecture is an APU
+  /// and unified_shared_memory was not requested by the program.
+  bool useAutoZeroCopy();
+  virtual bool useAutoZeroCopyImpl() { return false; }
+
 private:
   /// Register offload entry for global variable.
   Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage,
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
index 9490e58fc669cd..e82c2f7bef14f0 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1561,6 +1561,8 @@ Error GenericDeviceTy::syncEvent(void *EventPtr) {
   return syncEventImpl(EventPtr);
 }
 
+bool GenericDeviceTy::useAutoZeroCopy() { return useAutoZeroCopyImpl(); }
+
 Error GenericPluginTy::init() {
   auto NumDevicesOrErr = initImpl();
   if (!NumDevicesOrErr)
@@ -2073,6 +2075,14 @@ int32_t __tgt_rtl_set_device_offset(int32_t DeviceIdOffset) {
   return OFFLOAD_SUCCESS;
 }
 
+int32_t __tgt_rtl_use_auto_zero_copy(int32_t DeviceId) {
+  // Automatic zero-copy only applies to programs that did
+  // not request unified_shared_memory and are deployed on an
+  // APU with XNACK enabled.
+  if (Plugin::get().getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY)
+    return false;
+  return Plugin::get().getDevice(DeviceId).useAutoZeroCopy();
+}
 #ifdef __cplusplus
 }
 #endif
diff --git a/openmp/libomptarget/src/OpenMP/Mapping.cpp b/openmp/libomptarget/src/OpenMP/Mapping.cpp
index a5c24810e0af95..d95513d068afe8 100644
--- a/openmp/libomptarget/src/OpenMP/Mapping.cpp
+++ b/openmp/libomptarget/src/OpenMP/Mapping.cpp
@@ -252,8 +252,10 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer(
       MESSAGE("device mapping required by 'present' map type modifier does not "
               "exist for host address " DPxMOD " (%" PRId64 " bytes)",
               DPxPTR(HstPtrBegin), Size);
-  } else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY &&
-             !HasCloseModifier) {
+  } else if ((PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY &&
+              !HasCloseModifier) ||
+             (PM->getRequirements() & OMPX_REQ_AUTO_ZERO_COPY)) {
+
     // If unified shared memory is active, implicitly mapped variables that are
     // not privatized use host address. Any explicitly mapped variables also use
     // host address where correctness is not impeded. In all other cases maps
@@ -261,6 +263,10 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer(
     // In addition to the mapping rules above, the close map modifier forces the
     // mapping of the variable to the device.
     if (Size) {
+      INFO(OMP_INFOTYPE_MAPPING_CHANGED, Device.DeviceID,
+           "Return HstPtrBegin " DPxMOD " Size=%" PRId64 " for unified shared "
+           "memory\n",
+           DPxPTR((uintptr_t)HstPtrBegin), Size);
       DP("Return HstPtrBegin " DPxMOD " Size=%" PRId64 " for unified shared "
          "memory\n",
          DPxPTR((uintptr_t)HstPtrBegin), Size);
@@ -415,7 +421,8 @@ TargetPointerResultTy MappingInfoTy::getTgtPtrBegin(
          LR.TPR.getEntry()->dynRefCountToStr().c_str(), DynRefCountAction,
          LR.TPR.getEntry()->holdRefCountToStr().c_str(), HoldRefCountAction);
     LR.TPR.TargetPointer = (void *)TP;
-  } else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY) {
+  } else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY ||
+             PM->getRequirements() & OMPX_REQ_AUTO_ZERO_COPY) {
     // If the value isn't found in the mapping and unified shared memory
     // is on then it means we have stumbled upon a value which we need to
     // use directly from the host.
diff --git a/openmp/libomptarget/src/PluginManager.cpp b/openmp/libomptarget/src/PluginManager.cpp
index 83bf65f0f0de56..50059ba23b1a77 100644
--- a/openmp/libomptarget/src/PluginManager.cpp
+++ b/openmp/libomptarget/src/PluginManager.cpp
@@ -144,6 +144,9 @@ void PluginAdaptorTy::initDevices(PluginManager &PM) {
 
   int32_t NumPD = getNumberOfPluginDevices();
   ExclusiveDevicesAccessor->reserve(DeviceOffset + NumPD);
+  // Auto zero-copy is a per-device property. We need to ensure
+  // that all devices are suggesting to use it.
+  bool UseAutoZeroCopy = !(NumPD == 0);
   for (int32_t PDevI = 0, UserDevId = DeviceOffset; PDevI < NumPD; PDevI++) {
     auto Device = std::make_unique<DeviceTy>(this, UserDevId, PDevI);
     if (auto Err = Device->init()) {
@@ -151,12 +154,20 @@ void PluginAdaptorTy::initDevices(PluginManager &PM) {
          toString(std::move(Err)).c_str());
       continue;
     }
+    UseAutoZeroCopy = UseAutoZeroCopy && Device->useAutoZeroCopy();
 
     ExclusiveDevicesAccessor->push_back(std::move(Device));
     ++NumberOfUserDevices;
     ++UserDevId;
   }
 
+  // Auto Zero-Copy can only be currently triggered when the system is an
+  // homogeneous APU architecture without attached discrete GPUs.
+  // If all devices suggest to use it, change requirment flags to trigger
+  // zero-copy behavior when mapping memory.
+  if (UseAutoZeroCopy)
+    PM.addRequirements(OMPX_REQ_AUTO_ZERO_COPY);
+
   DP("Plugin adaptor " DPxMOD " has index %d, exposes %d out of %d devices!\n",
      DPxPTR(LibraryHandler.get()), DeviceOffset, NumberOfUserDevices,
      NumberOfPluginDevices);
diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
index fa8932361a51c9..55f46e6bd6374e 100644
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -339,3 +339,9 @@ void DeviceTy::dumpOffloadEntries() {
     fprintf(stderr, "  %11s: %s\n", Kind, It.second.getNameAsCStr());
   }
 }
+
+bool DeviceTy::useAutoZeroCopy() {
+  if (RTL->use_auto_zero_copy)
+    return RTL->use_auto_zero_copy(RTLDeviceID);
+  return false;
+}
diff --git a/openmp/libomptarget/test/mapping/auto_zero_copy.cpp b/openmp/libomptarget/test/mapping/auto_zero_copy.cpp
new file mode 100644
index 00000000000000..4201399d489599
--- /dev/null
+++ b/openmp/libomptarget/test/mapping/auto_zero_copy.cpp
@@ -0,0 +1,55 @@
+// clang-format off
+// RUN: %libomptarget-compilexx-generic
+// RUN: env OMPX_APU_MAPS=1 HSA_XNACK=1 LIBOMPTARGET_INFO=30 %libomptarget-run-generic 2>&1 \
+// RUN: | %fcheck-generic -check-prefix=INFO_ZERO -check-prefix=CHECK
+
+// RUN: %libomptarget-compilexx-generic
+// RUN: env HSA_XNACK=0 LIBOMPTARGET_INFO=30 %libomptarget-run-generic 2>&1 \
+// RUN: | %fcheck-generic -check-prefix=INFO_COPY -check-prefix=CHECK
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+// clang-format on
+
+#include <cstdio>
+
+int main() {
+  int n = 1024;
+
+  // test various mapping types
+  int *a = new int[n];
+  int k = 3;
+  int b[n];
+
+  for (int i = 0; i < n; i++)
+    b[i] = i;
+
+    // clang-format off
+  // INFO_ZERO: Return HstPtrBegin 0x{{.*}} Size=4096 for unified shared memory
+  // INFO_ZERO: Return HstPtrBegin 0x{{.*}} Size=4096 for unified shared memory
+
+  // INFO_COPY: Creating new map entry with HstPtrBase=0x{{.*}}, HstPtrBegin=0x{{.*}}, TgtAllocBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096,
+  // INFO_COPY: Creating new map entry with HstPtrBase=0x{{.*}}, HstPtrBegin=0x{{.*}}, TgtAllocBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096,
+  // INFO_COPY: Mapping exists with HstPtrBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096, DynRefCount=1 (update suppressed)
+  // INFO_COPY: Mapping exists with HstPtrBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096, DynRefCount=1 (update suppressed)
+// clang-format on
+#pragma omp target teams distribute parallel for map(tofrom : a[ : n])         \
+    map(to : b[ : n])
+  for (int i = 0; i < n; i++)
+    a[i] = i + b[i] + k;
+
+  int err = 0;
+  for (int i = 0; i < n; i++)
+    if (a[i] != i + b[i] + k)
+      err++;
+
+  // CHECK: PASS
+  if (err == 0)
+    printf("PASS\n");
+  return err;
+}



More information about the Openmp-commits mailing list