[llvm-branch-commits] [openmp] f628eef - [libomptarget][amdgpu] Fix latent race in load binary

Fri Dec 4 08:34:16 PST 2020

Author: Jon Chesterfield
Date: 2020-12-04T16:29:09Z
New Revision: f628eef98acd24f8eb6a52d67ee887bb18f04bca

URL: https://github.com/llvm/llvm-project/commit/f628eef98acd24f8eb6a52d67ee887bb18f04bca
DIFF: https://github.com/llvm/llvm-project/commit/f628eef98acd24f8eb6a52d67ee887bb18f04bca.diff

LOG: [libomptarget][amdgpu] Fix latent race in load binary

Added: 
    

Modified: 
    openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Removed: 
    


################################################################################
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index f22b4697f30b..ea8770e4543a 100644

--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -925,6 +925,26 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
   return res;
 }
 
+static atmi_status_t atmi_calloc(void **ret_ptr, size_t size,
+                                 atmi_mem_place_t place) {
+  uint64_t rounded = 4 * ((size + 3) / 4);
+  void *ptr;
+  atmi_status_t err = atmi_malloc(&ptr, rounded, place);
+  if (err != ATMI_STATUS_SUCCESS) {
+    return err;
+  }
+
+  hsa_status_t rc = hsa_amd_memory_fill(ptr, 0, rounded / 4);
+  if (rc != HSA_STATUS_SUCCESS) {
+    fprintf(stderr, "zero fill device_state failed with %u\n", rc);
+    atmi_free(ptr);
+    return ATMI_STATUS_ERROR;
+  }
+
+  *ret_ptr = ptr;
+  return ATMI_STATUS_SUCCESS;
+}
+
 __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
                                                  __tgt_device_image *image) {
   // This function loads the device image onto gpu[device_id] and does other
@@ -1024,7 +1044,7 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
       assert(dss.second == 0);
       void *ptr = NULL;
       atmi_status_t err =
-          atmi_malloc(&ptr, device_State_bytes, get_gpu_mem_place(device_id));
+          atmi_calloc(&ptr, device_State_bytes, get_gpu_mem_place(device_id));
       if (err != ATMI_STATUS_SUCCESS) {
         fprintf(stderr, "Failed to allocate device_state array\n");
         return NULL;
@@ -1062,13 +1082,6 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id,
       fprintf(stderr, "memcpy install of state_ptr failed\n");
       return NULL;
     }
-
-    assert((device_State_bytes & 0x3) == 0); // known >= 4 byte aligned
-    hsa_status_t rc = hsa_amd_memory_fill(ptr, 0, device_State_bytes / 4);
-    if (rc != HSA_STATUS_SUCCESS) {
-      fprintf(stderr, "zero fill device_state failed with %u\n", rc);
-      return NULL;
-    }
   }
 
   // TODO: Check with Guansong to understand the below comment more thoroughly.