[llvm] 496f8e5 - [OpenMPOpt][HideMemTransfersLatency] Split __tgt_target_data_begin_mapper into its "issue" and "wait" counterparts.

Hamilton Tobon Mosquera via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 17 18:56:38 PDT 2020


Author: Hamilton Tobon Mosquera
Date: 2020-08-17T20:56:10-05:00
New Revision: 496f8e5b369f091def93482578232da8c6e77a7a

URL: https://github.com/llvm/llvm-project/commit/496f8e5b369f091def93482578232da8c6e77a7a
DIFF: https://github.com/llvm/llvm-project/commit/496f8e5b369f091def93482578232da8c6e77a7a.diff

LOG: [OpenMPOpt][HideMemTransfersLatency] Split __tgt_target_data_begin_mapper into its "issue" and "wait" counterparts.

WIP that tries to hide the latency of runtime calls that involve host-to-device
memory transfers by splitting them into their "issue" and "wait" versions. The
"issue" starts the memory transfer asynchronously and returns a handle; the
"wait" blocks on that handle until the transfer has finished. The intent is to
move the "issue" upwards as far as possible and the "wait" downwards as far as
possible, so that independent host work can overlap with the transfer; that
code motion is not implemented yet.
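
In host-code terms, a minimal before/after sketch of the intended effect (the
entry point names come from this patch; independentHostWork() and the elided
mapper arguments are placeholders):

    // Before: the synchronous call blocks until the transfer completes.
    __tgt_target_data_begin_mapper(device_id, /* mapper args */ ...);
    independentHostWork();  // placeholder; cannot overlap with the transfer

    // After splitting and (eventually) moving the two halves apart:
    __tgt_async_info handle =
        __tgt_target_data_begin_mapper_issue(device_id, /* mapper args */ ...);
    independentHostWork();  // placeholder; runs while the transfer is in flight
    __tgt_target_data_begin_mapper_wait(device_id, handle);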

Added: 
    

Modified: 
    llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
    llvm/lib/Transforms/IPO/OpenMPOpt.cpp
    llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 3fc87dc34cd3..9ad7efff6ef5 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -198,6 +198,7 @@ __OMP_ARRAY_TYPE(KmpCriticalName, Int32, 8)
   OMP_STRUCT_TYPE(VarName, "struct." #Name, __VA_ARGS__)
 
 __OMP_STRUCT_TYPE(Ident, ident_t, Int32, Int32, Int32, Int32, Int8Ptr)
+__OMP_STRUCT_TYPE(AsyncInfo, __tgt_async_info, Int8Ptr)
 
 #undef __OMP_STRUCT_TYPE
 #undef OMP_STRUCT_TYPE
@@ -482,6 +483,9 @@ __OMP_RTL(__tgt_target_data_begin_mapper, false, Void, Int64, Int32, VoidPtrPtr,
           VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr)
 __OMP_RTL(__tgt_target_data_begin_nowait_mapper, false, Void, Int64, Int32,
           VoidPtrPtr, VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr)
+__OMP_RTL(__tgt_target_data_begin_mapper_issue, false, AsyncInfo, Int64, Int32,
+          VoidPtrPtr, VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr)
+__OMP_RTL(__tgt_target_data_begin_mapper_wait, false, Void, Int64, AsyncInfo)
 __OMP_RTL(__tgt_target_data_end_mapper, false, Void, Int64, Int32, VoidPtrPtr,
           VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr)
 __OMP_RTL(__tgt_target_data_end_nowait_mapper, false, Void, Int64, Int32,
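
For reference, the two new __OMP_RTL entries above encode roughly the following
prototypes (a sketch only: the parameter names are assumptions patterned on the
existing mapper entry points, and no runtime implementation is added by this
patch):

    // Handle type: a struct with a single i8* member, per the
    // __OMP_STRUCT_TYPE entry above; the member name is illustrative only.
    struct __tgt_async_info { void *Queue; };

    __tgt_async_info __tgt_target_data_begin_mapper_issue(
        int64_t device_id, int32_t arg_num, void **args_base, void **args,
        int64_t *arg_sizes, int64_t *arg_types, void **arg_mappers);
    void __tgt_target_data_begin_mapper_wait(int64_t device_id,
                                             __tgt_async_info handle);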

diff  --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 93f1e5392eb2..ae7bafd7d91e 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -42,6 +42,13 @@ static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
 static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
                                         cl::init(false), cl::Hidden);
 
+static cl::opt<bool> HideMemoryTransferLatency(
+    "openmp-hide-memory-transfer-latency",
+    cl::desc("[WIP] Tries to hide the latency of host to device memory"
+             " transfers"),
+    cl::Hidden, cl::init(false));
+
+
 STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
           "Number of OpenMP runtime calls deduplicated");
 STATISTIC(NumOpenMPParallelRegionsDeleted,
@@ -508,6 +515,8 @@ struct OpenMPOpt {
 
     Changed |= deduplicateRuntimeCalls();
     Changed |= deleteParallelRegions();
+    if (HideMemoryTransferLatency)
+      Changed |= hideMemTransfersLatency();
 
     return Changed;
   }
@@ -666,6 +675,63 @@ struct OpenMPOpt {
     return Changed;
   }
 
+  /// Tries to hide the latency of runtime calls that involve host to
+  /// device memory transfers by splitting them into their "issue" and "wait"
+  /// versions. The "issue" is moved upwards as much as possible. The "wait" is
+  /// moved downwards as much as possible. The "issue" issues the memory transfer
+  /// asynchronously, returning a handle. The "wait" waits on the returned
+  /// handle for the memory transfer to finish.
+  bool hideMemTransfersLatency() {
+    auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
+    bool Changed = false;
+    auto SplitMemTransfers = [&](Use &U, Function &Decl) {
+      auto *RTCall = getCallIfRegularCall(U, &RFI);
+      if (!RTCall)
+        return false;
+
+      bool WasSplit = splitTargetDataBeginRTC(RTCall);
+      Changed |= WasSplit;
+      return WasSplit;
+    };
+    RFI.foreachUse(SCC, SplitMemTransfers);
+
+    return Changed;
+  }
+
+  /// Splits \p RuntimeCall into its "issue" and "wait" counterparts.
+  bool splitTargetDataBeginRTC(CallInst *RuntimeCall) {
+    auto &IRBuilder = OMPInfoCache.OMPBuilder;
+    // Add "issue" runtime call declaration:
+    // declare %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(
+    //   i64, i32, i8**, i8**, i64*, i64*, i8**)
+    FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
+        M, OMPRTL___tgt_target_data_begin_mapper_issue);
+
+    // Replace the RuntimeCall call site with its asynchronous version.
+    SmallVector<Value *, 8> Args;
+    for (auto &Arg : RuntimeCall->args())
+      Args.push_back(Arg.get());
+
+    CallInst *IssueCallsite =
+        CallInst::Create(IssueDecl, Args, "handle", RuntimeCall);
+    RuntimeCall->eraseFromParent();
+
+    // Add "wait" runtime call declaration:
+    // declare void @__tgt_target_data_begin_mapper_wait(i64, %struct.__tgt_async_info)
+    FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
+        M, OMPRTL___tgt_target_data_begin_mapper_wait);
+
+    // Add call site to WaitDecl.
+    Value *WaitParams[2] = {
+        IssueCallsite->getArgOperand(0), // device_id.
+        IssueCallsite // returned handle.
+    };
+    CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"",
+                     IssueCallsite->getNextNode());
+
+    return true;
+  }
+
   static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
                                     bool GlobalOnly, bool &SingleChoice) {
     if (CurrentIdent == NextIdent)
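
The new splitTargetDataBeginRTC above only rewrites the IR; how the runtime
services the two calls is outside the scope of this patch. The following is a
purely hypothetical sketch of the issue/wait contract the transformation
assumes (acquireQueue, enqueueTransfers, and waitOnQueue are invented names,
not real libomptarget entry points):

    __tgt_async_info __tgt_target_data_begin_mapper_issue(
        int64_t device_id /* , ...mapper arguments... */) {
      __tgt_async_info Info;
      Info.Queue = acquireQueue(device_id);      // hypothetical helper
      enqueueTransfers(Info.Queue /* , ... */);  // start transfer, don't block
      return Info;                               // handle is returned by value
    }

    void __tgt_target_data_begin_mapper_wait(int64_t device_id,
                                             __tgt_async_info Info) {
      waitOnQueue(device_id, Info.Queue);  // block until the transfer is done
    }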

diff  --git a/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll b/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll
index daebe4b52ace..7f55ad12af2d 100644
--- a/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll
+++ b/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll
@@ -1,9 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature --scrub-attributes
-; RUN: opt -S -passes=openmpopt -aa-pipeline=basic-aa < %s | FileCheck %s
+; RUN: opt -S -passes=openmpopt -aa-pipeline=basic-aa -openmp-hide-memory-transfer-latency < %s | FileCheck %s
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 
-; FIXME: This struct should be generated after splitting at least one of the runtime calls.
-; %struct.__tgt_async_info = type { i8* }
+; CHECK: %struct.__tgt_async_info = type { i8* }
 
 %struct.ident_t = type { i32, i32, i32, i32, i8* }
 %struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 }
@@ -58,7 +57,10 @@ define dso_local double @heavyComputation1() {
 ; CHECK-NEXT:    %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0
 ; CHECK-NEXT:    %4 = bitcast [1 x i8*]* %.offload_ptrs to double**
 ; CHECK-NEXT:    store double* %a, double** %4, align 8
-; CHECK-NEXT:    call void @__tgt_target_data_begin_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null)
+
+; CHECK-NEXT:    %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 1, i8** %1, i8** %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null)
+; CHECK-NEXT:    call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle)
+
 ; CHECK-NEXT:    %5 = bitcast double* %a to i64*
 ; CHECK-NEXT:    %6 = load i64, i64* %5, align 8
 ; CHECK-NEXT:    %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs4, i64 0, i64 0
@@ -102,11 +104,6 @@ entry:
   %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0
   %4 = bitcast [1 x i8*]* %.offload_ptrs to double**
   store double* %a, double** %4, align 8
-  ; FIXME: This setup for the runtime call __tgt_target_data_begin_mapper should be
-  ;        split into its "issue" and "wait" counterpars and moved upwards
-  ;        and downwards, respectively.
-  ; %handle = call i8* @__tgt_target_data_begin_mapper_issue(...)
-  ; call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)
   call void @__tgt_target_data_begin_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null)
 
   %5 = bitcast double* %a to i64*
@@ -186,7 +183,10 @@ define dso_local i32 @heavyComputation2(double* %a, i32 %size) {
 ; CHECK-NEXT:    store i32* %size.addr, i32** %9, align 8
 ; CHECK-NEXT:    %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
 ; CHECK-NEXT:    store i64 4, i64* %10, align 8
-; CHECK-NEXT:    call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
+
+; CHECK-NEXT:    %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 2, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
+; CHECK-NEXT:    call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle)
+
 ; CHECK-NEXT:    %11 = load i32, i32* %size.addr, align 4
 ; CHECK-NEXT:    %size.casted = zext i32 %11 to i64
 ; CHECK-NEXT:    %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0
@@ -241,12 +241,6 @@ entry:
   store i32* %size.addr, i32** %9, align 8
   %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
   store i64 4, i64* %10, align 8
-  ; FIXME: This setup for the runtime call __tgt_target_data_begin_mapper should be
-  ;        split into its "issue" and "wait" counterpars and moved upwards
-  ;        and downwards, respectively. Here though, the "issue" cannot be moved upwards
-  ;        because it's not guaranteed that rand() won't modify *a.
-  ; %handle = call i8* @__tgt_target_data_begin_mapper_issue(...)
-  ; call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)
   call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
 
   %11 = load i32, i32* %size.addr, align 4
@@ -330,7 +324,10 @@ define dso_local i32 @heavyComputation3(double* noalias %a, i32 %size) {
 ; CHECK-NEXT:    store i32* %size.addr, i32** %9, align 8
 ; CHECK-NEXT:    %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
 ; CHECK-NEXT:    store i64 4, i64* %10, align 8
-; CHECK-NEXT:    call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
+
+; CHECK-NEXT:    %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 2, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
+; CHECK-NEXT:    call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle)
+
 ; CHECK-NEXT:    %11 = load i32, i32* %size.addr, align 4
 ; CHECK-NEXT:    %size.casted = zext i32 %11 to i64
 ; CHECK-NEXT:    %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0
@@ -386,11 +383,6 @@ entry:
   store i32* %size.addr, i32** %9, align 8
   %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
   store i64 4, i64* %10, align 8
-  ; FIXME: This setup for the runtime call __tgt_target_data_begin_mapper should be
-  ;        split into its "issue" and "wait" counterpars and moved upwards
-  ;        and downwards, respectively.
-  ; %handle = call i8* @__tgt_target_data_begin_mapper_issue(...)
-  ; call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)
   call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
 
   %11 = load i32, i32* %size.addr, align 4
@@ -459,7 +451,10 @@ define dso_local i32 @dataTransferOnly1(double* noalias %a, i32 %size) {
 ; CHECK-NEXT:    store double* %a, double** %4, align 8
 ; CHECK-NEXT:    %5 = getelementptr inbounds [1 x i64], [1 x i64]* %.offload_sizes, i64 0, i64 0
 ; CHECK-NEXT:    store i64 %0, i64* %5, align 8
-; CHECK-NEXT:    call void @__tgt_target_data_begin_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null)
+
+; CHECK-NEXT:    %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 1, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null)
+; CHECK-NEXT:    call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle)
+
 ; CHECK-NEXT:    %rem = urem i32 %call, %size
 ; CHECK-NEXT:    call void @__tgt_target_data_end_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null)
 ; CHECK-NEXT:    ret i32 %rem
@@ -482,13 +477,6 @@ entry:
   store double* %a, double** %4, align 8
   %5 = getelementptr inbounds [1 x i64], [1 x i64]* %.offload_sizes, i64 0, i64 0
   store i64 %0, i64* %5, align 8
-  ; FIXME: This setup for the runtime call __tgt_target_data_begin_mapper should be
-  ;        split into its "issue" and "wait" counterpars and moved upwards
-  ;        and downwards, respectively. Here though, the "wait" cannot be moved downwards
-  ;        because it is not worthit. That is, there is no store nor call to be hoisted
-  ;        over.
-  ; %handle = call i8* @__tgt_target_data_begin_mapper_issue(...)
-  ; call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle)
   call void @__tgt_target_data_begin_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null)
 
   %rem = urem i32 %call, %size
@@ -503,7 +491,5 @@ declare void @__tgt_target_data_end_mapper(i64, i32, i8**, i8**, i64*, i64*, i8*
 
 declare dso_local i32 @rand(...)
 
-; FIXME: These two function declarations must be generated after splitting the runtime function
-;        __tgt_target_data_begin_mapper.
-; declare %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64, i32, i8**, i8**, i64*, i64*, i8**)
-; declare void @__tgt_target_data_begin_mapper_wait(i64, %struct.__tgt_async_info)
+; CHECK: declare %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64, i32, i8**, i8**, i64*, i64*, i8**)
+; CHECK: declare void @__tgt_target_data_begin_mapper_wait(i64, %struct.__tgt_async_info)

