[llvm] bd2fa18 - [OpenMPOpt][HideMemTransfersLatency] Moving the 'wait' counterpart of __tgt_target_data_begin_mapper
Hamilton Tobon Mosquera via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 19 09:42:50 PDT 2020
Author: Hamilton Tobon Mosquera
Date: 2020-08-19T11:42:22-05:00
New Revision: bd2fa1819b9dc1a863a4b5a8abc540598f56c8f2
URL: https://github.com/llvm/llvm-project/commit/bd2fa1819b9dc1a863a4b5a8abc540598f56c8f2
DIFF: https://github.com/llvm/llvm-project/commit/bd2fa1819b9dc1a863a4b5a8abc540598f56c8f2.diff
LOG: [OpenMPOpt][HideMemTransfersLatency] Moving the 'wait' counterpart of __tgt_target_data_begin_mapper
canBeMovedDownwards checks if the "wait" counterpart of __tgt_target_data_begin_mapper can be moved downwards, returning a pointer to the instruction that might require/modify the data transferred, and returning null it the movement is not possible or not worth it. The function splitTargetDataBeginRTC receives that returned instruction and instead of moving the "wait" it creates it at that point.
Differential Revision: https://reviews.llvm.org/D86155
Added:
Modified:
llvm/lib/Transforms/IPO/OpenMPOpt.cpp
llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 5121574c5e6c..9bd765e22d38 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -652,7 +652,12 @@ struct OpenMPOpt {
if (!RTCall)
return false;
- bool WasSplit = splitTargetDataBeginRTC(RTCall);
+ // TODO: Check if can be moved upwards.
+ bool WasSplit = false;
+ Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
+ if (WaitMovementPoint)
+ WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);
+
Changed |= WasSplit;
return WasSplit;
};
@@ -661,8 +666,38 @@ struct OpenMPOpt {
return Changed;
}
+ /// Returns the instruction where the "wait" counterpart \p RuntimeCall can be
+ /// moved. Returns nullptr if the movement is not possible, or not worth it.
+ Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
+ // FIXME: This traverses only the BasicBlock where RuntimeCall is.
+ // Make it traverse the CFG.
+
+ Instruction *CurrentI = &RuntimeCall;
+ bool IsWorthIt = false;
+ while ((CurrentI = CurrentI->getNextNode())) {
+
+ // TODO: Once we detect the regions to be offloaded we should use the
+ // alias analysis manager to check if CurrentI may modify one of
+ // the offloaded regions.
+ if (CurrentI->mayHaveSideEffects() || CurrentI->mayReadFromMemory()) {
+ if (IsWorthIt)
+ return CurrentI;
+
+ return nullptr;
+ }
+
+ // FIXME: For now if we move it over anything without side effect
+ // is worth it.
+ IsWorthIt = true;
+ }
+
+ // Return end of BasicBlock.
+ return RuntimeCall.getParent()->getTerminator();
+ }
+
/// Splits \p RuntimeCall into its "issue" and "wait" counterparts.
- bool splitTargetDataBeginRTC(CallInst *RuntimeCall) {
+ bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
+ Instruction &WaitMovementPoint) {
auto &IRBuilder = OMPInfoCache.OMPBuilder;
// Add "issue" runtime call declaration:
// declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32,
@@ -672,12 +707,12 @@ struct OpenMPOpt {
// Change RuntimeCall call site for its asynchronous version.
SmallVector<Value *, 8> Args;
- for (auto &Arg : RuntimeCall->args())
+ for (auto &Arg : RuntimeCall.args())
Args.push_back(Arg.get());
CallInst *IssueCallsite =
- CallInst::Create(IssueDecl, Args, "handle", RuntimeCall);
- RuntimeCall->eraseFromParent();
+ CallInst::Create(IssueDecl, Args, "handle", &RuntimeCall);
+ RuntimeCall.eraseFromParent();
// Add "wait" runtime call declaration:
// declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info)
@@ -689,8 +724,7 @@ struct OpenMPOpt {
IssueCallsite->getArgOperand(0), // device_id.
IssueCallsite // returned handle.
};
- CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"",
- IssueCallsite->getNextNode());
+ CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint);
return true;
}
diff --git a/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll b/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll
index 7f55ad12af2d..4f4bf66f7b82 100644
--- a/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll
+++ b/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll
@@ -59,9 +59,11 @@ define dso_local double @heavyComputation1() {
; CHECK-NEXT: store double* %a, double** %4, align 8
; CHECK-NEXT: %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 1, i8** %1, i8** %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null)
-; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle)
; CHECK-NEXT: %5 = bitcast double* %a to i64*
+
+; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle)
+
; CHECK-NEXT: %6 = load i64, i64* %5, align 8
; CHECK-NEXT: %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs4, i64 0, i64 0
; CHECK-NEXT: %8 = bitcast [1 x i8*]* %.offload_baseptrs4 to i64*
@@ -184,8 +186,7 @@ define dso_local i32 @heavyComputation2(double* %a, i32 %size) {
; CHECK-NEXT: %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
; CHECK-NEXT: store i64 4, i64* %10, align 8
-; CHECK-NEXT: %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 2, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
-; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle)
+; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
; CHECK-NEXT: %11 = load i32, i32* %size.addr, align 4
; CHECK-NEXT: %size.casted = zext i32 %11 to i64
@@ -325,8 +326,7 @@ define dso_local i32 @heavyComputation3(double* noalias %a, i32 %size) {
; CHECK-NEXT: %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
; CHECK-NEXT: store i64 4, i64* %10, align 8
-; CHECK-NEXT: %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 2, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
-; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle)
+; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
; CHECK-NEXT: %11 = load i32, i32* %size.addr, align 4
; CHECK-NEXT: %size.casted = zext i32 %11 to i64
@@ -453,9 +453,11 @@ define dso_local i32 @dataTransferOnly1(double* noalias %a, i32 %size) {
; CHECK-NEXT: store i64 %0, i64* %5, align 8
; CHECK-NEXT: %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 1, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null)
-; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle)
; CHECK-NEXT: %rem = urem i32 %call, %size
+
+; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle)
+
; CHECK-NEXT: call void @__tgt_target_data_end_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null)
; CHECK-NEXT: ret i32 %rem
;
More information about the llvm-commits
mailing list