[llvm] 8931add - [OpenMPOpt][HideMemTransfersLatency] Get values stored in offload arrays

Mon Aug 31 13:33:27 PDT 2020

Author: Hamilton Tobon Mosquera
Date: 2020-08-31T15:33:05-05:00
New Revision: 8931add6170508704007f1a410993e6aec879c01

URL: https://github.com/llvm/llvm-project/commit/8931add6170508704007f1a410993e6aec879c01
DIFF: https://github.com/llvm/llvm-project/commit/8931add6170508704007f1a410993e6aec879c01.diff

LOG: [OpenMPOpt][HideMemTransfersLatency] Get values stored in offload arrays

getValuesInOffloadArrays goes through the offload arrays in __tgt_target_data_begin_mapper getting the values stored in them before the call is issued.

call void @__tgt_target_data_begin_mapper(arg0, arg1,
    i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes,
...)

Diferential Revision: https://reviews.llvm.org/D86300

Added: 
    llvm/test/Transforms/OpenMP/values_in_offload_arrays.ll

Modified: 
    llvm/lib/Transforms/IPO/OpenMPOpt.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index d596508276374..bd4e2f769af09 100644

--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -26,6 +26,7 @@
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/Attributor.h"
 #include "llvm/Transforms/Utils/CallGraphUpdater.h"
+#include "llvm/Analysis/ValueTracking.h"
 
 using namespace llvm;
 using namespace omp;
@@ -379,6 +380,87 @@ struct OMPInformationCache : public InformationCache {
   SmallPtrSetImpl<Kernel> &Kernels;
 };
 
+/// Used to map the values physically (in the IR) stored in an offload
+/// array, to a vector in memory.
+struct OffloadArray {
+  /// Physical array (in the IR).
+  AllocaInst *Array = nullptr;
+  /// Mapped values.
+  SmallVector<Value *, 8> StoredValues;
+  /// Last stores made in the offload array.
+  SmallVector<StoreInst *, 8> LastAccesses;
+
+  OffloadArray() = default;
+
+  /// Initializes the OffloadArray with the values stored in \p Array before
+  /// instruction \p Before is reached. Returns false if the initialization
+  /// fails.
+  /// This MUST be used immediately after the construction of the object.
+  bool initialize(AllocaInst &Array, Instruction &Before) {
+    if (!Array.getAllocatedType()->isArrayTy())
+      return false;
+
+    if (!getValues(Array, Before))
+      return false;
+
+    this->Array = &Array;
+    return true;
+  }
+
+private:
+  /// Traverses the BasicBlock where \p Array is, collecting the stores made to
+  /// \p Array, leaving StoredValues with the values stored before the
+  /// instruction \p Before is reached.
+  bool getValues(AllocaInst &Array, Instruction &Before) {
+    // Initialize container.
+    const uint64_t NumValues =
+        Array.getAllocatedType()->getArrayNumElements();
+    StoredValues.assign(NumValues, nullptr);
+    LastAccesses.assign(NumValues, nullptr);
+
+    // TODO: This assumes the instruction \p Before is in the same
+    //  BasicBlock as Array. Make it general, for any control flow graph.
+    BasicBlock *BB = Array.getParent();
+    if (BB != Before.getParent())
+      return false;
+
+    const DataLayout &DL = Array.getModule()->getDataLayout();
+    const unsigned int PointerSize = DL.getPointerSize();
+
+    for (Instruction &I : *BB) {
+      if (&I == &Before)
+        break;
+
+      if (!isa<StoreInst>(&I))
+        continue;
+
+      auto *S = cast<StoreInst>(&I);
+      int64_t Offset = -1;
+      auto *Dst = GetPointerBaseWithConstantOffset(S->getPointerOperand(),
+                                                   Offset, DL);
+      if (Dst == &Array) {
+        int64_t Idx = Offset / PointerSize;
+        StoredValues[Idx] = getUnderlyingObject(S->getValueOperand());
+        LastAccesses[Idx] = S;
+      }
+    }
+
+    return isFilled();
+  }
+
+  /// Returns true if all values in StoredValues and
+  /// LastAccesses are not nullptrs.
+  bool isFilled() {
+    const unsigned NumValues = StoredValues.size();
+    for (unsigned I = 0; I < NumValues; ++I) {
+      if (!StoredValues[I] || !LastAccesses[I])
+        return false;
+    }
+
+    return true;
+  }
+};
+
 struct OpenMPOpt {
 
   using OptimizationRemarkGetter =
@@ -589,6 +671,12 @@ struct OpenMPOpt {
       if (!RTCall)
         return false;
 
+      OffloadArray OffloadArrays[3];
+      if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
+        return false;
+
+      LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));
+
       // TODO: Check if can be moved upwards.
       bool WasSplit = false;
       Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
@@ -603,6 +691,93 @@ struct OpenMPOpt {
     return Changed;
   }
 
+  /// Maps the values stored in the offload arrays passed as arguments to
+  /// \p RuntimeCall into the offload arrays in \p OAs.
+  bool getValuesInOffloadArrays(CallInst &RuntimeCall,
+                                MutableArrayRef<OffloadArray> OAs) {
+    assert(OAs.size() == 3 && "Need space for three offload arrays!");
+
+    // A runtime call that involves memory offloading looks something like:
+    // call void @__tgt_target_data_begin_mapper(arg0, arg1,
+    //   i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes,
+    // ...)
+    // So, the idea is to access the allocas that allocate space for these
+    // offload arrays, offload_baseptrs, offload_ptrs, offload_sizes.
+    // Therefore:
+    // i8** %offload_baseptrs.
+    const unsigned BasePtrsArgNum = 2;
+    Value *BasePtrsArg = RuntimeCall.getArgOperand(BasePtrsArgNum);
+    // i8** %offload_ptrs.
+    const unsigned PtrsArgNum = 3;
+    Value *PtrsArg = RuntimeCall.getArgOperand(PtrsArgNum);
+    // i8** %offload_sizes.
+    const unsigned SizesArgNum = 4;
+    Value *SizesArg = RuntimeCall.getArgOperand(SizesArgNum);
+
+    // Get values stored in **offload_baseptrs.
+    auto *V = getUnderlyingObject(BasePtrsArg);
+    if (!isa<AllocaInst>(V))
+      return false;
+    auto *BasePtrsArray = cast<AllocaInst>(V);
+    if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall))
+      return false;
+
+    // Get values stored in **offload_baseptrs.
+    V = getUnderlyingObject(PtrsArg);
+    if (!isa<AllocaInst>(V))
+      return false;
+    auto *PtrsArray = cast<AllocaInst>(V);
+    if (!OAs[1].initialize(*PtrsArray, RuntimeCall))
+      return false;
+
+    // Get values stored in **offload_sizes.
+    V = getUnderlyingObject(SizesArg);
+    // If it's a [constant] global array don't analyze it.
+    if (isa<GlobalValue>(V))
+      return isa<Constant>(V);
+    if (!isa<AllocaInst>(V))
+      return false;
+
+    auto *SizesArray = cast<AllocaInst>(V);
+    if (!OAs[2].initialize(*SizesArray, RuntimeCall))
+      return false;
+
+    return true;
+  }
+
+  /// Prints the values in the OffloadArrays \p OAs using LLVM_DEBUG.
+  /// For now this is a way to test that the function getValuesInOffloadArrays
+  /// is working properly.
+  /// TODO: Move this to a unittest when unittests are available for OpenMPOpt.
+  void dumpValuesInOffloadArrays(ArrayRef<OffloadArray> OAs) {
+    assert(OAs.size() == 3 && "There are three offload arrays to debug!");
+
+    LLVM_DEBUG(dbgs() << TAG << " Successfully got offload values:\n");
+    std::string ValuesStr;
+    raw_string_ostream Printer(ValuesStr);
+    std::string Separator = " --- ";
+
+    for (auto *BP : OAs[0].StoredValues) {
+      BP->print(Printer);
+      Printer << Separator;
+    }
+    LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << Printer.str() << "\n");
+    ValuesStr.clear();
+
+    for (auto *P : OAs[1].StoredValues) {
+      P->print(Printer);
+      Printer << Separator;
+    }
+    LLVM_DEBUG(dbgs() << "\t\toffload_ptrs: " << Printer.str() << "\n");
+    ValuesStr.clear();
+
+    for (auto *S : OAs[2].StoredValues) {
+      S->print(Printer);
+      Printer << Separator;
+    }
+    LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << Printer.str() << "\n");
+  }
+
   /// Returns the instruction where the "wait" counterpart \p RuntimeCall can be
   /// moved. Returns nullptr if the movement is not possible, or not worth it.
   Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {

diff  --git a/llvm/test/Transforms/OpenMP/values_in_offload_arrays.ll b/llvm/test/Transforms/OpenMP/values_in_offload_arrays.ll
new file mode 100644
index 0000000000000..96195dbecc006
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/values_in_offload_arrays.ll
@@ -0,0 +1,67 @@
+; RUN: opt -S -passes=openmpopt -aa-pipeline=basic-aa -openmp-hide-memory-transfer-latency -debug-only=openmp-opt < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+ at .__omp_offloading_heavyComputation.region_id = weak constant i8 0
+ at .offload_maptypes. = private unnamed_addr constant [2 x i64] [i64 35, i64 35]
+
+; CHECK-LABEL: {{[^@]+}}Successfully got offload values:
+; CHECK-NEXT: offload_baseptrs: double* %a ---   %size.addr = alloca i32, align 4 ---
+; CHECK-NEXT: offload_ptrs: double* %a ---   %size.addr = alloca i32, align 4 ---
+; CHECK-NEXT: offload_sizes:   %0 = shl nuw nsw i64 %conv, 3 --- i64 4 ---
+
+;int heavyComputation(double* a, unsigned size) {
+;  int random = rand() % 7;
+;
+;  //#pragma omp target data map(a[0:size], size)
+;  void* args[2];
+;  args[0] = &a;
+;  args[1] = &size;
+;  __tgt_target_data_begin(..., args, ...)
+;
+;  #pragma omp target teams
+;  for (int i = 0; i < size; ++i) {
+;    a[i] = ++a[i] * 3.141624;
+;  }
+;
+;  return random;
+;}
+define dso_local i32 @heavyComputation(double* %a, i32 %size) {
+entry:
+  %size.addr = alloca i32, align 4
+  %.offload_baseptrs = alloca [2 x i8*], align 8
+  %.offload_ptrs = alloca [2 x i8*], align 8
+  %.offload_sizes = alloca [2 x i64], align 8
+
+  store i32 %size, i32* %size.addr, align 4
+  %call = tail call i32 (...) @rand()
+
+  %conv = zext i32 %size to i64
+  %0 = shl nuw nsw i64 %conv, 3
+  %1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
+  %2 = bitcast [2 x i8*]* %.offload_baseptrs to double**
+  store double* %a, double** %2, align 8
+  %3 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 0
+  %4 = bitcast [2 x i8*]* %.offload_ptrs to double**
+  store double* %a, double** %4, align 8
+  %5 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 0
+  store i64 %0, i64* %5, align 8
+  %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 1
+  %7 = bitcast i8** %6 to i32**
+  store i32* %size.addr, i32** %7, align 8
+  %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 1
+  %9 = bitcast i8** %8 to i32**
+  store i32* %size.addr, i32** %9, align 8
+  %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
+  store i64 4, i64* %10, align 8
+  call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes., i64 0, i64 0), i8** null)
+  %rem = srem i32 %call, 7
+  call void @__tgt_target_data_end_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes., i64 0, i64 0), i8** null)
+  ret i32 %rem
+}
+
+declare void @__tgt_target_data_begin_mapper(i64, i32, i8**, i8**, i64*, i64*, i8**)
+declare void @__tgt_target_data_end_mapper(i64, i32, i8**, i8**, i64*, i64*, i8**)
+
+declare dso_local i32 @rand(...)