[Parallel_libs-commits] [parallel-libs] r281377 - [SE] Host platform implementation

Tue Sep 13 12:28:03 PDT 2016

Author: jhen
Date: Tue Sep 13 14:28:02 2016
New Revision: 281377

URL: http://llvm.org/viewvc/llvm-project?rev=281377&view=rev
Log:
[SE] Host platform implementation

Summary:
This implementation does not currently support multiple concurrent streams, and
it rejects, with an error, any kernel launch whose grid is larger than one block
or whose blocks are larger than one thread (only grid and block dimensions of
(1,1,1) are accepted). These limitations could be removed in the future by
launching new threads on the host, but that is not done in this implementation.

Reviewers: jlebar

Subscribers: beanz, mgorny, jprice, parallel_libs-commits

Differential Revision: https://reviews.llvm.org/D24473

Added:
    parallel-libs/trunk/streamexecutor/examples/HostSaxpy.cpp
      - copied, changed from r281374, parallel-libs/trunk/streamexecutor/examples/CUDASaxpy.cpp
    parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/
    parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/
    parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/HostPlatform.h
    parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h
Modified:
    parallel-libs/trunk/streamexecutor/examples/CMakeLists.txt
    parallel-libs/trunk/streamexecutor/examples/CUDASaxpy.cpp
    parallel-libs/trunk/streamexecutor/include/streamexecutor/KernelSpec.h
    parallel-libs/trunk/streamexecutor/include/streamexecutor/PlatformDevice.h
    parallel-libs/trunk/streamexecutor/lib/PlatformManager.cpp

Modified: parallel-libs/trunk/streamexecutor/examples/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/parallel-libs/trunk/streamexecutor/examples/CMakeLists.txt?rev=281377&r1=281376&r2=281377&view=diff
==============================================================================

--- parallel-libs/trunk/streamexecutor/examples/CMakeLists.txt (original)
+++ parallel-libs/trunk/streamexecutor/examples/CMakeLists.txt Tue Sep 13 14:28:02 2016
@@ -1,2 +1,5 @@
 add_executable(cuda_saxpy_example CUDASaxpy.cpp)
 target_link_libraries(cuda_saxpy_example streamexecutor)
+
+add_executable(host_saxpy_example HostSaxpy.cpp)
+target_link_libraries(host_saxpy_example streamexecutor)

Modified: parallel-libs/trunk/streamexecutor/examples/CUDASaxpy.cpp
URL: http://llvm.org/viewvc/llvm-project/parallel-libs/trunk/streamexecutor/examples/CUDASaxpy.cpp?rev=281377&r1=281376&r2=281377&view=diff
==============================================================================
--- parallel-libs/trunk/streamexecutor/examples/CUDASaxpy.cpp (original)
+++ parallel-libs/trunk/streamexecutor/examples/CUDASaxpy.cpp Tue Sep 13 14:28:02 2016
@@ -17,7 +17,6 @@
 
 #include <algorithm>
 #include <cassert>
-#include <cstdio>
 #include <cstdlib>
 #include <vector>
 

Copied: parallel-libs/trunk/streamexecutor/examples/HostSaxpy.cpp (from r281374, parallel-libs/trunk/streamexecutor/examples/CUDASaxpy.cpp)
URL: http://llvm.org/viewvc/llvm-project/parallel-libs/trunk/streamexecutor/examples/HostSaxpy.cpp?p2=parallel-libs/trunk/streamexecutor/examples/HostSaxpy.cpp&p1=parallel-libs/trunk/streamexecutor/examples/CUDASaxpy.cpp&r1=281374&r2=281377&rev=281377&view=diff
==============================================================================
--- parallel-libs/trunk/streamexecutor/examples/CUDASaxpy.cpp (original)
+++ parallel-libs/trunk/streamexecutor/examples/HostSaxpy.cpp Tue Sep 13 14:28:02 2016
@@ -1,4 +1,4 @@
-//===-- CUDASaxpy.cpp - Example of CUDA saxpy with StreamExecutor API -----===//
+//===-- HostSaxpy.cpp - Example of host saxpy with StreamExecutor API -----===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -9,91 +9,44 @@
 ///
 /// \file
 /// This file contains example code demonstrating the usage of the
-/// StreamExecutor API. Snippets of this file will be included as code examples
-/// in documentation. Taking these examples from a real source file guarantees
-/// that the examples will always compile.
+/// StreamExecutor API for a host platform.
 ///
 //===----------------------------------------------------------------------===//
 
 #include <algorithm>
 #include <cassert>
 #include <cstdio>
-#include <cstdlib>
 #include <vector>
 
 #include "streamexecutor/StreamExecutor.h"
 
-/// [Example saxpy compiler-generated]
-// Code in this namespace is generated by the compiler (e.g. clang).
-//
-// The name of this namespace may depend on the compiler that generated it, so
-// this is just an example name.
-namespace __compilergen {
+void Saxpy(float A, float *X, float *Y, size_t N) {
+  for (size_t I = 0; I < N; ++I)
+    X[I] = A * X[I] + Y[I];
+}
 
-// Specialization of the streamexecutor::Kernel template class for the parameter
-// types of the saxpy(float A, float *X, float *Y) kernel.
+namespace __compilergen {
 using SaxpyKernel =
     streamexecutor::Kernel<float, streamexecutor::GlobalDeviceMemory<float>,
-                           streamexecutor::GlobalDeviceMemory<float>>;
+                           streamexecutor::GlobalDeviceMemory<float>, size_t>;
 
-// A string containing the PTX code generated by the device compiler for the
-// saxpy kernel. String contents not shown here.
-extern const char *SaxpyPTX;
+// Wrapper function unpacks the argument-address array into Saxpy's arguments.
+void SaxpyWrapper(const void *const *ArgumentAddresses) {
+  Saxpy(*static_cast<const float *>(ArgumentAddresses[0]),
+        static_cast<float *>(const_cast<void *>(ArgumentAddresses[1])),
+        static_cast<float *>(const_cast<void *>(ArgumentAddresses[2])),
+        *static_cast<const size_t *>(ArgumentAddresses[3]));
+}
 
-// A global instance of a loader spec that knows how to load the code in the
-// SaxpyPTX string.
+// The wrapper function is what gets registered.
 static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() {
   streamexecutor::MultiKernelLoaderSpec Spec;
-  Spec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}});
+  Spec.addHostFunction("Saxpy", SaxpyWrapper);
   return Spec;
 }();
-
 } // namespace __compilergen
-/// [Example saxpy compiler-generated]
-
-/// [Example saxpy host PTX]
-// The PTX text for a saxpy kernel.
-const char *__compilergen::SaxpyPTX = R"(
-  .version 4.3
-  .target sm_20
-  .address_size 64
-
-  .visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) {
-    .reg .f32 %AValue;
-    .reg .f32 %XValue;
-    .reg .f32 %YValue;
-    .reg .f32 %Result;
-
-    .reg .b64 %XBaseAddrGeneric;
-    .reg .b64 %YBaseAddrGeneric;
-    .reg .b64 %XBaseAddrGlobal;
-    .reg .b64 %YBaseAddrGlobal;
-    .reg .b64 %XAddr;
-    .reg .b64 %YAddr;
-    .reg .b64 %ThreadByteOffset;
-
-    .reg .b32 %TID;
-
-    ld.param.f32 %AValue, [A];
-    ld.param.u64 %XBaseAddrGeneric, [X];
-    ld.param.u64 %YBaseAddrGeneric, [Y];
-    cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric;
-    cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric;
-    mov.u32 %TID, %tid.x;
-    mul.wide.u32 %ThreadByteOffset, %TID, 4;
-    add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal;
-    add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal;
-    ld.global.f32 %XValue, [%XAddr];
-    ld.global.f32 %YValue, [%YAddr];
-    fma.rn.f32 %Result, %AValue, %XValue, %YValue;
-    st.global.f32 [%XAddr], %Result;
-    ret;
-  }
-)";
-/// [Example saxpy host PTX]
 
 int main() {
-  /// [Example saxpy host main]
   namespace se = ::streamexecutor;
   namespace cg = ::__compilergen;
 
@@ -105,7 +58,7 @@ int main() {
 
   // Get a device object.
   se::Platform *Platform =
-      getOrDie(se::PlatformManager::getPlatformByName("CUDA"));
+      getOrDie(se::PlatformManager::getPlatformByName("host"));
   if (Platform->getDeviceCount() == 0) {
     return EXIT_FAILURE;
   }
@@ -130,7 +83,7 @@ int main() {
   se::Stream Stream = getOrDie(Device->createStream());
   Stream.thenCopyH2D(RegisteredX, X)
       .thenCopyH2D(RegisteredY, Y)
-      .thenLaunch(ArraySize, 1, Kernel, A, X, Y)
+      .thenLaunch(1, 1, Kernel, A, X, Y, ArraySize)
       .thenCopyD2H(X, RegisteredX);
   // Wait for the stream to complete.
   se::dieIfError(Stream.blockHostUntilDone());
@@ -138,5 +91,4 @@ int main() {
   // Process output data in HostX.
   std::vector<float> ExpectedX = {4, 47, 90, 133};
   assert(std::equal(ExpectedX.begin(), ExpectedX.end(), HostX.begin()));
-  /// [Example saxpy host main]
 }

Modified: parallel-libs/trunk/streamexecutor/include/streamexecutor/KernelSpec.h
URL: http://llvm.org/viewvc/llvm-project/parallel-libs/trunk/streamexecutor/include/streamexecutor/KernelSpec.h?rev=281377&r1=281376&r2=281377&view=diff
==============================================================================
--- parallel-libs/trunk/streamexecutor/include/streamexecutor/KernelSpec.h (original)
+++ parallel-libs/trunk/streamexecutor/include/streamexecutor/KernelSpec.h Tue Sep 13 14:28:02 2016
@@ -65,11 +65,13 @@
 #define STREAMEXECUTOR_KERNELSPEC_H
 
 #include <cassert>
+#include <functional>
 #include <map>
 #include <memory>
 #include <string>
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
 
 namespace streamexecutor {
@@ -199,6 +201,9 @@ private:
 /// than doing it by hand.
 class MultiKernelLoaderSpec {
 public:
+  /// Type of functions used as host platform kernels.
+  using HostFunctionTy = std::function<void(const void **)>;
+
   std::string getKernelName() const {
     if (TheKernelName)
       return *TheKernelName;
@@ -215,6 +220,7 @@ public:
   bool hasOpenCLTextInMemory() const {
     return TheOpenCLTextInMemorySpec != nullptr;
   }
+  bool hasHostFunction() const { return HostFunction != nullptr; }
 
   // Accessors for platform variant kernel load specifications.
   //
@@ -233,6 +239,11 @@ public:
     return *TheOpenCLTextInMemorySpec;
   }
 
+  const HostFunctionTy &getHostFunction() const {
+    assert(hasHostFunction() && "getting spec that is not present");
+    return *HostFunction;
+  }
+
   // Builder-pattern-like methods for use in initializing a
   // MultiKernelLoaderSpec.
   //
@@ -256,6 +267,12 @@ public:
   MultiKernelLoaderSpec &addOpenCLTextInMemory(llvm::StringRef KernelName,
                                                const char *OpenCLText);
 
+  MultiKernelLoaderSpec &addHostFunction(llvm::StringRef KernelName,
+                                         HostFunctionTy Function) {
+    HostFunction = llvm::make_unique<HostFunctionTy>(std::move(Function));
+    return *this;
+  }
+
 private:
   void setKernelName(llvm::StringRef KernelName);
 
@@ -263,6 +280,7 @@ private:
   std::unique_ptr<CUDAPTXInMemorySpec> TheCUDAPTXInMemorySpec;
   std::unique_ptr<CUDAFatbinInMemorySpec> TheCUDAFatbinInMemorySpec;
   std::unique_ptr<OpenCLTextInMemorySpec> TheOpenCLTextInMemorySpec;
+  std::unique_ptr<HostFunctionTy> HostFunction;
 };
 
 } // namespace streamexecutor

Modified: parallel-libs/trunk/streamexecutor/include/streamexecutor/PlatformDevice.h
URL: http://llvm.org/viewvc/llvm-project/parallel-libs/trunk/streamexecutor/include/streamexecutor/PlatformDevice.h?rev=281377&r1=281376&r2=281377&view=diff
==============================================================================
--- parallel-libs/trunk/streamexecutor/include/streamexecutor/PlatformDevice.h (original)
+++ parallel-libs/trunk/streamexecutor/include/streamexecutor/PlatformDevice.h Tue Sep 13 14:28:02 2016
@@ -149,10 +149,10 @@ public:
   /// Similar to synchronousCopyD2H(const void *, size_t, void
   /// *, size_t, size_t), but copies memory from one location in device memory
   /// to another rather than from device to host.
-  virtual Error synchronousCopyD2D(const void *DeviceDstHandle,
-                                   size_t DstByteOffset,
-                                   const void *DeviceSrcHandle,
-                                   size_t SrcByteOffset, size_t ByteCount) {
+  virtual Error synchronousCopyD2D(const void *DeviceSrcHandle,
+                                   size_t SrcByteOffset,
+                                   const void *DeviceDstHandle,
+                                   size_t DstByteOffset, size_t ByteCount) {
     return make_error("synchronousCopyD2D not implemented for platform " +
                       getName());
   }

Added: parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/HostPlatform.h
URL: http://llvm.org/viewvc/llvm-project/parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/HostPlatform.h?rev=281377&view=auto
==============================================================================
--- parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/HostPlatform.h (added)
+++ parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/HostPlatform.h Tue Sep 13 14:28:02 2016
@@ -0,0 +1,56 @@
+//===-- HostPlatform.h - Host platform subclass -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Declaration of the HostPlatform class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H
+#define STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H
+
+#include "HostPlatformDevice.h"
+#include "streamexecutor/Device.h"
+#include "streamexecutor/Platform.h"
+
+#include "llvm/Support/Mutex.h"
+
+namespace streamexecutor {
+namespace host {
+
+/// Platform that performs work on the host rather than offloading to an
+/// accelerator.
+class HostPlatform : public Platform {
+public:
+  size_t getDeviceCount() const override { return 1; }
+
+  Expected<Device *> getDevice(size_t DeviceIndex) override {
+    if (DeviceIndex != 0) {
+      return make_error(
+          "Requested device index " + llvm::Twine(DeviceIndex) +
+          " from host platform which only supports device index 0");
+    }
+    llvm::sys::ScopedLock Lock(Mutex);
+    if (!TheDevice) {
+      ThePlatformDevice = llvm::make_unique<HostPlatformDevice>();
+      TheDevice = llvm::make_unique<Device>(ThePlatformDevice.get());
+    }
+    return TheDevice.get();
+  }
+
+private:
+  llvm::sys::Mutex Mutex;
+  std::unique_ptr<HostPlatformDevice> ThePlatformDevice;
+  std::unique_ptr<Device> TheDevice;
+};
+
+} // namespace host
+} // namespace streamexecutor
+
+#endif // STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORM_H

Added: parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h
URL: http://llvm.org/viewvc/llvm-project/parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h?rev=281377&view=auto
==============================================================================
--- parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h (added)
+++ parallel-libs/trunk/streamexecutor/include/streamexecutor/platforms/host/HostPlatformDevice.h Tue Sep 13 14:28:02 2016
@@ -0,0 +1,151 @@
+//===-- HostPlatformDevice.h - HostPlatformDevice class ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Declaration of the HostPlatformDevice class.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H
+#define STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H
+
+#include <cstdlib>
+#include <cstring>
+
+#include "streamexecutor/PlatformDevice.h"
+
+namespace streamexecutor {
+namespace host {
+
+/// A concrete PlatformDevice subclass that performs its work on the host rather
+/// than offloading to an accelerator.
+class HostPlatformDevice : public PlatformDevice {
+public:
+  std::string getName() const override { return "host"; }
+
+  Expected<const void *>
+  createKernel(const MultiKernelLoaderSpec &Spec) override {
+    if (!Spec.hasHostFunction()) {
+      return make_error("no host implementation available for kernel " +
+                        Spec.getKernelName());
+    }
+    return static_cast<const void *>(&Spec.getHostFunction());
+  }
+
+  Error destroyKernel(const void *Handle) override { return Error::success(); }
+
+  Expected<const void *> createStream() override {
+    // TODO(jhen): Do something with threads to allow multiple streams.
+    return this;
+  }
+
+  Error destroyStream(const void *Handle) override { return Error::success(); }
+
+  Error launch(const void *PlatformStreamHandle, BlockDimensions BlockSize,
+               GridDimensions GridSize, const void *PKernelHandle,
+               const PackedKernelArgumentArrayBase &ArgumentArray) override {
+    // TODO(jhen): Can we do something with BlockSize and GridSize?
+    if (!(BlockSize.X == 1 && BlockSize.Y == 1 && BlockSize.Z == 1)) {
+      return make_error(
+          "Block dimensions were (" + llvm::Twine(BlockSize.X) + "," +
+          llvm::Twine(BlockSize.Y) + "," + llvm::Twine(BlockSize.Z) +
+          "), but only size (1,1,1) is permitted for this platform");
+    }
+    if (!(GridSize.X == 1 && GridSize.Y == 1 && GridSize.Z == 1)) {
+      return make_error(
+          "Grid dimensions were (" + llvm::Twine(GridSize.X) + "," +
+          llvm::Twine(GridSize.Y) + "," + llvm::Twine(GridSize.Z) +
+          "), but only size (1,1,1) is permitted for this platform");
+    }
+
+    (*static_cast<const std::function<void(const void *const *)> *>(
+        PKernelHandle))(ArgumentArray.getAddresses());
+    return Error::success();
+  }
+
+  Error copyD2H(const void *PlatformStreamHandle, const void *DeviceSrcHandle,
+                size_t SrcByteOffset, void *HostDst, size_t DstByteOffset,
+                size_t ByteCount) override {
+    std::memcpy(offset(HostDst, DstByteOffset),
+                offset(DeviceSrcHandle, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+  Error copyH2D(const void *PlatformStreamHandle, const void *HostSrc,
+                size_t SrcByteOffset, const void *DeviceDstHandle,
+                size_t DstByteOffset, size_t ByteCount) override {
+    std::memcpy(offset(DeviceDstHandle, DstByteOffset),
+                offset(HostSrc, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+  Error copyD2D(const void *PlatformStreamHandle, const void *DeviceSrcHandle,
+                size_t SrcByteOffset, const void *DeviceDstHandle,
+                size_t DstByteOffset, size_t ByteCount) override {
+    std::memcpy(offset(DeviceDstHandle, DstByteOffset),
+                offset(DeviceSrcHandle, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+  Error blockHostUntilDone(const void *PlatformStreamHandle) override {
+    // All host operations are synchronous anyway.
+    return Error::success();
+  }
+
+  Expected<void *> allocateDeviceMemory(size_t ByteCount) override {
+    return std::malloc(ByteCount);
+  }
+
+  Error freeDeviceMemory(const void *Handle) override {
+    std::free(const_cast<void *>(Handle));
+    return Error::success();
+  }
+
+  Error registerHostMemory(void *Memory, size_t ByteCount) override {
+    return Error::success();
+  }
+
+  Error unregisterHostMemory(const void *Memory) override {
+    return Error::success();
+  }
+
+  Error synchronousCopyD2H(const void *DeviceSrcHandle, size_t SrcByteOffset,
+                           void *HostDst, size_t DstByteOffset,
+                           size_t ByteCount) override {
+    std::memcpy(offset(HostDst, DstByteOffset),
+                offset(DeviceSrcHandle, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+  Error synchronousCopyH2D(const void *HostSrc, size_t SrcByteOffset,
+                           const void *DeviceDstHandle, size_t DstByteOffset,
+                           size_t ByteCount) override {
+    std::memcpy(offset(DeviceDstHandle, DstByteOffset),
+                offset(HostSrc, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+  Error synchronousCopyD2D(const void *DeviceSrcHandle, size_t SrcByteOffset,
+                           const void *DeviceDstHandle, size_t DstByteOffset,
+                           size_t ByteCount) override {
+    std::memcpy(offset(DeviceDstHandle, DstByteOffset),
+                offset(DeviceSrcHandle, SrcByteOffset), ByteCount);
+    return Error::success();
+  }
+
+private:
+  static void *offset(const void *Base, size_t Offset) {
+    return const_cast<char *>(static_cast<const char *>(Base) + Offset);
+  }
+};
+
+} // namespace host
+} // namespace streamexecutor
+
+#endif // STREAMEXECUTOR_PLATFORMS_HOST_HOSTPLATFORMDEVICE_H

Modified: parallel-libs/trunk/streamexecutor/lib/PlatformManager.cpp
URL: http://llvm.org/viewvc/llvm-project/parallel-libs/trunk/streamexecutor/lib/PlatformManager.cpp?rev=281377&r1=281376&r2=281377&view=diff
==============================================================================
--- parallel-libs/trunk/streamexecutor/lib/PlatformManager.cpp (original)
+++ parallel-libs/trunk/streamexecutor/lib/PlatformManager.cpp Tue Sep 13 14:28:02 2016
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "streamexecutor/PlatformManager.h"
+#include "streamexecutor/platforms/host/HostPlatform.h"
 
 namespace streamexecutor {
 
@@ -23,6 +24,8 @@ PlatformManager::PlatformManager() {
   //    appropriate code to include here.
   //  * Use static initialization tricks to have platform libraries register
   //    themselves when they are loaded.
+
+  PlatformsByName.emplace("host", llvm::make_unique<host::HostPlatform>());
 }
 
 Expected<Platform *> PlatformManager::getPlatformByName(llvm::StringRef Name) {