[Parallel_libs-commits] [parallel-libs] r281654 - [SE] Fix config bug with CUDA tests

Thu Sep 15 13:26:29 PDT 2016

Author: jhen
Date: Thu Sep 15 15:26:28 2016
New Revision: 281654

URL: http://llvm.org/viewvc/llvm-project?rev=281654&view=rev
Log:
[SE] Fix config bug with CUDA tests

Summary:
It turns out CMake errors out if a processed directory contains source
files that are not used. This was causing an error with the CUDATest.cpp
file when configuring StreamExecutor with the CUDA platform disabled.

Moving CUDATest.cpp to its own directory fixes this problem.

Reviewers: jlebar, jprice

Subscribers: beanz, mgorny, jlebar, parallel_libs-commits

Differential Revision: https://reviews.llvm.org/D24618

Added:
    parallel-libs/trunk/streamexecutor/unittests/CoreTests/cuda/
    parallel-libs/trunk/streamexecutor/unittests/CoreTests/cuda/CUDATest.cpp
      - copied, changed from r281635, parallel-libs/trunk/streamexecutor/unittests/CoreTests/CUDATest.cpp
Removed:
    parallel-libs/trunk/streamexecutor/unittests/CoreTests/CUDATest.cpp
Modified:
    parallel-libs/trunk/streamexecutor/unittests/CoreTests/CMakeLists.txt

Modified: parallel-libs/trunk/streamexecutor/unittests/CoreTests/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/parallel-libs/trunk/streamexecutor/unittests/CoreTests/CMakeLists.txt?rev=281654&r1=281653&r2=281654&view=diff
==============================================================================

--- parallel-libs/trunk/streamexecutor/unittests/CoreTests/CMakeLists.txt (original)
+++ parallel-libs/trunk/streamexecutor/unittests/CoreTests/CMakeLists.txt Thu Sep 15 15:26:28 2016
@@ -1,5 +1,5 @@
 if(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM)
-    set(CUDA_TEST_SOURCES CUDATest.cpp)
+    set(CUDA_TEST_SOURCES cuda/CUDATest.cpp)
 endif()
 
 add_se_unittest(

Removed: parallel-libs/trunk/streamexecutor/unittests/CoreTests/CUDATest.cpp
URL: http://llvm.org/viewvc/llvm-project/parallel-libs/trunk/streamexecutor/unittests/CoreTests/CUDATest.cpp?rev=281653&view=auto
==============================================================================
--- parallel-libs/trunk/streamexecutor/unittests/CoreTests/CUDATest.cpp (original)
+++ parallel-libs/trunk/streamexecutor/unittests/CoreTests/CUDATest.cpp (removed)
@@ -1,215 +0,0 @@
-//===-- CUDATest.cpp - Tests for CUDA platform ----------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file contains the unit tests for CUDA platform code.
-///
-//===----------------------------------------------------------------------===//
-
-#include "streamexecutor/StreamExecutor.h"
-
-#include "gtest/gtest.h"
-
-namespace {
-
-namespace compilergen {
-using SaxpyKernel =
-    streamexecutor::Kernel<float, streamexecutor::GlobalDeviceMemory<float>,
-                           streamexecutor::GlobalDeviceMemory<float>>;
-
-const char *SaxpyPTX = R"(
-  .version 4.3
-  .target sm_20
-  .address_size 64
-
-  .visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) {
-    .reg .f32 %AValue;
-    .reg .f32 %XValue;
-    .reg .f32 %YValue;
-    .reg .f32 %Result;
-
-    .reg .b64 %XBaseAddrGeneric;
-    .reg .b64 %YBaseAddrGeneric;
-    .reg .b64 %XBaseAddrGlobal;
-    .reg .b64 %YBaseAddrGlobal;
-    .reg .b64 %XAddr;
-    .reg .b64 %YAddr;
-    .reg .b64 %ThreadByteOffset;
-
-    .reg .b32 %TID;
-
-    ld.param.f32 %AValue, [A];
-    ld.param.u64 %XBaseAddrGeneric, [X];
-    ld.param.u64 %YBaseAddrGeneric, [Y];
-    cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric;
-    cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric;
-    mov.u32 %TID, %tid.x;
-    mul.wide.u32 %ThreadByteOffset, %TID, 4;
-    add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal;
-    add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal;
-    ld.global.f32 %XValue, [%XAddr];
-    ld.global.f32 %YValue, [%YAddr];
-    fma.rn.f32 %Result, %AValue, %XValue, %YValue;
-    st.global.f32 [%XAddr], %Result;
-    ret;
-  }
-)";
-
-static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() {
-  streamexecutor::MultiKernelLoaderSpec Spec;
-  Spec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}});
-  return Spec;
-}();
-
-using SwapPairsKernel =
-    streamexecutor::Kernel<streamexecutor::SharedDeviceMemory<int>,
-                           streamexecutor::GlobalDeviceMemory<int>, int>;
-
-const char *SwapPairsPTX = R"(
-  .version 4.3
-  .target sm_20
-  .address_size 64
-
-  .extern .shared .align 4 .b8 SwapSpace[];
-
-  .visible .entry SwapPairs(.param .u64 InOut, .param .u32 InOutSize) {
-    .reg .b64 %InOutGeneric;
-    .reg .b32 %InOutSizeValue;
-
-    .reg .b32 %LocalIndex;
-    .reg .b32 %PartnerIndex;
-    .reg .b32 %ThreadsPerBlock;
-    .reg .b32 %BlockIndex;
-    .reg .b32 %GlobalIndex;
-
-    .reg .b32 %GlobalIndexBound;
-    .reg .pred %GlobalIndexTooHigh;
-
-    .reg .b64 %InOutGlobal;
-    .reg .b64 %GlobalByteOffset;
-    .reg .b64 %GlobalAddress;
-
-    .reg .b32 %InitialValue;
-    .reg .b32 %SwappedValue;
-
-    .reg .b64 %SharedBaseAddr;
-    .reg .b64 %LocalWriteByteOffset;
-    .reg .b64 %LocalReadByteOffset;
-    .reg .b64 %SharedWriteAddr;
-    .reg .b64 %SharedReadAddr;
-
-    ld.param.u64 %InOutGeneric, [InOut];
-    ld.param.u32 %InOutSizeValue, [InOutSize];
-    mov.u32 %LocalIndex, %tid.x;
-    mov.u32 %ThreadsPerBlock, %ntid.x;
-    mov.u32 %BlockIndex, %ctaid.x;
-    mad.lo.s32 %GlobalIndex, %ThreadsPerBlock, %BlockIndex, %LocalIndex;
-    and.b32 %GlobalIndexBound, %InOutSizeValue, -2;
-    setp.ge.s32 %GlobalIndexTooHigh, %GlobalIndex, %GlobalIndexBound;
-    @%GlobalIndexTooHigh bra END;
-
-    cvta.to.global.u64 %InOutGlobal, %InOutGeneric;
-    mul.wide.s32 %GlobalByteOffset, %GlobalIndex, 4;
-    add.s64 %GlobalAddress, %InOutGlobal, %GlobalByteOffset;
-    ld.global.u32 %InitialValue, [%GlobalAddress];
-    mul.wide.s32 %LocalWriteByteOffset, %LocalIndex, 4;
-    mov.u64 %SharedBaseAddr, SwapSpace;
-    add.s64 %SharedWriteAddr, %SharedBaseAddr, %LocalWriteByteOffset;
-    st.shared.u32 [%SharedWriteAddr], %InitialValue;
-    bar.sync 0;
-    xor.b32 %PartnerIndex, %LocalIndex, 1;
-    mul.wide.s32 %LocalReadByteOffset, %PartnerIndex, 4;
-    add.s64 %SharedReadAddr, %SharedBaseAddr, %LocalReadByteOffset;
-    ld.shared.u32 %SwappedValue, [%SharedReadAddr];
-    st.global.u32 [%GlobalAddress], %SwappedValue;
-
-  END:
-    ret;
-  }
-)";
-
-static streamexecutor::MultiKernelLoaderSpec SwapPairsLoaderSpec = []() {
-  streamexecutor::MultiKernelLoaderSpec Spec;
-  Spec.addCUDAPTXInMemory("SwapPairs", {{{2, 0}, SwapPairsPTX}});
-  return Spec;
-}();
-} // namespace compilergen
-
-namespace se = ::streamexecutor;
-namespace cg = ::compilergen;
-
-class CUDATest : public ::testing::Test {
-public:
-  CUDATest()
-      : Platform(getOrDie(se::PlatformManager::getPlatformByName("CUDA"))),
-        Device(getOrDie(Platform->getDevice(0))),
-        Stream(getOrDie(Device.createStream())) {}
-
-  se::Platform *Platform;
-  se::Device Device;
-  se::Stream Stream;
-};
-
-TEST_F(CUDATest, Saxpy) {
-  float A = 42.0f;
-  std::vector<float> HostX = {0, 1, 2, 3};
-  std::vector<float> HostY = {4, 5, 6, 7};
-  size_t ArraySize = HostX.size();
-
-  cg::SaxpyKernel Kernel =
-      getOrDie(Device.createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec));
-
-  se::RegisteredHostMemory<float> RegisteredX =
-      getOrDie(Device.registerHostMemory<float>(HostX));
-  se::RegisteredHostMemory<float> RegisteredY =
-      getOrDie(Device.registerHostMemory<float>(HostY));
-
-  se::GlobalDeviceMemory<float> X =
-      getOrDie(Device.allocateDeviceMemory<float>(ArraySize));
-  se::GlobalDeviceMemory<float> Y =
-      getOrDie(Device.allocateDeviceMemory<float>(ArraySize));
-
-  Stream.thenCopyH2D(RegisteredX, X)
-      .thenCopyH2D(RegisteredY, Y)
-      .thenLaunch(ArraySize, 1, Kernel, A, X, Y)
-      .thenCopyD2H(X, RegisteredX);
-  se::dieIfError(Stream.blockHostUntilDone());
-
-  std::vector<float> ExpectedX = {4, 47, 90, 133};
-  EXPECT_EQ(ExpectedX, HostX);
-}
-
-TEST_F(CUDATest, DynamicSharedMemory) {
-  std::vector<int> HostPairs = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  std::vector<int> HostResult(HostPairs.size(), 0);
-  int ArraySize = HostPairs.size();
-
-  cg::SwapPairsKernel Kernel = getOrDie(
-      Device.createKernel<cg::SwapPairsKernel>(cg::SwapPairsLoaderSpec));
-
-  se::RegisteredHostMemory<int> RegisteredPairs =
-      getOrDie(Device.registerHostMemory<int>(HostPairs));
-  se::RegisteredHostMemory<int> RegisteredResult =
-      getOrDie(Device.registerHostMemory<int>(HostResult));
-
-  se::GlobalDeviceMemory<int> Pairs =
-      getOrDie(Device.allocateDeviceMemory<int>(ArraySize));
-  auto SharedMemory =
-      se::SharedDeviceMemory<int>::makeFromElementCount(ArraySize);
-
-  Stream.thenCopyH2D(RegisteredPairs, Pairs)
-      .thenLaunch(ArraySize, 1, Kernel, SharedMemory, Pairs, ArraySize)
-      .thenCopyD2H(Pairs, RegisteredResult);
-  se::dieIfError(Stream.blockHostUntilDone());
-
-  std::vector<int> ExpectedPairs = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10};
-  EXPECT_EQ(ExpectedPairs, HostResult);
-}
-
-} // namespace

Copied: parallel-libs/trunk/streamexecutor/unittests/CoreTests/cuda/CUDATest.cpp (from r281635, parallel-libs/trunk/streamexecutor/unittests/CoreTests/CUDATest.cpp)
URL: http://llvm.org/viewvc/llvm-project/parallel-libs/trunk/streamexecutor/unittests/CoreTests/cuda/CUDATest.cpp?p2=parallel-libs/trunk/streamexecutor/unittests/CoreTests/cuda/CUDATest.cpp&p1=parallel-libs/trunk/streamexecutor/unittests/CoreTests/CUDATest.cpp&r1=281635&r2=281654&rev=281654&view=diff
==============================================================================
    (empty)