[Parallel_libs-commits] [parallel-libs] r281654 - [SE] Fix config bug with CUDA tests
Jason Henline via Parallel_libs-commits
parallel_libs-commits at lists.llvm.org
Thu Sep 15 13:26:29 PDT 2016
Author: jhen
Date: Thu Sep 15 15:26:28 2016
New Revision: 281654
URL: http://llvm.org/viewvc/llvm-project?rev=281654&view=rev
Log:
[SE] Fix config bug with CUDA tests
Summary:
It turns out CMake errors out if a processed directory contains source
files that are not used. This was causing an error with the CUDATest.cpp
file when configuring StreamExecutor with the CUDA platform disabled.
Moving CUDATest.cpp to its own directory fixes this problem.
Reviewers: jlebar, jprice
Subscribers: beanz, mgorny, jlebar, parallel_libs-commits
Differential Revision: https://reviews.llvm.org/D24618
Added:
parallel-libs/trunk/streamexecutor/unittests/CoreTests/cuda/
parallel-libs/trunk/streamexecutor/unittests/CoreTests/cuda/CUDATest.cpp
- copied, changed from r281635, parallel-libs/trunk/streamexecutor/unittests/CoreTests/CUDATest.cpp
Removed:
parallel-libs/trunk/streamexecutor/unittests/CoreTests/CUDATest.cpp
Modified:
parallel-libs/trunk/streamexecutor/unittests/CoreTests/CMakeLists.txt
Modified: parallel-libs/trunk/streamexecutor/unittests/CoreTests/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/parallel-libs/trunk/streamexecutor/unittests/CoreTests/CMakeLists.txt?rev=281654&r1=281653&r2=281654&view=diff
==============================================================================
--- parallel-libs/trunk/streamexecutor/unittests/CoreTests/CMakeLists.txt (original)
+++ parallel-libs/trunk/streamexecutor/unittests/CoreTests/CMakeLists.txt Thu Sep 15 15:26:28 2016
@@ -1,5 +1,5 @@
if(STREAM_EXECUTOR_ENABLE_CUDA_PLATFORM)
- set(CUDA_TEST_SOURCES CUDATest.cpp)
+ set(CUDA_TEST_SOURCES cuda/CUDATest.cpp)
endif()
add_se_unittest(
Removed: parallel-libs/trunk/streamexecutor/unittests/CoreTests/CUDATest.cpp
URL: http://llvm.org/viewvc/llvm-project/parallel-libs/trunk/streamexecutor/unittests/CoreTests/CUDATest.cpp?rev=281653&view=auto
==============================================================================
--- parallel-libs/trunk/streamexecutor/unittests/CoreTests/CUDATest.cpp (original)
+++ parallel-libs/trunk/streamexecutor/unittests/CoreTests/CUDATest.cpp (removed)
@@ -1,215 +0,0 @@
-//===-- CUDATest.cpp - Tests for CUDA platform ----------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file contains the unit tests for CUDA platform code.
-///
-//===----------------------------------------------------------------------===//
-
-#include "streamexecutor/StreamExecutor.h"
-
-#include "gtest/gtest.h"
-
-namespace {
-
-namespace compilergen {
-using SaxpyKernel =
- streamexecutor::Kernel<float, streamexecutor::GlobalDeviceMemory<float>,
- streamexecutor::GlobalDeviceMemory<float>>;
-
-const char *SaxpyPTX = R"(
- .version 4.3
- .target sm_20
- .address_size 64
-
- .visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) {
- .reg .f32 %AValue;
- .reg .f32 %XValue;
- .reg .f32 %YValue;
- .reg .f32 %Result;
-
- .reg .b64 %XBaseAddrGeneric;
- .reg .b64 %YBaseAddrGeneric;
- .reg .b64 %XBaseAddrGlobal;
- .reg .b64 %YBaseAddrGlobal;
- .reg .b64 %XAddr;
- .reg .b64 %YAddr;
- .reg .b64 %ThreadByteOffset;
-
- .reg .b32 %TID;
-
- ld.param.f32 %AValue, [A];
- ld.param.u64 %XBaseAddrGeneric, [X];
- ld.param.u64 %YBaseAddrGeneric, [Y];
- cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric;
- cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric;
- mov.u32 %TID, %tid.x;
- mul.wide.u32 %ThreadByteOffset, %TID, 4;
- add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal;
- add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal;
- ld.global.f32 %XValue, [%XAddr];
- ld.global.f32 %YValue, [%YAddr];
- fma.rn.f32 %Result, %AValue, %XValue, %YValue;
- st.global.f32 [%XAddr], %Result;
- ret;
- }
-)";
-
-static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() {
- streamexecutor::MultiKernelLoaderSpec Spec;
- Spec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}});
- return Spec;
-}();
-
-using SwapPairsKernel =
- streamexecutor::Kernel<streamexecutor::SharedDeviceMemory<int>,
- streamexecutor::GlobalDeviceMemory<int>, int>;
-
-const char *SwapPairsPTX = R"(
- .version 4.3
- .target sm_20
- .address_size 64
-
- .extern .shared .align 4 .b8 SwapSpace[];
-
- .visible .entry SwapPairs(.param .u64 InOut, .param .u32 InOutSize) {
- .reg .b64 %InOutGeneric;
- .reg .b32 %InOutSizeValue;
-
- .reg .b32 %LocalIndex;
- .reg .b32 %PartnerIndex;
- .reg .b32 %ThreadsPerBlock;
- .reg .b32 %BlockIndex;
- .reg .b32 %GlobalIndex;
-
- .reg .b32 %GlobalIndexBound;
- .reg .pred %GlobalIndexTooHigh;
-
- .reg .b64 %InOutGlobal;
- .reg .b64 %GlobalByteOffset;
- .reg .b64 %GlobalAddress;
-
- .reg .b32 %InitialValue;
- .reg .b32 %SwappedValue;
-
- .reg .b64 %SharedBaseAddr;
- .reg .b64 %LocalWriteByteOffset;
- .reg .b64 %LocalReadByteOffset;
- .reg .b64 %SharedWriteAddr;
- .reg .b64 %SharedReadAddr;
-
- ld.param.u64 %InOutGeneric, [InOut];
- ld.param.u32 %InOutSizeValue, [InOutSize];
- mov.u32 %LocalIndex, %tid.x;
- mov.u32 %ThreadsPerBlock, %ntid.x;
- mov.u32 %BlockIndex, %ctaid.x;
- mad.lo.s32 %GlobalIndex, %ThreadsPerBlock, %BlockIndex, %LocalIndex;
- and.b32 %GlobalIndexBound, %InOutSizeValue, -2;
- setp.ge.s32 %GlobalIndexTooHigh, %GlobalIndex, %GlobalIndexBound;
- @%GlobalIndexTooHigh bra END;
-
- cvta.to.global.u64 %InOutGlobal, %InOutGeneric;
- mul.wide.s32 %GlobalByteOffset, %GlobalIndex, 4;
- add.s64 %GlobalAddress, %InOutGlobal, %GlobalByteOffset;
- ld.global.u32 %InitialValue, [%GlobalAddress];
- mul.wide.s32 %LocalWriteByteOffset, %LocalIndex, 4;
- mov.u64 %SharedBaseAddr, SwapSpace;
- add.s64 %SharedWriteAddr, %SharedBaseAddr, %LocalWriteByteOffset;
- st.shared.u32 [%SharedWriteAddr], %InitialValue;
- bar.sync 0;
- xor.b32 %PartnerIndex, %LocalIndex, 1;
- mul.wide.s32 %LocalReadByteOffset, %PartnerIndex, 4;
- add.s64 %SharedReadAddr, %SharedBaseAddr, %LocalReadByteOffset;
- ld.shared.u32 %SwappedValue, [%SharedReadAddr];
- st.global.u32 [%GlobalAddress], %SwappedValue;
-
- END:
- ret;
- }
-)";
-
-static streamexecutor::MultiKernelLoaderSpec SwapPairsLoaderSpec = []() {
- streamexecutor::MultiKernelLoaderSpec Spec;
- Spec.addCUDAPTXInMemory("SwapPairs", {{{2, 0}, SwapPairsPTX}});
- return Spec;
-}();
-} // namespace compilergen
-
-namespace se = ::streamexecutor;
-namespace cg = ::compilergen;
-
-class CUDATest : public ::testing::Test {
-public:
- CUDATest()
- : Platform(getOrDie(se::PlatformManager::getPlatformByName("CUDA"))),
- Device(getOrDie(Platform->getDevice(0))),
- Stream(getOrDie(Device.createStream())) {}
-
- se::Platform *Platform;
- se::Device Device;
- se::Stream Stream;
-};
-
-TEST_F(CUDATest, Saxpy) {
- float A = 42.0f;
- std::vector<float> HostX = {0, 1, 2, 3};
- std::vector<float> HostY = {4, 5, 6, 7};
- size_t ArraySize = HostX.size();
-
- cg::SaxpyKernel Kernel =
- getOrDie(Device.createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec));
-
- se::RegisteredHostMemory<float> RegisteredX =
- getOrDie(Device.registerHostMemory<float>(HostX));
- se::RegisteredHostMemory<float> RegisteredY =
- getOrDie(Device.registerHostMemory<float>(HostY));
-
- se::GlobalDeviceMemory<float> X =
- getOrDie(Device.allocateDeviceMemory<float>(ArraySize));
- se::GlobalDeviceMemory<float> Y =
- getOrDie(Device.allocateDeviceMemory<float>(ArraySize));
-
- Stream.thenCopyH2D(RegisteredX, X)
- .thenCopyH2D(RegisteredY, Y)
- .thenLaunch(ArraySize, 1, Kernel, A, X, Y)
- .thenCopyD2H(X, RegisteredX);
- se::dieIfError(Stream.blockHostUntilDone());
-
- std::vector<float> ExpectedX = {4, 47, 90, 133};
- EXPECT_EQ(ExpectedX, HostX);
-}
-
-TEST_F(CUDATest, DynamicSharedMemory) {
- std::vector<int> HostPairs = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
- std::vector<int> HostResult(HostPairs.size(), 0);
- int ArraySize = HostPairs.size();
-
- cg::SwapPairsKernel Kernel = getOrDie(
- Device.createKernel<cg::SwapPairsKernel>(cg::SwapPairsLoaderSpec));
-
- se::RegisteredHostMemory<int> RegisteredPairs =
- getOrDie(Device.registerHostMemory<int>(HostPairs));
- se::RegisteredHostMemory<int> RegisteredResult =
- getOrDie(Device.registerHostMemory<int>(HostResult));
-
- se::GlobalDeviceMemory<int> Pairs =
- getOrDie(Device.allocateDeviceMemory<int>(ArraySize));
- auto SharedMemory =
- se::SharedDeviceMemory<int>::makeFromElementCount(ArraySize);
-
- Stream.thenCopyH2D(RegisteredPairs, Pairs)
- .thenLaunch(ArraySize, 1, Kernel, SharedMemory, Pairs, ArraySize)
- .thenCopyD2H(Pairs, RegisteredResult);
- se::dieIfError(Stream.blockHostUntilDone());
-
- std::vector<int> ExpectedPairs = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10};
- EXPECT_EQ(ExpectedPairs, HostResult);
-}
-
-} // namespace
Copied: parallel-libs/trunk/streamexecutor/unittests/CoreTests/cuda/CUDATest.cpp (from r281635, parallel-libs/trunk/streamexecutor/unittests/CoreTests/CUDATest.cpp)
URL: http://llvm.org/viewvc/llvm-project/parallel-libs/trunk/streamexecutor/unittests/CoreTests/cuda/CUDATest.cpp?p2=parallel-libs/trunk/streamexecutor/unittests/CoreTests/cuda/CUDATest.cpp&p1=parallel-libs/trunk/streamexecutor/unittests/CoreTests/CUDATest.cpp&r1=281635&r2=281654&rev=281654&view=diff
==============================================================================
(empty)
More information about the Parallel_libs-commits mailing list