#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H

namespace Eigen {

static const int kCudaScratchSize = 1024;
// This defines an interface that GpuDevice can take to use
// CUDA streams underneath.
class StreamInterface {
 public:
  virtual ~StreamInterface() {}

  virtual const cudaStream_t& stream() const = 0;
  virtual const cudaDeviceProp& deviceProperties() const = 0;

  // Allocate memory on the actual device where the computation will run.
  virtual void* allocate(size_t num_bytes) const = 0;
  virtual void deallocate(void* buffer) const = 0;

  // Return a scratchpad buffer of kCudaScratchSize bytes.
  virtual void* scratchpad() const = 0;

  // Return a semaphore. The semaphore is initially initialized to 0, and
  // each kernel using it is responsible for resetting it to 0 upon
  // completion, so that it is always 0 when the next kernel starts.
  virtual unsigned int* semaphore() const = 0;
};
static cudaDeviceProp* m_deviceProperties;
static bool m_devicePropInitialized = false;

static void initializeDeviceProp() {
  if (!m_devicePropInitialized) {
    // The flag is tested a second time to narrow the window in which two
    // threads could both attempt the (non thread-safe) initialization.
    if (!m_devicePropInitialized) {
      int num_devices;
      cudaError_t status = cudaGetDeviceCount(&num_devices);
      if (status != cudaSuccess) {
        std::cerr << "Failed to get the number of CUDA devices: "
                  << cudaGetErrorString(status)
                  << std::endl;
        assert(status == cudaSuccess);
      }
      m_deviceProperties = new cudaDeviceProp[num_devices];
      for (int i = 0; i < num_devices; ++i) {
        status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
        if (status != cudaSuccess) {
          std::cerr << "Failed to initialize CUDA device #" << i << ": "
                    << cudaGetErrorString(status)
                    << std::endl;
          assert(status == cudaSuccess);
        }
      }
      m_devicePropInitialized = true;
    }
  }
}
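
// Usage sketch (not part of the original header): once initializeDeviceProp()
// has run, the cached per-device properties can be read from the array it
// fills. The helper name exampleMultiProcessorCount is hypothetical and only
// illustrates the lookup; real clients go through StreamInterface instead.
inline int exampleMultiProcessorCount(int device) {
  initializeDeviceProp();  // idempotent; populates m_deviceProperties once
  return m_deviceProperties[device].multiProcessorCount;
}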
static const cudaStream_t default_stream = cudaStreamDefault;
class CudaStreamDevice : public StreamInterface {
 public:
  // Use the default stream on the current device.
  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
    cudaGetDevice(&device_);
    initializeDeviceProp();
  }
  // Use the default stream on the specified device.
  CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
    initializeDeviceProp();
  }
  // Use the specified stream. It is the caller's responsibility to ensure
  // that the stream can run on the specified device. If no device is
  // specified, the stream is assumed to be associated with the current device.
  CudaStreamDevice(const cudaStream_t* stream, int device = -1)
      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
    if (device < 0) {
      cudaGetDevice(&device_);
    } else {
      int num_devices;
      cudaError_t err = cudaGetDeviceCount(&num_devices);
      EIGEN_UNUSED_VARIABLE(err)
      assert(err == cudaSuccess);
      assert(device < num_devices);
      device_ = device;
    }
    initializeDeviceProp();
  }
  virtual ~CudaStreamDevice() {
    if (scratch_) {
      deallocate(scratch_);
    }
  }
  const cudaStream_t& stream() const { return *stream_; }
  const cudaDeviceProp& deviceProperties() const {
    return m_deviceProperties[device_];
  }
  virtual void* allocate(size_t num_bytes) const {
    cudaError_t err = cudaSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
    void* result;
    err = cudaMalloc(&result, num_bytes);
    assert(err == cudaSuccess);
    assert(result != NULL);
    return result;
  }
  virtual void deallocate(void* buffer) const {
    cudaError_t err = cudaSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
    assert(buffer != NULL);
    err = cudaFree(buffer);
    assert(err == cudaSuccess);
  }
  virtual void* scratchpad() const {
    if (scratch_ == NULL) {
      // Allocate kCudaScratchSize bytes plus room for the trailing
      // semaphore word handed out by semaphore().
      scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int));
    }
    return scratch_;
  }
  virtual unsigned int* semaphore() const {
    if (semaphore_ == NULL) {
      // The semaphore lives in the word just past the scratch buffer.
      char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize;
      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
      cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
      EIGEN_UNUSED_VARIABLE(err)
      assert(err == cudaSuccess);
    }
    return semaphore_;
  }
 private:
  const cudaStream_t* stream_;
  int device_;
  mutable void* scratch_;
  mutable unsigned int* semaphore_;
};
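
// Usage sketch (not part of the original header): wrapping a caller-owned
// CUDA stream in a CudaStreamDevice. The function name exampleWrapStream is
// hypothetical and only illustrates the intended call sequence.
inline void exampleWrapStream() {
  cudaStream_t stream;
  cudaError_t err = cudaStreamCreate(&stream);
  EIGEN_UNUSED_VARIABLE(err)
  assert(err == cudaSuccess);
  {
    // The device does not take ownership of the stream.
    CudaStreamDevice stream_device(&stream, /*device=*/0);
    void* buf = stream_device.allocate(256);
    stream_device.deallocate(buf);
  }  // ~CudaStreamDevice frees its lazily allocated scratchpad, if any
  cudaStreamDestroy(stream);
}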
struct GpuDevice {
  // The StreamInterface is not owned: the caller is responsible for its
  // initialization and eventual destruction.
  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
    eigen_assert(stream);
  }
  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
    eigen_assert(stream);
  }
  EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
    return stream_->stream();
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
#ifndef __CUDA_ARCH__
    return stream_->allocate(num_bytes);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
    return NULL;
#endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
#ifndef __CUDA_ARCH__
    stream_->deallocate(buffer);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* scratchpad() const {
#ifndef __CUDA_ARCH__
    return stream_->scratchpad();
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
    return NULL;
#endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned int* semaphore() const {
#ifndef __CUDA_ARCH__
    return stream_->semaphore();
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
    return NULL;
#endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
                                      stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err =
        cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err =
        cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
    // FIXME
    return 32;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
    // FIXME
    return 48*1024;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // We won't try to take advantage of the l2 cache for the time being, and
    // there is no l3 cache on cuda devices.
    return firstLevelCacheSize();
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
    cudaError_t err = cudaStreamSynchronize(stream_->stream());
    if (err != cudaSuccess) {
      std::cerr << "Error detected in CUDA stream: "
                << cudaGetErrorString(err)
                << std::endl;
      assert(err == cudaSuccess);
    }
#else
    assert(false && "The default device should be used instead to generate kernel code");
#endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
#ifndef __CUDA_ARCH__
    return stream_->deviceProperties().multiProcessorCount;
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
    return 0;
#endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
#ifndef __CUDA_ARCH__
    return stream_->deviceProperties().maxThreadsPerBlock;
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
    return 0;
#endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
#ifndef __CUDA_ARCH__
    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
    return 0;
#endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
#ifndef __CUDA_ARCH__
    return stream_->deviceProperties().sharedMemPerBlock;
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
    return 0;
#endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
#ifndef __CUDA_ARCH__
    return stream_->deviceProperties().major;
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
    return 0;
#endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int minorDeviceVersion() const {
#ifndef __CUDA_ARCH__
    return stream_->deviceProperties().minor;
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
    return 0;
#endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const {
    return max_blocks_;
  }

  // This function checks if the CUDA runtime recorded an error for the
  // underlying stream.
  inline bool ok() const {
#ifdef __CUDACC__
    cudaError_t error = cudaStreamQuery(stream_->stream());
    return (error == cudaSuccess) || (error == cudaErrorNotReady);
#else
    return false;
#endif
  }

 private:
  const StreamInterface* stream_;
  int max_blocks_;
};
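
// Usage sketch (not part of the original header): driving a GpuDevice built
// on top of a CudaStreamDevice. The function name exampleUseGpuDevice is
// hypothetical and only illustrates the intended call sequence.
inline void exampleUseGpuDevice() {
  CudaStreamDevice stream_device;   // default stream on the current device
  GpuDevice device(&stream_device);
  const size_t n = 1024;
  float host[1024] = {0};
  void* dev_buf = device.allocate(n * sizeof(float));
  device.memset(dev_buf, 0, n * sizeof(float));                 // async
  device.memcpyDeviceToHost(host, dev_buf, n * sizeof(float));  // async
  device.synchronize();             // wait before reading `host`
  if (!device.ok()) {
    std::cerr << "CUDA stream is in an error state" << std::endl;
  }
  device.deallocate(dev_buf);
}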
#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)            \
  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__);  \
  assert(cudaGetLastError() == cudaSuccess);

// FIXME: Should be device and kernel specific.
#ifdef __CUDACC__
static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
#ifndef __CUDA_ARCH__
  cudaError_t status = cudaDeviceSetSharedMemConfig(config);
  EIGEN_UNUSED_VARIABLE(status)
  assert(status == cudaSuccess);
#else
  EIGEN_UNUSED_VARIABLE(config)
#endif
}
#endif
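
// Usage sketch (not part of the original header): launching a user-defined
// kernel through LAUNCH_CUDA_KERNEL. The kernel ExampleFillKernel and the
// helper exampleLaunch are hypothetical; only the macro and GpuDevice come
// from this file.
#ifdef __CUDACC__
__global__ void ExampleFillKernel(float* out, int n, float value) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = value;
}

inline void exampleLaunch(const GpuDevice& device, float* out, int n) {
  const int block_size = 256;
  const int num_blocks = (n + block_size - 1) / block_size;
  // The macro launches on the device's stream and checks cudaGetLastError().
  LAUNCH_CUDA_KERNEL(ExampleFillKernel, num_blocks, block_size, 0, device, out, n, 1.0f);
}
#endif  // __CUDACC__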
}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H