diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake
index e03feabffcb..1c503052efb 100644
--- a/cmake/Cuda.cmake
+++ b/cmake/Cuda.cmake
@@ -198,7 +198,11 @@ function(detect_cuDNN)
     set(HAVE_CUDNN  TRUE PARENT_SCOPE)
     set(CUDNN_FOUND TRUE PARENT_SCOPE)
 
-    file(READ ${CUDNN_INCLUDE}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
+    if(EXISTS ${CUDNN_INCLUDE}/cudnn_version.h)
+      file(READ ${CUDNN_INCLUDE}/cudnn_version.h CUDNN_VERSION_FILE_CONTENTS)
+    else()
+      file(READ ${CUDNN_INCLUDE}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
+    endif()
 
     # cuDNN v3 and beyond
     string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
@@ -224,7 +228,7 @@ function(detect_cuDNN)
     string(COMPARE LESS "${CUDNN_VERSION_MAJOR}" 3 cuDNNVersionIncompatible)
     if(cuDNNVersionIncompatible)
-      message(FATAL_ERROR "cuDNN version >3 is required.")
+      message("cuDNN version >3 is required.")
     endif()
 
     set(CUDNN_VERSION "${CUDNN_VERSION}" PARENT_SCOPE)
diff --git a/include/caffe/layers/cudnn_conv_layer.hpp b/include/caffe/layers/cudnn_conv_layer.hpp
index 31fe49a71fa..1d0059930cc 100644
--- a/include/caffe/layers/cudnn_conv_layer.hpp
+++ b/include/caffe/layers/cudnn_conv_layer.hpp
@@ -30,7 +30,9 @@ template <typename Dtype>
 class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
  public:
   explicit CuDNNConvolutionLayer(const LayerParameter& param)
-      : ConvolutionLayer<Dtype>(param), handles_setup_(false) {}
+      : ConvolutionLayer<Dtype>(param),
+        handles_setup_(false),
+        shapes_ready_(false) {}
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -43,7 +45,10 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
+  void findOptimalAlgorithm(int index, size_t workspace_limit_bytes);
+
   bool handles_setup_;
+  bool shapes_ready_;
   cudnnHandle_t* handle_;
   cudaStream_t*  stream_;
@@ -58,6 +63,8 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
   vector<cudnnConvolutionDescriptor_t> conv_descs_;
   int bottom_offset_, top_offset_, bias_offset_;
 
+  std::vector<int> cudnn_shape_;
+
   size_t *workspace_fwd_sizes_;
   size_t *workspace_bwd_data_sizes_;
   size_t *workspace_bwd_filter_sizes_;
diff --git a/include/caffe/layers/cudnn_deconv_layer.hpp b/include/caffe/layers/cudnn_deconv_layer.hpp
index 12799e5b8ef..a19a353a132 100644
--- a/include/caffe/layers/cudnn_deconv_layer.hpp
+++ b/include/caffe/layers/cudnn_deconv_layer.hpp
@@ -25,7 +25,9 @@ template <typename Dtype>
 class CuDNNDeconvolutionLayer : public DeconvolutionLayer<Dtype> {
  public:
   explicit CuDNNDeconvolutionLayer(const LayerParameter& param)
-      : DeconvolutionLayer<Dtype>(param), handles_setup_(false) {}
+      : DeconvolutionLayer<Dtype>(param),
+        handles_setup_(false),
+        shapes_ready_(false) {}
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -39,7 +41,11 @@ class CuDNNDeconvolutionLayer : public DeconvolutionLayer<Dtype> {
       const vector<bool>& propagate_down,
       const vector<Blob<Dtype>*>& bottom);
 
+  void findOptimalAlgorithm(int index, size_t workspace_limit_bytes);
+  void getWorkSpaces(int index);
+
   bool handles_setup_;
+  bool shapes_ready_;
   cudnnHandle_t* handle_;
   cudaStream_t* stream_;
@@ -54,6 +60,8 @@ class CuDNNDeconvolutionLayer : public DeconvolutionLayer<Dtype> {
   vector<cudnnConvolutionDescriptor_t> conv_descs_;
   int bottom_offset_, top_offset_, bias_offset_;
 
+  std::vector<int> cudnn_shape_;
+
   size_t *workspace_fwd_sizes_;
   size_t *workspace_bwd_data_sizes_;
   size_t *workspace_bwd_filter_sizes_;
diff --git a/include/caffe/util/cudnn.hpp b/include/caffe/util/cudnn.hpp
index cd3f93f6e28..73944486e6d 100644
--- a/include/caffe/util/cudnn.hpp
+++ b/include/caffe/util/cudnn.hpp
@@ -3,6 +3,8 @@
 #ifdef USE_CUDNN
 
 #include <cudnn.h>
+#include <algorithm>
+#include <vector>
 
 #include "caffe/common.hpp"
 #include "caffe/proto/caffe.pb.h"
@@ -50,6 +52,10 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
       return "CUDNN_STATUS_RUNTIME_IN_PROGRESS";
     case CUDNN_STATUS_RUNTIME_FP_OVERFLOW:
       return "CUDNN_STATUS_RUNTIME_FP_OVERFLOW";
+#endif
+#if CUDNN_VERSION_MIN(8, 0, 0)
+    case CUDNN_STATUS_VERSION_MISMATCH:
+      return "CUDNN_STATUS_VERSION_MISMATCH";
 #endif
   }
   return "Unknown cudnn status";
@@ -161,6 +167,40 @@ inline void createActivationDescriptor(cudnnActivationDescriptor_t* activ_desc,
         CUDNN_PROPAGATE_NAN, Dtype(0)));
 }
 
+template <typename T>
+inline T findFirstSuitableAlgorithm(std::vector<T> const &v,
+                                    size_t count, size_t limit) {
+  count = std::min(count, v.size());
+  for (size_t i = 0; i < count; i++) {
+    if (v[i].memory <= limit) {
+      return v[i];
+    }
+  }
+  if (!v.empty()) {
+    return v[0];
+  }
+  return T();
+}
+
+/// Check that the new shape is identical to the previous one except for the
+/// batch size, which is allowed to be smaller.
+inline bool areConvShapesCompatible(std::vector<int> const &src,
+                                    std::vector<int> const &reshaped) {
+  if (src.size() != reshaped.size() || src.size() < 2) {
+    return false;
+  }
+  if (src[0] < reshaped[0]) {
+    return false;
+  }
+  for (size_t i = 1; i < src.size(); i++) {
+    if (src[i] != reshaped[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+
 }  // namespace cudnn
 
 }  // namespace caffe
diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp
index efc9e04e8c0..3bd0c4fc04f 100644
--- a/src/caffe/layers/cudnn_conv_layer.cpp
+++ b/src/caffe/layers/cudnn_conv_layer.cpp
@@ -85,12 +85,133 @@ void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
   }
 
   handles_setup_ = true;
+  shapes_ready_ = false;
 }
+#if CUDNN_VERSION_MIN(7, 0, 0)
+// Using the cudnnGet*Algorithm_v7 API
+template <typename Dtype>
+void CuDNNConvolutionLayer<Dtype>::findOptimalAlgorithm(int index,
+    size_t workspace_limit_bytes) {
+
+  int nfwd = 1, nbwd_filter = 1, nbwd_data = 1;
+  CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithmMaxCount(
+      handle_[0], &nfwd));
+  CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
+      handle_[0], &nbwd_filter));
+  CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(
+      handle_[0], &nbwd_data));
+
+  std::vector<cudnnConvolutionFwdAlgoPerf_t>
+      fwd_v(std::max(nfwd, 1));
+
+  std::vector<cudnnConvolutionBwdFilterAlgoPerf_t>
+      bwd_filter_v(std::max(nbwd_filter, 1));
+
+  std::vector<cudnnConvolutionBwdDataAlgoPerf_t>
+      bwd_data_v(std::max(nbwd_data, 1));
+
+  cudnnConvolutionFwdAlgoPerf_t fwd_perf;
+  cudnnConvolutionBwdFilterAlgoPerf_t bwd_filter_perf;
+  cudnnConvolutionBwdDataAlgoPerf_t bwd_data_perf;
+  int count = 0;
+  // choose forward and backward algorithms + workspace(s)
+  CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(handle_[0],
+      bottom_descs_[index],
+      filter_desc_,
+      conv_descs_[index],
+      top_descs_[index],
+      fwd_v.size(), &count, &fwd_v[0]));
+
+  fwd_perf = cudnn::findFirstSuitableAlgorithm(fwd_v,
+      count, workspace_limit_bytes);
+  fwd_algo_[index] = fwd_perf.algo;
+  workspace_fwd_sizes_[index] = fwd_perf.memory;
+
+  // choose backward algorithm for filter
+  CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm_v7(handle_[0],
+      bottom_descs_[index],
+      top_descs_[index],
+      conv_descs_[index],
+      filter_desc_,
+      bwd_filter_v.size(),
+      &count,
+      &bwd_filter_v[0]));
+
+  bwd_filter_perf = cudnn::findFirstSuitableAlgorithm(bwd_filter_v,
+      count, workspace_limit_bytes);
+  bwd_filter_algo_[index] = bwd_filter_perf.algo;
+  workspace_bwd_filter_sizes_[index] = bwd_filter_perf.memory;
+
+  // choose backward algo for data
+  CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm_v7(handle_[0],
+      filter_desc_, top_descs_[index], conv_descs_[index],
+      bottom_descs_[index],
+      bwd_data_v.size(), &count, &bwd_data_v[0]));
+
+  bwd_data_perf = cudnn::findFirstSuitableAlgorithm(bwd_data_v,
+      count, workspace_limit_bytes);
+  bwd_data_algo_[index] = bwd_data_perf.algo;
+  workspace_bwd_data_sizes_[index] = bwd_data_perf.memory;
+
+}
+#else
+
+template <typename Dtype>
+void CuDNNConvolutionLayer<Dtype>::findOptimalAlgorithm(int index,
+    size_t workspace_limit_bytes) {
+  // choose forward and backward algorithms + workspace(s)
+  CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[0],
+      bottom_descs_[index],
+      filter_desc_,
+      conv_descs_[index],
+      top_descs_[index],
+      CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+      workspace_limit_bytes,
+      &fwd_algo_[index]));
+
+  // choose backward algorithm for filter
+  CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(handle_[0],
+      bottom_descs_[index], top_descs_[index], conv_descs_[index],
+      filter_desc_,
+      CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+      workspace_limit_bytes, &bwd_filter_algo_[index]));
+
+  // choose backward algo for data
+  CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(handle_[0],
+      filter_desc_, top_descs_[index], conv_descs_[index],
+      bottom_descs_[index],
+      CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+      workspace_limit_bytes, &bwd_data_algo_[index]));
+
+  CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle_[0],
+      bottom_descs_[index],
+      filter_desc_,
+      conv_descs_[index],
+      top_descs_[index],
+      fwd_algo_[index],
+      &(workspace_fwd_sizes_[index])));
+
+  // get workspace for backwards filter algorithm
+  CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(handle_[0],
+      bottom_descs_[index], top_descs_[index], conv_descs_[index],
+      filter_desc_, bwd_filter_algo_[index],
+      &workspace_bwd_filter_sizes_[index]));
+
+
+  // get workspace size
+  CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(handle_[0],
+      filter_desc_, top_descs_[index], conv_descs_[index], bottom_descs_[index],
+      bwd_data_algo_[index], &workspace_bwd_data_sizes_[index]));
+}
+#endif
+
+
 template <typename Dtype>
 void CuDNNConvolutionLayer<Dtype>::Reshape(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
   ConvolutionLayer<Dtype>::Reshape(bottom, top);
+
   CHECK_EQ(2, this->num_spatial_axes_)
       << "CuDNNConvolution input must have 2 spatial axes "
       << "(e.g., height and width). "
" @@ -112,6 +233,9 @@ void CuDNNConvolutionLayer::Reshape( // planning strategy and a rewrite of Caffe's GPU memory mangagement size_t workspace_limit_bytes = 8*1024*1024; + bool select_algo = !shapes_ready_ + || cudnn_shape_ != bottom[0]->shape(); + for (int i = 0; i < bottom.size(); i++) { cudnn::setTensor4dDesc(&bottom_descs_[i], this->num_, @@ -127,46 +251,14 @@ void CuDNNConvolutionLayer::Reshape( filter_desc_, pad_h, pad_w, stride_h, stride_w); - // choose forward and backward algorithms + workspace(s) - CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[0], - bottom_descs_[i], - filter_desc_, - conv_descs_[i], - top_descs_[i], - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_limit_bytes, - &fwd_algo_[i])); - - CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle_[0], - bottom_descs_[i], - filter_desc_, - conv_descs_[i], - top_descs_[i], - fwd_algo_[i], - &(workspace_fwd_sizes_[i]))); - - // choose backward algorithm for filter - CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(handle_[0], - bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_, - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_limit_bytes, &bwd_filter_algo_[i]) ); - - // get workspace for backwards filter algorithm - CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(handle_[0], - bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_, - bwd_filter_algo_[i], &workspace_bwd_filter_sizes_[i])); - - // choose backward algo for data - CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(handle_[0], - filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i], - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_limit_bytes, &bwd_data_algo_[i])); - - // get workspace size - CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(handle_[0], - filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i], - bwd_data_algo_[i], &workspace_bwd_data_sizes_[i]) ); + if (select_algo) { + findOptimalAlgorithm(i, workspace_limit_bytes); + } + } + if (select_algo) { + cudnn_shape_ = bottom[0]->shape(); } + shapes_ready_ = true; // reduce over all workspace sizes to get a maximum to allocate / reallocate size_t total_workspace_fwd = 0; @@ -185,6 +277,10 @@ void CuDNNConvolutionLayer::Reshape( size_t max_workspace = std::max(total_workspace_fwd, total_workspace_bwd_data); max_workspace = std::max(max_workspace, total_workspace_bwd_filter); + // ensure alignment + const size_t aligmnent_workspace = 128; + max_workspace = (max_workspace + aligmnent_workspace - 1) + / aligmnent_workspace * aligmnent_workspace; // ensure all groups have enough workspace size_t total_max_workspace = max_workspace * (this->group_ * CUDNN_STREAMS_PER_GROUP); @@ -192,8 +288,11 @@ void CuDNNConvolutionLayer::Reshape( // this is the total amount of storage needed over all groups + streams if (total_max_workspace > workspaceSizeInBytes) { DLOG(INFO) << "Reallocating workspace storage: " << total_max_workspace; + static size_t total_ws_global = 0; + total_ws_global += total_max_workspace - workspaceSizeInBytes; workspaceSizeInBytes = total_max_workspace; + // free the existing workspace and allocate a new (larger) one cudaFree(this->workspaceData); diff --git a/src/caffe/layers/cudnn_deconv_layer.cpp b/src/caffe/layers/cudnn_deconv_layer.cpp index 260da5c1ee0..698391c057d 100644 --- a/src/caffe/layers/cudnn_deconv_layer.cpp +++ b/src/caffe/layers/cudnn_deconv_layer.cpp @@ -87,8 +87,151 @@ void CuDNNDeconvolutionLayer::LayerSetUp( } handles_setup_ = true; + shapes_ready_ = 
 }
 
+#if CUDNN_VERSION_MIN(7, 0, 0)
+// Using the cudnnGet*Algorithm_v7 API
+template <typename Dtype>
+void CuDNNDeconvolutionLayer<Dtype>::findOptimalAlgorithm(int index,
+    size_t workspace_limit_bytes) {
+  // Unlike in conv, we do not query workspace sizes here, since the chosen
+  // algorithm may still be changed heuristically; getWorkSpaces() does that.
+
+  int nfwd = 1, nbwd_filter = 1, nbwd_data = 1;
+  CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithmMaxCount(
+      handle_[0], &nfwd));
+  CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
+      handle_[0], &nbwd_filter));
+  CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(
+      handle_[0], &nbwd_data));
+
+  std::vector<cudnnConvolutionFwdAlgoPerf_t>
+      fwd_v(std::max(nfwd, 1));
+
+  std::vector<cudnnConvolutionBwdFilterAlgoPerf_t>
+      bwd_filter_v(std::max(nbwd_filter, 1));
+
+  std::vector<cudnnConvolutionBwdDataAlgoPerf_t>
+      bwd_data_v(std::max(nbwd_data, 1));
+
+  cudnnConvolutionFwdAlgoPerf_t fwd_perf;
+  cudnnConvolutionBwdFilterAlgoPerf_t bwd_filter_perf;
+  cudnnConvolutionBwdDataAlgoPerf_t bwd_data_perf;
+  int count = 0;
+  // choose forward and backward algorithms + workspace(s)
+  CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(
+      handle_[0],
+      top_descs_[index],
+      filter_desc_,
+      conv_descs_[index],
+      bottom_descs_[index],
+      fwd_v.size(), &count, &fwd_v[0]));
+
+  fwd_perf = cudnn::findFirstSuitableAlgorithm(fwd_v,
+      count, workspace_limit_bytes);
+  fwd_algo_[index] = fwd_perf.algo;
+
+  // choose backward algorithm for filter
+  CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm_v7(
+      handle_[0],
+      top_descs_[index],
+      bottom_descs_[index],
+      conv_descs_[index],
+      filter_desc_,
+      bwd_filter_v.size(), &count, &bwd_filter_v[0]));
+
+  bwd_filter_perf = cudnn::findFirstSuitableAlgorithm(bwd_filter_v,
+      count, workspace_limit_bytes);
+  bwd_filter_algo_[index] = bwd_filter_perf.algo;
+
+  // choose backward algo for data
+  CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm_v7(
+      handle_[0],
+      filter_desc_,
+      bottom_descs_[index],
+      conv_descs_[index],
+      top_descs_[index],
+      bwd_data_v.size(), &count, &bwd_data_v[0]));
+
+  bwd_data_perf = cudnn::findFirstSuitableAlgorithm(bwd_data_v,
+      count, workspace_limit_bytes);
+  bwd_data_algo_[index] = bwd_data_perf.algo;
+}
+#else
+template <typename Dtype>
+void CuDNNDeconvolutionLayer<Dtype>::findOptimalAlgorithm(int index,
+    size_t workspace_limit_bytes) {
+  // choose forward and backward algorithms + workspace(s)
+  CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
+      handle_[0],
+      top_descs_[index],
+      filter_desc_,
+      conv_descs_[index],
+      bottom_descs_[index],
+      CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+      workspace_limit_bytes,
+      &fwd_algo_[index]));
+
+  // choose backward algorithm for filter
+  CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
+      handle_[0],
+      top_descs_[index],
+      bottom_descs_[index],
+      conv_descs_[index],
+      filter_desc_,
+      CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+      workspace_limit_bytes,
+      &bwd_filter_algo_[index]));
+
+  // choose backward algo for data
+  CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
+      handle_[0],
+      filter_desc_,
+      bottom_descs_[index],
+      conv_descs_[index],
+      top_descs_[index],
+      CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+      workspace_limit_bytes,
+      &bwd_data_algo_[index]));
+}
+#endif
+
+template <typename Dtype>
+void CuDNNDeconvolutionLayer<Dtype>::getWorkSpaces(int index) {
+  CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
+      handle_[0],
+      top_descs_[index],
+      filter_desc_,
+      conv_descs_[index],
+      bottom_descs_[index],
+      fwd_algo_[index],
+      &(workspace_fwd_sizes_[index])));
+
+  // get workspace for backwards filter algorithm
+  CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
+      handle_[0],
+      top_descs_[index],
+      bottom_descs_[index],
+      conv_descs_[index],
+      filter_desc_,
+      bwd_filter_algo_[index],
+      &workspace_bwd_filter_sizes_[index]));
+
+
+  // get workspace size
+  CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
+      handle_[0],
+      filter_desc_,
+      bottom_descs_[index],
+      conv_descs_[index],
+      top_descs_[index],
+      bwd_data_algo_[index],
+      &workspace_bwd_data_sizes_[index]));
+}
+
+
 template <typename Dtype>
 void CuDNNDeconvolutionLayer<Dtype>::Reshape(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
@@ -114,6 +257,9 @@ void CuDNNDeconvolutionLayer<Dtype>::Reshape(
   // planning strategy and a rewrite of Caffe's GPU memory mangagement
   size_t workspace_limit_bytes = 8*1024*1024;
 
+  bool select_algo = !shapes_ready_
+      || cudnn_shape_ != bottom[0]->shape();
+
   for (int i = 0; i < bottom.size(); i++) {
     cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i],
                                   this->num_,
@@ -141,90 +287,39 @@ void CuDNNDeconvolutionLayer<Dtype>::Reshape(
                                      stride_h,
                                      stride_w);
 
-    // choose forward and backward algorithms + workspace(s)
-    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
-        handle_[0],
-        top_descs_[i],
-        filter_desc_,
-        conv_descs_[i],
-        bottom_descs_[i],
-        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-        workspace_limit_bytes,
-        &fwd_algo_[i]));
-
-    // We have found that CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM is
-    // buggy. Thus, if this algo was chosen, choose winograd instead. If
-    // winograd is not supported or workspace is larger than threshold, choose
-    // implicit_gemm instead.
-    if (fwd_algo_[i] == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) {
-      size_t winograd_workspace_size;
-      cudnnStatus_t status = cudnnGetConvolutionForwardWorkspaceSize(
-          handle_[0],
-          top_descs_[i],
-          filter_desc_,
-          conv_descs_[i],
-          bottom_descs_[i],
-          CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
-          &winograd_workspace_size);
-      if (status != CUDNN_STATUS_SUCCESS ||
-          winograd_workspace_size >= workspace_limit_bytes) {
-        fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
-      } else {
-        fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD;
+    if (select_algo) {
+      findOptimalAlgorithm(i, workspace_limit_bytes);
+
+      // We have found that CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM is
+      // buggy. Thus, if this algo was chosen, choose winograd instead. If
+      // winograd is not supported or workspace is larger than threshold,
+      // choose implicit_gemm instead.
+      if (fwd_algo_[i] == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) {
+        size_t winograd_workspace_size;
+        cudnnStatus_t status = cudnnGetConvolutionForwardWorkspaceSize(
+            handle_[0],
+            top_descs_[i],
+            filter_desc_,
+            conv_descs_[i],
+            bottom_descs_[i],
+            CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
+            &winograd_workspace_size);
+        if (status != CUDNN_STATUS_SUCCESS ||
+            winograd_workspace_size >= workspace_limit_bytes) {
+          fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+        } else {
+          fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD;
+        }
       }
-    }
-
-    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
-        handle_[0],
-        top_descs_[i],
-        filter_desc_,
-        conv_descs_[i],
-        bottom_descs_[i],
-        fwd_algo_[i],
-        &(workspace_fwd_sizes_[i])));
-
-    // choose backward algorithm for filter
-    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
-        handle_[0],
-        top_descs_[i],
-        bottom_descs_[i],
-        conv_descs_[i],
-        filter_desc_,
-        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-        workspace_limit_bytes,
-        &bwd_filter_algo_[i]));
-
-    // get workspace for backwards filter algorithm
-    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
-        handle_[0],
-        top_descs_[i],
-        bottom_descs_[i],
-        conv_descs_[i],
-        filter_desc_,
-        bwd_filter_algo_[i],
-        &workspace_bwd_filter_sizes_[i]));
-
-    // choose backward algo for data
-    CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
-        handle_[0],
-        filter_desc_,
-        bottom_descs_[i],
-        conv_descs_[i],
-        top_descs_[i],
-        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-        workspace_limit_bytes,
-        &bwd_data_algo_[i]));
+      getWorkSpaces(i);
+    }  // end if select algo
+  }
 
-    // get workspace size
-    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
-        handle_[0],
-        filter_desc_,
-        bottom_descs_[i],
-        conv_descs_[i],
-        top_descs_[i],
-        bwd_data_algo_[i],
-        &workspace_bwd_data_sizes_[i]));
+  if (select_algo) {
+    cudnn_shape_ = bottom[0]->shape();
   }
+  shapes_ready_ = true;
+
 
   // reduce over all workspace sizes to get a maximum to allocate / reallocate
   size_t total_workspace_fwd = 0;
@@ -243,6 +338,11 @@ void CuDNNDeconvolutionLayer<Dtype>::Reshape(
   size_t max_workspace = std::max(total_workspace_fwd,
                                   total_workspace_bwd_data);
   max_workspace = std::max(max_workspace, total_workspace_bwd_filter);
+  // ensure alignment
+  const size_t alignment_workspace = 128;
+  max_workspace = (max_workspace + alignment_workspace - 1) /
+      alignment_workspace * alignment_workspace;
+
   // ensure all groups have enough workspace
   size_t total_max_workspace = max_workspace *
                                (this->group_ * CUDNN_STREAMS_PER_GROUP);