Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for cuDNN 8 #7000

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions cmake/Cuda.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,11 @@ function(detect_cuDNN)
set(HAVE_CUDNN TRUE PARENT_SCOPE)
set(CUDNN_FOUND TRUE PARENT_SCOPE)

file(READ ${CUDNN_INCLUDE}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
if(EXISTS ${CUDNN_INCLUDE}/cudnn_version.h)
file(READ ${CUDNN_INCLUDE}/cudnn_version.h CUDNN_VERSION_FILE_CONTENTS)
else()
file(READ ${CUDNN_INCLUDE}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
endif()

# cuDNN v3 and beyond
string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
Expand All @@ -224,7 +228,7 @@ function(detect_cuDNN)

string(COMPARE LESS "${CUDNN_VERSION_MAJOR}" 3 cuDNNVersionIncompatible)
if(cuDNNVersionIncompatible)
message(FATAL_ERROR "cuDNN version >3 is required.")
message("cuDNN version >3 is required.")
endif()

set(CUDNN_VERSION "${CUDNN_VERSION}" PARENT_SCOPE)
Expand Down
9 changes: 8 additions & 1 deletion include/caffe/layers/cudnn_conv_layer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@ template <typename Dtype>
class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
public:
explicit CuDNNConvolutionLayer(const LayerParameter& param)
: ConvolutionLayer<Dtype>(param), handles_setup_(false) {}
: ConvolutionLayer<Dtype>(param),
handles_setup_(false),
shapes_ready_(false) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
Expand All @@ -43,7 +45,10 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

void findOptimalAlgorithm(int index, size_t workspace_limit_bytes);

bool handles_setup_;
bool shapes_ready_;
cudnnHandle_t* handle_;
cudaStream_t* stream_;

Expand All @@ -58,6 +63,8 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Dtype> {
vector<cudnnConvolutionDescriptor_t> conv_descs_;
int bottom_offset_, top_offset_, bias_offset_;

std::vector<int> cudnn_shape_;

size_t *workspace_fwd_sizes_;
size_t *workspace_bwd_data_sizes_;
size_t *workspace_bwd_filter_sizes_;
Expand Down
10 changes: 9 additions & 1 deletion include/caffe/layers/cudnn_deconv_layer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ template <typename Dtype>
class CuDNNDeconvolutionLayer : public DeconvolutionLayer<Dtype> {
public:
explicit CuDNNDeconvolutionLayer(const LayerParameter& param)
: DeconvolutionLayer<Dtype>(param), handles_setup_(false) {}
: DeconvolutionLayer<Dtype>(param),
handles_setup_(false),
shapes_ready_(false) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
Expand All @@ -39,7 +41,11 @@ class CuDNNDeconvolutionLayer : public DeconvolutionLayer<Dtype> {
const vector<bool>& propagate_down,
const vector<Blob<Dtype>*>& bottom);

void findOptimalAlgorithm(int index, size_t workspace_limit_bytes);
void getWorkSpaces(int index);

bool handles_setup_;
bool shapes_ready_;
cudnnHandle_t* handle_;
cudaStream_t* stream_;

Expand All @@ -54,6 +60,8 @@ class CuDNNDeconvolutionLayer : public DeconvolutionLayer<Dtype> {
vector<cudnnConvolutionDescriptor_t> conv_descs_;
int bottom_offset_, top_offset_, bias_offset_;

std::vector<int> cudnn_shape_;

size_t *workspace_fwd_sizes_;
size_t *workspace_bwd_data_sizes_;
size_t *workspace_bwd_filter_sizes_;
Expand Down
40 changes: 40 additions & 0 deletions include/caffe/util/cudnn.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
#ifdef USE_CUDNN

#include <cudnn.h>
#include <algorithm>
#include <vector>

#include "caffe/common.hpp"
#include "caffe/proto/caffe.pb.h"
Expand Down Expand Up @@ -50,6 +52,10 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
return "CUDNN_STATUS_RUNTIME_IN_PROGRESS";
case CUDNN_STATUS_RUNTIME_FP_OVERFLOW:
return "CUDNN_STATUS_RUNTIME_FP_OVERFLOW";
#endif
#if CUDNN_VERSION_MIN(8, 0, 0)
case CUDNN_STATUS_VERSION_MISMATCH:
return "CUDNN_STATUS_VERSION_MISMATCH";
#endif
}
return "Unknown cudnn status";
Expand Down Expand Up @@ -161,6 +167,40 @@ inline void createActivationDescriptor(cudnnActivationDescriptor_t* activ_desc,
CUDNN_PROPAGATE_NAN, Dtype(0)));
}

/// Return the first entry among the leading @p count elements of @p v whose
/// workspace requirement (`memory` field) fits within @p limit bytes.
/// If no entry fits, fall back to the best-rated entry v[0]; if the list is
/// empty, return a value-initialized T.
/// NOTE(review): the perf entries' cuDNN `status` field is not inspected
/// here — callers are assumed to pass only valid results; confirm.
template<typename T>
inline T findFirstSuitableAlgorithm(std::vector<T> const &v,
                                    size_t count, size_t limit) {
  const size_t n = std::min(count, v.size());
  const auto fits = [limit](T const &perf) { return perf.memory <= limit; };
  const auto end = v.begin() + n;
  const auto hit = std::find_if(v.begin(), end, fits);
  if (hit != end) {
    return *hit;
  }
  return v.empty() ? T() : v.front();
}

/// Check whether a reshaped blob shape is compatible with the shape the
/// convolution descriptors were configured for: the two shapes must have the
/// same rank (at least 2), every dimension after the batch dimension must
/// match exactly, and the batch dimension (index 0) may only stay equal or
/// shrink relative to the original.
inline bool areConvShapesCompatible(std::vector<int> const &src,
                                    std::vector<int> const &reshaped) {
  const size_t rank = src.size();
  if (rank != reshaped.size() || rank < 2) {
    return false;
  }
  // The batch size is only allowed to decrease (or stay the same).
  if (reshaped[0] > src[0]) {
    return false;
  }
  // All remaining dimensions must be identical.
  return std::equal(src.begin() + 1, src.end(), reshaped.begin() + 1);
}


} // namespace cudnn

} // namespace caffe
Expand Down
177 changes: 138 additions & 39 deletions src/caffe/layers/cudnn_conv_layer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,133 @@ void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
}

handles_setup_ = true;
shapes_ready_ = false;
}

#if CUDNN_VERSION_MIN(7, 0, 0)
// Using FindAlgorithm
// cuDNN 7+ path: enumerate every candidate algorithm via the *_v7 query API
// (cuDNN returns them sorted best-first) and keep the first candidate whose
// workspace requirement fits within workspace_limit_bytes.
template <typename Dtype>
void CuDNNConvolutionLayer<Dtype>::findOptimalAlgorithm(int index,
    size_t workspace_limit_bytes) {
  // Upper bound on how many candidates each query may return.
  int max_fwd = 1, max_bwd_filter = 1, max_bwd_data = 1;
  CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithmMaxCount(
      handle_[0], &max_fwd));
  CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
      handle_[0], &max_bwd_filter));
  CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(
      handle_[0], &max_bwd_data));

  std::vector<cudnnConvolutionFwdAlgoPerf_t>
      fwd_results(std::max(max_fwd, 1));
  std::vector<cudnnConvolutionBwdFilterAlgoPerf_t>
      bwd_filter_results(std::max(max_bwd_filter, 1));
  std::vector<cudnnConvolutionBwdDataAlgoPerf_t>
      bwd_data_results(std::max(max_bwd_data, 1));

  int returned = 0;

  // Forward: query candidates, then take the first that fits the limit.
  CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(handle_[0],
      bottom_descs_[index],
      filter_desc_,
      conv_descs_[index],
      top_descs_[index],
      fwd_results.size(), &returned, &fwd_results[0]));
  const cudnnConvolutionFwdAlgoPerf_t fwd_choice =
      cudnn::findFirstSuitableAlgorithm(fwd_results, returned,
                                        workspace_limit_bytes);
  fwd_algo_[index] = fwd_choice.algo;
  workspace_fwd_sizes_[index] = fwd_choice.memory;

  // Backward w.r.t. the filter (weight gradient).
  CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm_v7(handle_[0],
      bottom_descs_[index],
      top_descs_[index],
      conv_descs_[index],
      filter_desc_,
      bwd_filter_results.size(), &returned, &bwd_filter_results[0]));
  const cudnnConvolutionBwdFilterAlgoPerf_t bwd_filter_choice =
      cudnn::findFirstSuitableAlgorithm(bwd_filter_results, returned,
                                        workspace_limit_bytes);
  bwd_filter_algo_[index] = bwd_filter_choice.algo;
  workspace_bwd_filter_sizes_[index] = bwd_filter_choice.memory;

  // Backward w.r.t. the data (input gradient).
  CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm_v7(handle_[0],
      filter_desc_,
      top_descs_[index],
      conv_descs_[index],
      bottom_descs_[index],
      bwd_data_results.size(), &returned, &bwd_data_results[0]));
  const cudnnConvolutionBwdDataAlgoPerf_t bwd_data_choice =
      cudnn::findFirstSuitableAlgorithm(bwd_data_results, returned,
                                        workspace_limit_bytes);
  bwd_data_algo_[index] = bwd_data_choice.algo;
  workspace_bwd_data_sizes_[index] = bwd_data_choice.memory;
}
#else

// Pre-cuDNN-7 path: ask cuDNN directly for one algorithm per direction under
// an explicit workspace byte limit, then query the workspace size each
// selected algorithm actually needs. Results are stored into the per-bottom
// algorithm and workspace-size member arrays at position `index`.
template <typename Dtype>
void CuDNNConvolutionLayer<Dtype>::findOptimalAlgorithm(int index,
size_t workspace_limit_bytes) {
// choose forward and backward algorithms + workspace(s)
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[0],
bottom_descs_[index],
filter_desc_,
conv_descs_[index],
top_descs_[index],
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_limit_bytes,
&fwd_algo_[index]));

// choose backward algorithm for filter
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(handle_[0],
bottom_descs_[index], top_descs_[index], conv_descs_[index],
filter_desc_,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
workspace_limit_bytes, &bwd_filter_algo_[index]) );

// choose backward algo for data
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(handle_[0],
filter_desc_, top_descs_[index], conv_descs_[index],
bottom_descs_[index],
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_limit_bytes, &bwd_data_algo_[index]));

// get workspace size for the chosen forward algorithm
CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle_[0],
bottom_descs_[index],
filter_desc_,
conv_descs_[index],
top_descs_[index],
fwd_algo_[index],
&(workspace_fwd_sizes_[index])));

// get workspace for backwards filter algorithm
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(handle_[0],
bottom_descs_[index], top_descs_[index], conv_descs_[index],
filter_desc_, bwd_filter_algo_[index],
&workspace_bwd_filter_sizes_[index]));


// get workspace size for the chosen backward-data algorithm
CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(handle_[0],
filter_desc_, top_descs_[index], conv_descs_[index], bottom_descs_[index],
bwd_data_algo_[index], &workspace_bwd_data_sizes_[index]) );
}
#endif


template <typename Dtype>
void CuDNNConvolutionLayer<Dtype>::Reshape(
const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
ConvolutionLayer<Dtype>::Reshape(bottom, top);

CHECK_EQ(2, this->num_spatial_axes_)
<< "CuDNNConvolution input must have 2 spatial axes "
<< "(e.g., height and width). "
Expand All @@ -112,6 +233,9 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
// planning strategy and a rewrite of Caffe's GPU memory mangagement
size_t workspace_limit_bytes = 8*1024*1024;

bool select_algo = !shapes_ready_
|| cudnn_shape_ != bottom[0]->shape();

for (int i = 0; i < bottom.size(); i++) {
cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i],
this->num_,
Expand All @@ -127,46 +251,14 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
filter_desc_, pad_h, pad_w,
stride_h, stride_w);

// choose forward and backward algorithms + workspace(s)
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[0],
bottom_descs_[i],
filter_desc_,
conv_descs_[i],
top_descs_[i],
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_limit_bytes,
&fwd_algo_[i]));

CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle_[0],
bottom_descs_[i],
filter_desc_,
conv_descs_[i],
top_descs_[i],
fwd_algo_[i],
&(workspace_fwd_sizes_[i])));

// choose backward algorithm for filter
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(handle_[0],
bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
workspace_limit_bytes, &bwd_filter_algo_[i]) );

// get workspace for backwards filter algorithm
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(handle_[0],
bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
bwd_filter_algo_[i], &workspace_bwd_filter_sizes_[i]));

// choose backward algo for data
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(handle_[0],
filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_limit_bytes, &bwd_data_algo_[i]));

// get workspace size
CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(handle_[0],
filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
bwd_data_algo_[i], &workspace_bwd_data_sizes_[i]) );
if (select_algo) {
findOptimalAlgorithm(i, workspace_limit_bytes);
}
}
if (select_algo) {
cudnn_shape_ = bottom[0]->shape();
}
shapes_ready_ = true;

// reduce over all workspace sizes to get a maximum to allocate / reallocate
size_t total_workspace_fwd = 0;
Expand All @@ -185,15 +277,22 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
size_t max_workspace = std::max(total_workspace_fwd,
total_workspace_bwd_data);
max_workspace = std::max(max_workspace, total_workspace_bwd_filter);
// ensure alignment
const size_t aligmnent_workspace = 128;
max_workspace = (max_workspace + aligmnent_workspace - 1)
/ aligmnent_workspace * aligmnent_workspace;
// ensure all groups have enough workspace
size_t total_max_workspace = max_workspace *
(this->group_ * CUDNN_STREAMS_PER_GROUP);

// this is the total amount of storage needed over all groups + streams
if (total_max_workspace > workspaceSizeInBytes) {
DLOG(INFO) << "Reallocating workspace storage: " << total_max_workspace;
static size_t total_ws_global = 0;
total_ws_global += total_max_workspace - workspaceSizeInBytes;
workspaceSizeInBytes = total_max_workspace;


// free the existing workspace and allocate a new (larger) one
cudaFree(this->workspaceData);

Expand Down
Loading