
Commit

Re-evaluating device info when getting closer to zero free memory
drnikolaev committed May 31, 2016
1 parent c12b7ad commit 9141deb
Showing 4 changed files with 55 additions and 39 deletions.
20 changes: 16 additions & 4 deletions include/caffe/util/gpu_memory.hpp
@@ -58,14 +58,22 @@ class GPUMemoryManager {
   }
 
   // Memory allocation/release
-  void reserve(size_t size) {
+  bool try_reserve(size_t size) {
+    bool status = true;
     if (size > size_) {
       if (ptr_) {
         GPUMemoryManager::deallocate(ptr_, stream_);
       }
-      GPUMemoryManager::allocate(&ptr_, size, stream_);
-      size_ = size;
+      status = GPUMemoryManager::try_allocate(&ptr_, size, stream_);
+      if (status) {
+        size_ = size;
+      }
     }
+    return status;
+  }
+
+  void reserve(size_t size) {
+    CHECK(try_reserve(size));
   }
 
   /*
@@ -111,8 +119,12 @@ class GPUMemoryManager {
 
  public:
   typedef void* pointer;
-  static void allocate(pointer* ptr, size_t size, cudaStream_t stream =
+  static bool try_allocate(pointer* ptr, size_t size, cudaStream_t stream =
       cudaStreamDefault);
+  static void allocate(pointer* ptr, size_t size, cudaStream_t stream =
+      cudaStreamDefault) {
+    CHECK(try_allocate(ptr, size, stream));
+  }
   static void deallocate(pointer ptr, cudaStream_t = cudaStreamDefault);
   static void GetInfo(size_t* free_mem, size_t* used_mem);
 
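The header change above splits the old hard-failing reserve() into try_reserve(), which reports failure to the caller, plus a thin CHECK-ing wrapper that keeps the old fail-fast behavior. A minimal host-side sketch of the same pattern, with std::malloc standing in for GPUMemoryManager::try_allocate and the Workspace class name invented for illustration:

    #include <cassert>
    #include <cstddef>
    #include <cstdlib>

    class Workspace {
     public:
      // Attempt to grow the buffer; report failure instead of aborting.
      bool try_reserve(std::size_t size) {
        bool status = true;
        if (size > size_) {
          if (ptr_ != nullptr) {
            std::free(ptr_);         // drop the old, smaller block first
            ptr_ = nullptr;
            size_ = 0;
          }
          ptr_ = std::malloc(size);  // stand-in for GPUMemoryManager::try_allocate
          status = (ptr_ != nullptr);
          if (status) {
            size_ = size;
          }
        }
        return status;
      }

      // Hard-failing wrapper, mirroring reserve() == CHECK(try_reserve(size)).
      void reserve(std::size_t size) {
        bool ok = try_reserve(size);
        assert(ok && "workspace allocation failed");
        (void)ok;  // keep unused-variable warnings quiet under NDEBUG
      }

      ~Workspace() { std::free(ptr_); }

     private:
      void* ptr_ = nullptr;
      std::size_t size_ = 0;
    };

    int main() {
      Workspace ws;
      if (!ws.try_reserve(1 << 20)) {
        return 1;  // recoverable: the caller may shrink the request and retry
      }
      ws.reserve(1 << 10);  // already satisfied (size_ >= size), so a no-op
      return 0;
    }

The point of the split is that callers able to re-plan, such as the cuDNN layers below, get a recoverable failure, while every existing reserve() call site keeps its fail-fast semantics.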
29 changes: 11 additions & 18 deletions src/caffe/layers/cudnn_conv_layer.cpp
@@ -116,34 +116,18 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
         filter_desc_, pad_h, pad_w, stride_h, stride_w);
 
     // Have to pass full fwd/bwd cycle before taking the rest of memory
-    if (backward_passed_ctr_ > 1) {
+    if (backward_passed_ctr_ > 0) {
       // choose forward and backward algorithms + workspace(s)
       CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(Caffe::cudnn_handle(),
         bottom_descs_[i], filter_desc_, conv_descs_[i], top_descs_[i],
         CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
         workspace_limit_bytes, &fwd_algo_[i]));
-    }
-
-    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(Caffe::cudnn_handle(),
-      bottom_descs_[i], filter_desc_, conv_descs_[i], top_descs_[i],
-      fwd_algo_[i], &(workspace_fwd_sizes_[i])));
-
-    if (backward_passed_ctr_ > 1) {
       // choose backward algorithm for filter
       CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
         Caffe::cudnn_handle(),
         bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
         CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
         workspace_limit_bytes, &bwd_filter_algo_[i]));
-    }
-
-    // get workspace for backwards filter algorithm
-    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
-      Caffe::cudnn_handle(),
-      bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
-      bwd_filter_algo_[i], &workspace_bwd_filter_sizes_[i]));
-
-    if (backward_passed_ctr_ > 1) {
       // choose backward algo for data
       CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
         Caffe::cudnn_handle(),
@@ -152,7 +136,16 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
         workspace_limit_bytes, &bwd_data_algo_[i]));
     }
 
-    // get workspace size
+    // get workspace size for forward algorithm
+    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(Caffe::cudnn_handle(),
+      bottom_descs_[i], filter_desc_, conv_descs_[i], top_descs_[i],
+      fwd_algo_[i], &(workspace_fwd_sizes_[i])));
+    // get workspace size for backward filter algorithm
+    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
+      Caffe::cudnn_handle(),
+      bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
+      bwd_filter_algo_[i], &workspace_bwd_filter_sizes_[i]));
+    // get workspace size for backward data algorithm
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
       Caffe::cudnn_handle(),
       filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
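The restructure above folds the three separately guarded cudnnGet*Algorithm calls into a single if (backward_passed_ctr_ > 0) block and then queries all three workspace sizes unconditionally, so the cached sizes always match whatever algorithms are currently selected, whether they were re-picked on this pass or kept from an earlier one. A condensed, hypothetical sketch of that control flow; ConvPlan, pick_algo and ws_for are invented stand-ins for the cuDNN queries, not Caffe code:

    #include <cstddef>

    struct ConvPlan {
      int fwd_algo = 0, bwd_filter_algo = 0, bwd_data_algo = 0;
      std::size_t fwd_ws = 0, bwd_filter_ws = 0, bwd_data_ws = 0;
    };

    // Stand-ins for cudnnGetConvolution*Algorithm / *WorkspaceSize.
    static int pick_algo(std::size_t limit) { return limit > (1u << 20) ? 1 : 0; }
    static std::size_t ws_for(int algo) { return algo == 1 ? (1u << 20) : (1u << 12); }

    void reshape(ConvPlan& p, std::size_t workspace_limit, int backward_passed_ctr) {
      // Re-pick algorithms only after a full fwd/bwd cycle has shown how much
      // memory the rest of the net needs.
      if (backward_passed_ctr > 0) {
        p.fwd_algo = pick_algo(workspace_limit);
        p.bwd_filter_algo = pick_algo(workspace_limit);
        p.bwd_data_algo = pick_algo(workspace_limit);
      }
      // Size queries run unconditionally: they are cheap, and they can never
      // go stale relative to the currently selected algorithms.
      p.fwd_ws = ws_for(p.fwd_algo);
      p.bwd_filter_ws = ws_for(p.bwd_filter_algo);
      p.bwd_data_ws = ws_for(p.bwd_data_algo);
    }

    int main() {
      ConvPlan p;
      reshape(p, 2u << 20, /*backward_passed_ctr=*/1);
      return p.fwd_ws == (1u << 20) ? 0 : 1;
    }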
19 changes: 16 additions & 3 deletions src/caffe/layers/cudnn_conv_layer.cu
@@ -26,7 +26,13 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
     if (workspace_fwd_sizes_[i] > workspace_limit_bytes) {
       this->Reshape(bottom, top);
     }
-    workspace.reserve(workspace_fwd_sizes_[i]);
+    // Sometimes closer to zero we might have memory info diverged from reality
+    // If try_reserve fails, it updates the info internally and we proceed with
+    // Reshape one more time
+    if (!workspace.try_reserve(workspace_fwd_sizes_[i])) {
+      this->Reshape(bottom, top);
+      workspace.reserve(workspace_fwd_sizes_[i]);
+    }
 
     // Forward through cuDNN in parallel over groups.
     for (int g = 0; g < this->group_; g++) {
@@ -85,8 +91,15 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     }
     // To remove pressure on allocator, allocate the larger of the
     // workspaces needed for the following steps
-    workspace.reserve(std::max(workspace_bwd_filter_sizes_[i],
-        workspace_bwd_data_sizes_[i]));
+    // Sometimes closer to zero we might have memory info diverged from reality
+    // If try_reserve fails, it updates the info internally and we proceed with
+    // Reshape one more time
+    if (!workspace.try_reserve(std::max(workspace_bwd_filter_sizes_[i],
+        workspace_bwd_data_sizes_[i]))) {
+      this->Reshape(bottom, top);
+      workspace.reserve(std::max(workspace_bwd_filter_sizes_[i],
+          workspace_bwd_data_sizes_[i]));
+    }
 
     // Backward through cuDNN in parallel over groups and gradients.
     for (int g = 0; g < this->group_; g++) {
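Both call sites now follow the same recover-then-insist sequence: a failed try_reserve() refreshes the allocator's view of device memory, Reshape() re-plans against the corrected numbers, and only a second failure aborts. A self-contained toy model of that flow; pool_free and requested are invented stand-ins for the allocator's cached device info and the layer's workspace demand:

    #include <cstddef>
    #include <cstdio>
    #include <cstdlib>

    static std::size_t pool_free = 4096;  // what the allocator believes is free
    static std::size_t requested = 8192;  // workspace the layer currently wants

    // Analogue of Workspace::try_reserve(): a failure also refreshes the
    // cached device info (here we simply learn a corrected figure).
    static bool try_reserve(std::size_t bytes) {
      if (bytes > pool_free) {
        pool_free = 2048;  // device info re-evaluated after the failure
        return false;
      }
      pool_free -= bytes;
      return true;
    }

    // Analogue of reserve(), i.e. CHECK(try_reserve(...)): failure is fatal.
    static void reserve(std::size_t bytes) {
      if (!try_reserve(bytes)) {
        std::fprintf(stderr, "out of memory: %zu bytes\n", bytes);
        std::abort();
      }
    }

    // Analogue of Reshape(): with fresh numbers, pick a cheaper algorithm.
    static void reshape() { requested = 1024; }

    int main() {
      if (!try_reserve(requested)) {  // near-zero free memory: may fail once
        reshape();                    // re-plan against the refreshed info
        reserve(requested);           // must fit now, or abort loudly
      }
      std::printf("reserved %zu bytes, %zu left\n", requested, pool_free);
      return 0;
    }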
26 changes: 12 additions & 14 deletions src/caffe/util/gpu_memory.cpp
@@ -56,19 +56,20 @@ void GPUMemoryManager::destroy() {
   mode_ = NO_POOL;
 }
 
-void GPUMemoryManager::allocate(void** ptr, size_t size, cudaStream_t stream) {
+bool GPUMemoryManager::try_allocate(void** ptr, size_t size, cudaStream_t stream) {
   CHECK((ptr) != NULL);
+  cudaError_t status = cudaSuccess, last_err = cudaSuccess;
   switch (mode_) {
   case CUB_POOL:
-    if (cub_allocator->DeviceAllocate(ptr, size, stream) != cudaSuccess) {
+    // Clean Cache & Retry logic is inside now
+    status = cub_allocator->DeviceAllocate(ptr, size, stream);
+    // If there was a retry and it succeeded we get good status here but
+    // we need to clean up last error...
+    last_err = cudaGetLastError();
+    // ...and update the dev info if something was wrong
+    if (status != cudaSuccess || last_err != cudaSuccess) {
       int cur_device;
       CUDA_CHECK(cudaGetDevice(&cur_device));
-      // free all cached memory (for all devices), synchrionize
-      cudaDeviceSynchronize();
-      cudaThreadSynchronize();
-      cub_allocator->FreeAllCached();
-      cudaDeviceSynchronize();
-      cudaThreadSynchronize();
       // Refresh per-device saved values.
       for (int i = 0; i < dev_info_.size(); ++i) {
         // only query devices that were initialized
@@ -80,16 +81,13 @@ void GPUMemoryManager::allocate(void** ptr, size_t size, cudaStream_t stream) {
           }
         }
       }
-      // Retry once
-      CUDA_CHECK(cub_allocator->DeviceAllocate(ptr, size, stream));
     }
-    // If retry succeeds we need to clean up last error
-    cudaGetLastError();
     break;
   default:
-    CUDA_CHECK(cudaMalloc(ptr, size));
+    status = cudaMalloc(ptr, size);
     break;
   }
+  return status == cudaSuccess;
 }
 
 void GPUMemoryManager::deallocate(void* ptr, cudaStream_t stream) {
@@ -172,7 +170,7 @@ void GPUMemoryManager::GetInfo(size_t* free_mem, size_t* total_mem) {
   CUDA_CHECK(cudaGetDevice(&cur_device));
   *total_mem = dev_info_[cur_device].total_;
   // Free memory is initial free memory minus outstanding allocations.
-  // Assuming we only allocate via GPUMemoryManager since its constructon.
+  // Assuming we only allocate via GPUMemoryManager since its construction.
   *free_mem = dev_info_[cur_device].free_ -
       cub_allocator->cached_bytes[cur_device].live;
   break;
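One subtlety in try_allocate() above: when cub's DeviceAllocate() recovers internally (it now frees cached blocks and retries on its own), the failed first attempt can leave a sticky error in the CUDA runtime that a later, unrelated CUDA_CHECK would report. cudaGetLastError() both returns and resets that per-thread state, which is why the new code calls it right after every allocation attempt. A small standalone illustration of the reset behavior; try_alloc is an invented helper, not the manager's API:

    #include <cstdio>
    #include <cuda_runtime.h>

    // Attempt an allocation and leave the runtime's error state clean either
    // way, so later status checks don't inherit a stale allocation error.
    static bool try_alloc(void** ptr, size_t size) {
      cudaError_t status = cudaMalloc(ptr, size);
      cudaError_t last_err = cudaGetLastError();  // returns AND resets the error
      return status == cudaSuccess && last_err == cudaSuccess;
    }

    int main() {
      void* p = nullptr;
      if (!try_alloc(&p, size_t(1) << 42)) {  // absurd request: expected to fail
        std::puts("allocation failed; error state already cleared");
      }
      if (try_alloc(&p, 1 << 20)) {           // modest request: should succeed
        std::puts("second allocation succeeded with a clean slate");
        cudaFree(p);
      }
      return 0;
    }

This is also why the explicit FreeAllCached()-and-retry block could be deleted: the retry now lives inside the cub allocator, so the manager only needs to sanitize the error state and refresh its per-device bookkeeping.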
