Merge pull request #153 from drnikolaev/caffe-0.15-oom
Thanks @lukeyeager for reviewing the code. This PR will be followed by another one implementing a better memory distribution algorithm for the cuDNN Convolution Layer (as per our discussion).
drnikolaev committed Jun 1, 2016
2 parents cf75318 + 1d4882b commit d988833
Showing 3 changed files with 45 additions and 21 deletions.
20 changes: 16 additions & 4 deletions include/caffe/util/gpu_memory.hpp
@@ -58,14 +58,22 @@ class GPUMemoryManager {
}

// Memory allocation/release
void reserve(size_t size) {
bool try_reserve(size_t size) {
bool status = true;
if (size > size_) {
if (ptr_) {
GPUMemoryManager::deallocate(ptr_, stream_);
}
GPUMemoryManager::allocate(&ptr_, size, stream_);
size_ = size;
status = GPUMemoryManager::try_allocate(&ptr_, size, stream_);
if (status) {
size_ = size;
}
}
return status;
}

void reserve(size_t size) {
CHECK(try_reserve(size));
}

/*
@@ -111,8 +119,12 @@ class GPUMemoryManager {

public:
typedef void* pointer;
static void allocate(pointer* ptr, size_t size, cudaStream_t stream =
static bool try_allocate(pointer* ptr, size_t size, cudaStream_t stream =
cudaStreamDefault);
static void allocate(pointer* ptr, size_t size, cudaStream_t stream =
cudaStreamDefault) {
CHECK(try_allocate(ptr, size, stream));
}
static void deallocate(pointer ptr, cudaStream_t = cudaStreamDefault);
static void GetInfo(size_t* free_mem, size_t* used_mem);

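The new header keeps the fail-fast behavior of allocate()/reserve() (each simply CHECKs the corresponding try_* call) while letting callers handle an out-of-memory condition themselves. A minimal usage sketch of the static interface, not part of this commit — the 64 MB request and the halve-and-retry fallback are invented for illustration; the real caller reshapes the convolution layer instead, as shown in the next file:

// Sketch only: out-of-memory handling with the new try_allocate().
// Assumes #include "caffe/util/gpu_memory.hpp" and an initialized manager.
void* ptr = NULL;
size_t bytes = 64 * 1024 * 1024;
if (!GPUMemoryManager::try_allocate(&ptr, bytes)) {
  // try_allocate() returned false instead of aborting, so the request can be
  // shrunk and retried; allocate() would CHECK-fail on the same failure.
  bytes /= 2;
  GPUMemoryManager::allocate(&ptr, bytes);  // aborts via CHECK() on a real OOM
}
// ... use the buffer ...
GPUMemoryManager::deallocate(ptr);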
19 changes: 16 additions & 3 deletions src/caffe/layers/cudnn_conv_layer.cu
@@ -26,7 +26,13 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
if (workspace_fwd_sizes_[i] > workspace_limit_bytes) {
this->Reshape(bottom, top);
}
workspace.reserve(workspace_fwd_sizes_[i]);
// Sometimes closer to zero we might have memory info diverged from reality
// If try_reserve fails, it updates the info internally and we proceed with
// Reshape one more time
if (!workspace.try_reserve(workspace_fwd_sizes_[i])) {
this->Reshape(bottom, top);
workspace.reserve(workspace_fwd_sizes_[i]);
}

// Forward through cuDNN in parallel over groups.
for (int g = 0; g < this->group_; g++) {
@@ -85,8 +91,15 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
}
// To remove pressure on allocator, allocate the larger of the
// workspaces needed for the following steps
workspace.reserve(std::max(workspace_bwd_filter_sizes_[i],
workspace_bwd_data_sizes_[i]));
// Sometimes closer to zero we might have memory info diverged from reality
// If try_reserve fails, it updates the info internally and we proceed with
// Reshape one more time
if (!workspace.try_reserve(std::max(workspace_bwd_filter_sizes_[i],
workspace_bwd_data_sizes_[i]))) {
this->Reshape(bottom, top);
workspace.reserve(std::max(workspace_bwd_filter_sizes_[i],
workspace_bwd_data_sizes_[i]));
}

// Backward through cuDNN in parallel over groups and gradients.
for (int g = 0; g < this->group_; g++) {
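In both passes the pattern is the same: attempt the cheap reservation first; if it fails, try_reserve() has already refreshed the per-device memory bookkeeping, so Reshape() can pick algorithms and workspace sizes that fit the memory that is actually free, and the second reserve() only aborts on a genuine out-of-memory. A generic sketch of that pattern, not part of the diff — the helper name and callable parameters are invented; NVCaffe keeps the logic inline as shown above:

// Sketch: the reserve-or-reshape retry pattern factored into a helper.
template <typename Workspace, typename GetSize, typename Recompute>
void reserve_with_retry(Workspace& ws, GetSize get_size, Recompute recompute) {
  if (!ws.try_reserve(get_size())) {  // stale memory info may cause a failure
    recompute();                      // e.g. this->Reshape(bottom, top)
    ws.reserve(get_size());           // size re-read; CHECK-fails on a real OOM
  }
}

// Hypothetical call site mirroring Forward_gpu():
//   reserve_with_retry(workspace,
//       [&] { return workspace_fwd_sizes_[i]; },
//       [&] { this->Reshape(bottom, top); });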
27 changes: 13 additions & 14 deletions src/caffe/util/gpu_memory.cpp
@@ -56,19 +56,21 @@ void GPUMemoryManager::destroy() {
mode_ = NO_POOL;
}

void GPUMemoryManager::allocate(void** ptr, size_t size, cudaStream_t stream) {
bool GPUMemoryManager::try_allocate(void** ptr, size_t size,
cudaStream_t stream) {
CHECK((ptr) != NULL);
cudaError_t status = cudaSuccess, last_err = cudaSuccess;
switch (mode_) {
case CUB_POOL:
if (cub_allocator->DeviceAllocate(ptr, size, stream) != cudaSuccess) {
// Clean Cache & Retry logic is inside now
status = cub_allocator->DeviceAllocate(ptr, size, stream);
// If there was a retry and it succeeded we get good status here but
// we need to clean up last error...
last_err = cudaGetLastError();
// ...and update the dev info if something was wrong
if (status != cudaSuccess || last_err != cudaSuccess) {
int cur_device;
CUDA_CHECK(cudaGetDevice(&cur_device));
// free all cached memory (for all devices), synchrionize
cudaDeviceSynchronize();
cudaThreadSynchronize();
cub_allocator->FreeAllCached();
cudaDeviceSynchronize();
cudaThreadSynchronize();
// Refresh per-device saved values.
for (int i = 0; i < dev_info_.size(); ++i) {
// only query devices that were initialized
@@ -80,16 +82,13 @@ void GPUMemoryManager::allocate(void** ptr, size_t size, cudaStream_t stream) {
}
}
}
// Retry once
CUDA_CHECK(cub_allocator->DeviceAllocate(ptr, size, stream));
}
// If retry succeeds we need to clean up last error
cudaGetLastError();
break;
default:
CUDA_CHECK(cudaMalloc(ptr, size));
status = cudaMalloc(ptr, size);
break;
}
return status == cudaSuccess;
}

void GPUMemoryManager::deallocate(void* ptr, cudaStream_t stream) {
@@ -172,7 +171,7 @@ void GPUMemoryManager::GetInfo(size_t* free_mem, size_t* total_mem) {
CUDA_CHECK(cudaGetDevice(&cur_device));
*total_mem = dev_info_[cur_device].total_;
// Free memory is initial free memory minus outstanding allocations.
// Assuming we only allocate via GPUMemoryManager since its constructon.
// Assuming we only allocate via GPUMemoryManager since its construction.
*free_mem = dev_info_[cur_device].free_ -
cub_allocator->cached_bytes[cur_device].live;
break;
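With the pool enabled, GetInfo() does not query the driver on every call: in the CUB_POOL branch shown above it reports the free amount recorded when the device was initialized minus the bytes currently live in the CUB pool. That figure is exact only if every allocation since construction went through the manager (as the comment notes), which is how the "memory info diverged from reality" situation handled in the convolution layer can arise and why try_allocate() refreshes the saved values after a failure. A small worked illustration of that accounting, with made-up byte counts and field names taken from the code above:

// Sketch of GetInfo()'s accounting under CUB_POOL (values invented).
size_t initial_free  = 10ULL << 30;  // dev_info_[cur_device].free_ at init: 10 GiB
size_t live_in_pool  = 3ULL << 30;   // cub_allocator->cached_bytes[cur_device].live: 3 GiB
size_t reported_free = initial_free - live_in_pool;  // GetInfo() reports 7 GiB free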
