Merge pull request #153 from drnikolaev/caffe-0.15-oom
Thanks @lukeyeager for reviewing the code. This PR will be followed by another one implementing a better memory distribution algorithm for the cuDNN Convolution Layer (as per our discussion).
drnikolaev committed Jun 1, 2016
2 parents cf75318 + 1d4882b commit d988833
Showing 3 changed files with 45 additions and 21 deletions.
20 changes: 16 additions & 4 deletions include/caffe/util/gpu_memory.hpp
@@ -58,14 +58,22 @@ class GPUMemoryManager {
}

// Memory allocation/release
void reserve(size_t size) {
bool try_reserve(size_t size) {
bool status = true;
if (size > size_) {
if (ptr_) {
GPUMemoryManager::deallocate(ptr_, stream_);
}
GPUMemoryManager::allocate(&ptr_, size, stream_);
size_ = size;
status = GPUMemoryManager::try_allocate(&ptr_, size, stream_);
if (status) {
size_ = size;
}
}
return status;
}

void reserve(size_t size) {
CHECK(try_reserve(size));
}

/*
@@ -111,8 +119,12 @@ class GPUMemoryManager {

public:
typedef void* pointer;
static void allocate(pointer* ptr, size_t size, cudaStream_t stream =
static bool try_allocate(pointer* ptr, size_t size, cudaStream_t stream =
cudaStreamDefault);
static void allocate(pointer* ptr, size_t size, cudaStream_t stream =
cudaStreamDefault) {
CHECK(try_allocate(ptr, size, stream));
}
static void deallocate(pointer ptr, cudaStream_t = cudaStreamDefault);
static void GetInfo(size_t* free_mem, size_t* used_mem);

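The new header keeps the fail-fast behavior of allocate()/reserve() (each simply CHECKs the corresponding try_* call) while letting callers handle an out-of-memory condition themselves. A minimal usage sketch of the static interface, not part of this commit — the 64 MB request and the halve-and-retry fallback are invented for illustration; the real caller reshapes the convolution layer instead, as shown in the next file:

// Sketch only: out-of-memory handling with the new try_allocate().
// Assumes #include "caffe/util/gpu_memory.hpp" and an initialized manager.
void* ptr = NULL;
size_t bytes = 64 * 1024 * 1024;
if (!GPUMemoryManager::try_allocate(&ptr, bytes)) {
  // try_allocate() returned false instead of aborting, so the request can be
  // shrunk and retried; allocate() would CHECK-fail on the same failure.
  bytes /= 2;
  GPUMemoryManager::allocate(&ptr, bytes);  // aborts via CHECK() on a real OOM
}
// ... use the buffer ...
GPUMemoryManager::deallocate(ptr);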
19 changes: 16 additions & 3 deletions src/caffe/layers/cudnn_conv_layer.cu
@@ -26,7 +26,13 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
if (workspace_fwd_sizes_[i] > workspace_limit_bytes) {
this->Reshape(bottom, top);
}
workspace.reserve(workspace_fwd_sizes_[i]);
// Sometimes closer to zero we might have memory info diverged from reality
// If try_reserve fails, it updates the info internally and we proceed with
// Reshape one more time
if (!workspace.try_reserve(workspace_fwd_sizes_[i])) {
this->Reshape(bottom, top);
workspace.reserve(workspace_fwd_sizes_[i]);
}

// Forward through cuDNN in parallel over groups.
for (int g = 0; g < this->group_; g++) {
@@ -85,8 +91,15 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
}
// To remove pressure on allocator, allocate the larger of the
// workspaces needed for the following steps
workspace.reserve(std::max(workspace_bwd_filter_sizes_[i],
workspace_bwd_data_sizes_[i]));
// Sometimes closer to zero we might have memory info diverged from reality
// If try_reserve fails, it updates the info internally and we proceed with
// Reshape one more time
if (!workspace.try_reserve(std::max(workspace_bwd_filter_sizes_[i],
workspace_bwd_data_sizes_[i]))) {
this->Reshape(bottom, top);
workspace.reserve(std::max(workspace_bwd_filter_sizes_[i],
workspace_bwd_data_sizes_[i]));
}

// Backward through cuDNN in parallel over groups and gradients.
for (int g = 0; g < this->group_; g++) {
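In both passes the pattern is the same: attempt the cheap reservation first; if it fails, try_reserve() has already refreshed the per-device memory bookkeeping, so Reshape() can pick algorithms and workspace sizes that fit the memory that is actually free, and the second reserve() only aborts on a genuine out-of-memory. A generic sketch of that pattern, not part of the diff — the helper name and callable parameters are invented; NVCaffe keeps the logic inline as shown above:

// Sketch: the reserve-or-reshape retry pattern factored into a helper.
template <typename Workspace, typename GetSize, typename Recompute>
void reserve_with_retry(Workspace& ws, GetSize get_size, Recompute recompute) {
  if (!ws.try_reserve(get_size())) {  // stale memory info may cause a failure
    recompute();                      // e.g. this->Reshape(bottom, top)
    ws.reserve(get_size());           // size re-read; CHECK-fails on a real OOM
  }
}

// Hypothetical call site mirroring Forward_gpu():
//   reserve_with_retry(workspace,
//       [&] { return workspace_fwd_sizes_[i]; },
//       [&] { this->Reshape(bottom, top); });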
27 changes: 13 additions & 14 deletions src/caffe/util/gpu_memory.cpp
@@ -56,19 +56,21 @@ void GPUMemoryManager::destroy() {
mode_ = NO_POOL;
}

void GPUMemoryManager::allocate(void** ptr, size_t size, cudaStream_t stream) {
bool GPUMemoryManager::try_allocate(void** ptr, size_t size,
cudaStream_t stream) {
CHECK((ptr) != NULL);
cudaError_t status = cudaSuccess, last_err = cudaSuccess;
switch (mode_) {
case CUB_POOL:
if (cub_allocator->DeviceAllocate(ptr, size, stream) != cudaSuccess) {
// Clean Cache & Retry logic is inside now
status = cub_allocator->DeviceAllocate(ptr, size, stream);
// If there was a retry and it succeeded we get good status here but
// we need to clean up last error...
last_err = cudaGetLastError();
// ...and update the dev info if something was wrong
if (status != cudaSuccess || last_err != cudaSuccess) {
int cur_device;
CUDA_CHECK(cudaGetDevice(&cur_device));
// free all cached memory (for all devices), synchrionize
cudaDeviceSynchronize();
cudaThreadSynchronize();
cub_allocator->FreeAllCached();
cudaDeviceSynchronize();
cudaThreadSynchronize();
// Refresh per-device saved values.
for (int i = 0; i < dev_info_.size(); ++i) {
// only query devices that were initialized
@@ -80,16 +82,13 @@ void GPUMemoryManager::allocate(void** ptr, size_t size, cudaStream_t stream) {
}
}
}
// Retry once
CUDA_CHECK(cub_allocator->DeviceAllocate(ptr, size, stream));
}
// If retry succeeds we need to clean up last error
cudaGetLastError();
break;
default:
CUDA_CHECK(cudaMalloc(ptr, size));
status = cudaMalloc(ptr, size);
break;
}
return status == cudaSuccess;
}

void GPUMemoryManager::deallocate(void* ptr, cudaStream_t stream) {
@@ -172,7 +171,7 @@ void GPUMemoryManager::GetInfo(size_t* free_mem, size_t* total_mem) {
CUDA_CHECK(cudaGetDevice(&cur_device));
*total_mem = dev_info_[cur_device].total_;
// Free memory is initial free memory minus outstanding allocations.
// Assuming we only allocate via GPUMemoryManager since its constructon.
// Assuming we only allocate via GPUMemoryManager since its construction.
*free_mem = dev_info_[cur_device].free_ -
cub_allocator->cached_bytes[cur_device].live;
break;
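With the pool enabled, GetInfo() does not query the driver on every call: in the CUB_POOL branch shown above it reports the free amount recorded when the device was initialized minus the bytes currently live in the CUB pool. That figure is exact only if every allocation since construction went through the manager (as the comment notes), which is how the "memory info diverged from reality" situation handled in the convolution layer can arise and why try_allocate() refreshes the saved values after a failure. A small worked illustration of that accounting, with made-up byte counts and field names taken from the code above:

// Sketch of GetInfo()'s accounting under CUB_POOL (values invented).
size_t initial_free  = 10ULL << 30;  // dev_info_[cur_device].free_ at init: 10 GiB
size_t live_in_pool  = 3ULL << 30;   // cub_allocator->cached_bytes[cur_device].live: 3 GiB
size_t reported_free = initial_free - live_in_pool;  // GetInfo() reports 7 GiB free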
