
Commit

Re-evaluating device info when getting closer to zero free memory
drnikolaev committed May 31, 2016
1 parent c12b7ad commit 9141deb
Showing 4 changed files with 55 additions and 39 deletions.
20 changes: 16 additions & 4 deletions include/caffe/util/gpu_memory.hpp
@@ -58,14 +58,22 @@ class GPUMemoryManager {
   }
 
   // Memory allocation/release
-  void reserve(size_t size) {
+  bool try_reserve(size_t size) {
+    bool status = true;
     if (size > size_) {
       if (ptr_) {
         GPUMemoryManager::deallocate(ptr_, stream_);
       }
-      GPUMemoryManager::allocate(&ptr_, size, stream_);
-      size_ = size;
+      status = GPUMemoryManager::try_allocate(&ptr_, size, stream_);
+      if (status) {
+        size_ = size;
+      }
     }
+    return status;
+  }
+
+  void reserve(size_t size) {
+    CHECK(try_reserve(size));
   }
 
   /*
@@ -111,8 +119,12 @@ class GPUMemoryManager {
 
  public:
   typedef void* pointer;
-  static void allocate(pointer* ptr, size_t size, cudaStream_t stream =
+  static bool try_allocate(pointer* ptr, size_t size, cudaStream_t stream =
       cudaStreamDefault);
+  static void allocate(pointer* ptr, size_t size, cudaStream_t stream =
+      cudaStreamDefault) {
+    CHECK(try_allocate(ptr, size, stream));
+  }
   static void deallocate(pointer ptr, cudaStream_t = cudaStreamDefault);
   static void GetInfo(size_t* free_mem, size_t* used_mem);
 
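The header change above splits the old hard-failing reserve() into try_reserve(), which reports failure to the caller, plus a thin CHECK-ing wrapper that keeps the old fail-fast behavior. A minimal host-side sketch of the same pattern, with std::malloc standing in for GPUMemoryManager::try_allocate and the Workspace class name invented for illustration:

    #include <cassert>
    #include <cstddef>
    #include <cstdlib>

    class Workspace {
     public:
      // Attempt to grow the buffer; report failure instead of aborting.
      bool try_reserve(std::size_t size) {
        bool status = true;
        if (size > size_) {
          if (ptr_ != nullptr) {
            std::free(ptr_);         // drop the old, smaller block first
            ptr_ = nullptr;
            size_ = 0;
          }
          ptr_ = std::malloc(size);  // stand-in for GPUMemoryManager::try_allocate
          status = (ptr_ != nullptr);
          if (status) {
            size_ = size;
          }
        }
        return status;
      }

      // Hard-failing wrapper, mirroring reserve() == CHECK(try_reserve(size)).
      void reserve(std::size_t size) {
        bool ok = try_reserve(size);
        assert(ok && "workspace allocation failed");
        (void)ok;  // keep unused-variable warnings quiet under NDEBUG
      }

      ~Workspace() { std::free(ptr_); }

     private:
      void* ptr_ = nullptr;
      std::size_t size_ = 0;
    };

    int main() {
      Workspace ws;
      if (!ws.try_reserve(1 << 20)) {
        return 1;  // recoverable: the caller may shrink the request and retry
      }
      ws.reserve(1 << 10);  // already satisfied (size_ >= size), so a no-op
      return 0;
    }

The point of the split is that callers able to re-plan, such as the cuDNN layers below, get a recoverable failure, while every existing reserve() call site keeps its fail-fast semantics.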
29 changes: 11 additions & 18 deletions src/caffe/layers/cudnn_conv_layer.cpp
@@ -116,34 +116,18 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
         filter_desc_, pad_h, pad_w, stride_h, stride_w);
 
     // Have to pass full fwd/bwd cycle before taking the rest of memory
-    if (backward_passed_ctr_ > 1) {
+    if (backward_passed_ctr_ > 0) {
       // choose forward and backward algorithms + workspace(s)
       CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(Caffe::cudnn_handle(),
         bottom_descs_[i], filter_desc_, conv_descs_[i], top_descs_[i],
         CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
         workspace_limit_bytes, &fwd_algo_[i]));
-    }
-
-    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(Caffe::cudnn_handle(),
-      bottom_descs_[i], filter_desc_, conv_descs_[i], top_descs_[i],
-      fwd_algo_[i], &(workspace_fwd_sizes_[i])));
-
-    if (backward_passed_ctr_ > 1) {
       // choose backward algorithm for filter
       CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
         Caffe::cudnn_handle(),
         bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
         CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
         workspace_limit_bytes, &bwd_filter_algo_[i]));
-    }
-
-    // get workspace for backwards filter algorithm
-    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
-      Caffe::cudnn_handle(),
-      bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
-      bwd_filter_algo_[i], &workspace_bwd_filter_sizes_[i]));
-
-    if (backward_passed_ctr_ > 1) {
       // choose backward algo for data
       CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
         Caffe::cudnn_handle(),
@@ -152,7 +136,16 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
         workspace_limit_bytes, &bwd_data_algo_[i]));
     }
 
-    // get workspace size
+    // get workspace size for forward algorithm
+    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(Caffe::cudnn_handle(),
+      bottom_descs_[i], filter_desc_, conv_descs_[i], top_descs_[i],
+      fwd_algo_[i], &(workspace_fwd_sizes_[i])));
+    // get workspace size for backward filter algorithm
+    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
+      Caffe::cudnn_handle(),
+      bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
+      bwd_filter_algo_[i], &workspace_bwd_filter_sizes_[i]));
+    // get workspace size for backward data algorithm
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
       Caffe::cudnn_handle(),
       filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
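The restructure above folds the three separately guarded cudnnGet*Algorithm calls into a single if (backward_passed_ctr_ > 0) block and then queries all three workspace sizes unconditionally, so the cached sizes always match whatever algorithms are currently selected, whether they were re-picked on this pass or kept from an earlier one. A condensed, hypothetical sketch of that control flow; ConvPlan, pick_algo and ws_for are invented stand-ins for the cuDNN queries, not Caffe code:

    #include <cstddef>

    struct ConvPlan {
      int fwd_algo = 0, bwd_filter_algo = 0, bwd_data_algo = 0;
      std::size_t fwd_ws = 0, bwd_filter_ws = 0, bwd_data_ws = 0;
    };

    // Stand-ins for cudnnGetConvolution*Algorithm / *WorkspaceSize.
    static int pick_algo(std::size_t limit) { return limit > (1u << 20) ? 1 : 0; }
    static std::size_t ws_for(int algo) { return algo == 1 ? (1u << 20) : (1u << 12); }

    void reshape(ConvPlan& p, std::size_t workspace_limit, int backward_passed_ctr) {
      // Re-pick algorithms only after a full fwd/bwd cycle has shown how much
      // memory the rest of the net needs.
      if (backward_passed_ctr > 0) {
        p.fwd_algo = pick_algo(workspace_limit);
        p.bwd_filter_algo = pick_algo(workspace_limit);
        p.bwd_data_algo = pick_algo(workspace_limit);
      }
      // Size queries run unconditionally: they are cheap, and they can never
      // go stale relative to the currently selected algorithms.
      p.fwd_ws = ws_for(p.fwd_algo);
      p.bwd_filter_ws = ws_for(p.bwd_filter_algo);
      p.bwd_data_ws = ws_for(p.bwd_data_algo);
    }

    int main() {
      ConvPlan p;
      reshape(p, 2u << 20, /*backward_passed_ctr=*/1);
      return p.fwd_ws == (1u << 20) ? 0 : 1;
    }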
19 changes: 16 additions & 3 deletions src/caffe/layers/cudnn_conv_layer.cu
@@ -26,7 +26,13 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
     if (workspace_fwd_sizes_[i] > workspace_limit_bytes) {
       this->Reshape(bottom, top);
     }
-    workspace.reserve(workspace_fwd_sizes_[i]);
+    // Sometimes closer to zero we might have memory info diverged from reality
+    // If try_reserve fails, it updates the info internally and we proceed with
+    // Reshape one more time
+    if (!workspace.try_reserve(workspace_fwd_sizes_[i])) {
+      this->Reshape(bottom, top);
+      workspace.reserve(workspace_fwd_sizes_[i]);
+    }
 
     // Forward through cuDNN in parallel over groups.
     for (int g = 0; g < this->group_; g++) {
@@ -85,8 +91,15 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     }
     // To remove pressure on allocator, allocate the larger of the
     // workspaces needed for the following steps
-    workspace.reserve(std::max(workspace_bwd_filter_sizes_[i],
-        workspace_bwd_data_sizes_[i]));
+    // Sometimes closer to zero we might have memory info diverged from reality
+    // If try_reserve fails, it updates the info internally and we proceed with
+    // Reshape one more time
+    if (!workspace.try_reserve(std::max(workspace_bwd_filter_sizes_[i],
+        workspace_bwd_data_sizes_[i]))) {
+      this->Reshape(bottom, top);
+      workspace.reserve(std::max(workspace_bwd_filter_sizes_[i],
+          workspace_bwd_data_sizes_[i]));
+    }
 
     // Backward through cuDNN in parallel over groups and gradients.
     for (int g = 0; g < this->group_; g++) {
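Both call sites now follow the same recover-then-insist sequence: a failed try_reserve() refreshes the allocator's view of device memory, Reshape() re-plans against the corrected numbers, and only a second failure aborts. A self-contained toy model of that flow; pool_free and requested are invented stand-ins for the allocator's cached device info and the layer's workspace demand:

    #include <cstddef>
    #include <cstdio>
    #include <cstdlib>

    static std::size_t pool_free = 4096;  // what the allocator believes is free
    static std::size_t requested = 8192;  // workspace the layer currently wants

    // Analogue of Workspace::try_reserve(): a failure also refreshes the
    // cached device info (here we simply learn a corrected figure).
    static bool try_reserve(std::size_t bytes) {
      if (bytes > pool_free) {
        pool_free = 2048;  // device info re-evaluated after the failure
        return false;
      }
      pool_free -= bytes;
      return true;
    }

    // Analogue of reserve(), i.e. CHECK(try_reserve(...)): failure is fatal.
    static void reserve(std::size_t bytes) {
      if (!try_reserve(bytes)) {
        std::fprintf(stderr, "out of memory: %zu bytes\n", bytes);
        std::abort();
      }
    }

    // Analogue of Reshape(): with fresh numbers, pick a cheaper algorithm.
    static void reshape() { requested = 1024; }

    int main() {
      if (!try_reserve(requested)) {  // near-zero free memory: may fail once
        reshape();                    // re-plan against the refreshed info
        reserve(requested);           // must fit now, or abort loudly
      }
      std::printf("reserved %zu bytes, %zu left\n", requested, pool_free);
      return 0;
    }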
26 changes: 12 additions & 14 deletions src/caffe/util/gpu_memory.cpp
@@ -56,19 +56,20 @@ void GPUMemoryManager::destroy() {
   mode_ = NO_POOL;
 }
 
-void GPUMemoryManager::allocate(void** ptr, size_t size, cudaStream_t stream) {
+bool GPUMemoryManager::try_allocate(void** ptr, size_t size, cudaStream_t stream) {
   CHECK((ptr) != NULL);
+  cudaError_t status = cudaSuccess, last_err = cudaSuccess;
   switch (mode_) {
   case CUB_POOL:
-    if (cub_allocator->DeviceAllocate(ptr, size, stream) != cudaSuccess) {
+    // Clean Cache & Retry logic is inside now
+    status = cub_allocator->DeviceAllocate(ptr, size, stream);
+    // If there was a retry and it succeeded we get good status here but
+    // we need to clean up last error...
+    last_err = cudaGetLastError();
+    // ...and update the dev info if something was wrong
+    if (status != cudaSuccess || last_err != cudaSuccess) {
       int cur_device;
       CUDA_CHECK(cudaGetDevice(&cur_device));
-      // free all cached memory (for all devices), synchrionize
-      cudaDeviceSynchronize();
-      cudaThreadSynchronize();
-      cub_allocator->FreeAllCached();
-      cudaDeviceSynchronize();
-      cudaThreadSynchronize();
       // Refresh per-device saved values.
       for (int i = 0; i < dev_info_.size(); ++i) {
         // only query devices that were initialized
@@ -80,16 +81,13 @@ void GPUMemoryManager::allocate(void** ptr, size_t size, cudaStream_t stream) {
           }
         }
       }
-      // Retry once
-      CUDA_CHECK(cub_allocator->DeviceAllocate(ptr, size, stream));
     }
-    // If retry succeeds we need to clean up last error
-    cudaGetLastError();
     break;
   default:
-    CUDA_CHECK(cudaMalloc(ptr, size));
+    status = cudaMalloc(ptr, size);
     break;
   }
+  return status == cudaSuccess;
 }
 
 void GPUMemoryManager::deallocate(void* ptr, cudaStream_t stream) {
@@ -172,7 +170,7 @@ void GPUMemoryManager::GetInfo(size_t* free_mem, size_t* total_mem) {
   CUDA_CHECK(cudaGetDevice(&cur_device));
   *total_mem = dev_info_[cur_device].total_;
   // Free memory is initial free memory minus outstanding allocations.
-  // Assuming we only allocate via GPUMemoryManager since its constructon.
+  // Assuming we only allocate via GPUMemoryManager since its construction.
   *free_mem = dev_info_[cur_device].free_ -
       cub_allocator->cached_bytes[cur_device].live;
   break;
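One subtlety in try_allocate() above: when cub's DeviceAllocate() recovers internally (it now frees cached blocks and retries on its own), the failed first attempt can leave a sticky error in the CUDA runtime that a later, unrelated CUDA_CHECK would report. cudaGetLastError() both returns and resets that per-thread state, which is why the new code calls it right after every allocation attempt. A small standalone illustration of the reset behavior; try_alloc is an invented helper, not the manager's API:

    #include <cstdio>
    #include <cuda_runtime.h>

    // Attempt an allocation and leave the runtime's error state clean either
    // way, so later status checks don't inherit a stale allocation error.
    static bool try_alloc(void** ptr, size_t size) {
      cudaError_t status = cudaMalloc(ptr, size);
      cudaError_t last_err = cudaGetLastError();  // returns AND resets the error
      return status == cudaSuccess && last_err == cudaSuccess;
    }

    int main() {
      void* p = nullptr;
      if (!try_alloc(&p, size_t(1) << 42)) {  // absurd request: expected to fail
        std::puts("allocation failed; error state already cleared");
      }
      if (try_alloc(&p, 1 << 20)) {           // modest request: should succeed
        std::puts("second allocation succeeded with a clean slate");
        cudaFree(p);
      }
      return 0;
    }

This is also why the explicit FreeAllCached()-and-retry block could be deleted: the retry now lives inside the cub allocator, so the manager only needs to sanitize the error state and refresh its per-device bookkeeping.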
