From 9141deb9e227312cfd8260d2995392b7ca0e9296 Mon Sep 17 00:00:00 2001
From: Sergei Nikolaev
Date: Sat, 28 May 2016 01:44:15 -0700
Subject: [PATCH] Re-evaluating device info when getting closer to zero mem

---
 include/caffe/util/gpu_memory.hpp     | 20 ++++++++++++++----
 src/caffe/layers/cudnn_conv_layer.cpp | 29 ++++++++++-----------------
 src/caffe/layers/cudnn_conv_layer.cu  | 19 +++++++++++++++---
 src/caffe/util/gpu_memory.cpp         | 26 +++++++++++-------------
 4 files changed, 55 insertions(+), 39 deletions(-)

diff --git a/include/caffe/util/gpu_memory.hpp b/include/caffe/util/gpu_memory.hpp
index 0ac24f4c25f..33c9b727673 100644
--- a/include/caffe/util/gpu_memory.hpp
+++ b/include/caffe/util/gpu_memory.hpp
@@ -58,14 +58,22 @@ class GPUMemoryManager {
   }
 
   // Memory allocation/release
-  void reserve(size_t size) {
+  bool try_reserve(size_t size) {
+    bool status = true;
     if (size > size_) {
       if (ptr_) {
         GPUMemoryManager::deallocate(ptr_, stream_);
       }
-      GPUMemoryManager::allocate(&ptr_, size, stream_);
-      size_ = size;
+      status = GPUMemoryManager::try_allocate(&ptr_, size, stream_);
+      if (status) {
+        size_ = size;
+      }
     }
+    return status;
+  }
+
+  void reserve(size_t size) {
+    CHECK(try_reserve(size));
   }
 
   /*
@@ -111,8 +119,12 @@ class GPUMemoryManager {
  public:
   typedef void* pointer;
 
-  static void allocate(pointer* ptr, size_t size, cudaStream_t stream =
+  static bool try_allocate(pointer* ptr, size_t size, cudaStream_t stream =
       cudaStreamDefault);
+  static void allocate(pointer* ptr, size_t size, cudaStream_t stream =
+      cudaStreamDefault) {
+    CHECK(try_allocate(ptr, size, stream));
+  }
   static void deallocate(pointer ptr, cudaStream_t = cudaStreamDefault);
 
   static void GetInfo(size_t* free_mem, size_t* used_mem);
diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp
index 0a6effaea6a..2e59e520cdf 100644
--- a/src/caffe/layers/cudnn_conv_layer.cpp
+++ b/src/caffe/layers/cudnn_conv_layer.cpp
@@ -116,34 +116,18 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
         filter_desc_, pad_h, pad_w, stride_h, stride_w);
 
     // Have to pass full fwd/bwd cycle before taking the rest of memory
-    if (backward_passed_ctr_ > 1) {
+    if (backward_passed_ctr_ > 0) {
       // choose forward and backward algorithms + workspace(s)
       CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(Caffe::cudnn_handle(),
           bottom_descs_[i], filter_desc_, conv_descs_[i], top_descs_[i],
           CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
           workspace_limit_bytes, &fwd_algo_[i]));
-    }
-
-    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(Caffe::cudnn_handle(),
-        bottom_descs_[i], filter_desc_, conv_descs_[i], top_descs_[i],
-        fwd_algo_[i], &(workspace_fwd_sizes_[i])));
-
-    if (backward_passed_ctr_ > 1) {
       // choose backward algorithm for filter
       CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
           Caffe::cudnn_handle(),
           bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
           CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
           workspace_limit_bytes, &bwd_filter_algo_[i]));
-    }
-
-    // get workspace for backwards filter algorithm
-    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
-        Caffe::cudnn_handle(),
-        bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
-        bwd_filter_algo_[i], &workspace_bwd_filter_sizes_[i]));
-
-    if (backward_passed_ctr_ > 1) {
       // choose backward algo for data
       CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
           Caffe::cudnn_handle(),
@@ -152,7 +136,16 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
           workspace_limit_bytes, &bwd_data_algo_[i]));
     }
 
-    // get workspace size
+    // get workspace size for forward algorithm
+    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(Caffe::cudnn_handle(),
+        bottom_descs_[i], filter_desc_, conv_descs_[i], top_descs_[i],
+        fwd_algo_[i], &(workspace_fwd_sizes_[i])));
+    // get workspace size for backward filter algorithm
+    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
+        Caffe::cudnn_handle(),
+        bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
+        bwd_filter_algo_[i], &workspace_bwd_filter_sizes_[i]));
+    // get workspace size for backward data algorithm
     CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
         Caffe::cudnn_handle(),
         filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
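Note on the header change above: each hard-failing allocation call is split into a try_ variant that reports failure through a bool, while the old name becomes a CHECK-wrapped shim, so existing callers keep their fail-fast behavior and new callers can attempt recovery. A minimal standalone sketch of that pattern, with plain malloc standing in for the CUDA-side allocator and assert for glog's CHECK (illustrative only, not the patch's actual code):

    #include <cassert>
    #include <cstddef>
    #include <cstdlib>

    // Sketch of the try_/checked split from gpu_memory.hpp above.
    struct WorkspaceSketch {
      void* ptr_ = nullptr;
      size_t size_ = 0;

      // Grows the buffer if needed; returns false instead of aborting, so
      // the caller can attempt recovery (e.g. rerunning Reshape).
      bool try_reserve(size_t size) {
        if (size > size_) {
          std::free(ptr_);           // drop the smaller buffer first
          ptr_ = std::malloc(size);  // stand-in for try_allocate()
          if (ptr_ == nullptr) {
            size_ = 0;
            return false;
          }
          size_ = size;
        }
        return true;
      }

      // Fail-fast wrapper: same operation, but failure is fatal.
      void reserve(size_t size) {
        bool ok = try_reserve(size);
        assert(ok);
        (void)ok;  // keep release builds warning-free
      }
    };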
diff --git a/src/caffe/layers/cudnn_conv_layer.cu b/src/caffe/layers/cudnn_conv_layer.cu
index 93ceea14acf..2113906eb60 100644
--- a/src/caffe/layers/cudnn_conv_layer.cu
+++ b/src/caffe/layers/cudnn_conv_layer.cu
@@ -26,7 +26,13 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
       if (workspace_fwd_sizes_[i] > workspace_limit_bytes) {
         this->Reshape(bottom, top);
       }
-      workspace.reserve(workspace_fwd_sizes_[i]);
+      // When free memory gets close to zero, the cached memory info may
+      // diverge from reality. If try_reserve() fails, it refreshes the info
+      // internally and we run Reshape one more time before reserving again.
+      if (!workspace.try_reserve(workspace_fwd_sizes_[i])) {
+        this->Reshape(bottom, top);
+        workspace.reserve(workspace_fwd_sizes_[i]);
+      }
 
       // Forward through cuDNN in parallel over groups.
       for (int g = 0; g < this->group_; g++) {
@@ -85,8 +91,15 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       }
       // To remove pressure on allocator, allocate the larger of the
       // workspaces needed for the following steps
-      workspace.reserve(std::max(workspace_bwd_filter_sizes_[i],
-          workspace_bwd_data_sizes_[i]));
+      // When free memory gets close to zero, the cached memory info may
+      // diverge from reality. If try_reserve() fails, it refreshes the info
+      // internally and we run Reshape one more time before reserving again.
+      if (!workspace.try_reserve(std::max(workspace_bwd_filter_sizes_[i],
+          workspace_bwd_data_sizes_[i]))) {
+        this->Reshape(bottom, top);
+        workspace.reserve(std::max(workspace_bwd_filter_sizes_[i],
+            workspace_bwd_data_sizes_[i]));
+      }
 
       // Backward through cuDNN in parallel over groups and gradients.
       for (int g = 0; g < this->group_; g++) {
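Both call sites above follow the same recovery shape: an optimistic try_reserve, one Reshape to refresh the memory info and re-pick cuDNN algorithms, then a fail-fast reserve. A hedged sketch of that control flow, reusing WorkspaceSketch from the note above; reshape and needed are illustrative stand-ins for this->Reshape(bottom, top) and the workspace_*_sizes_[i] lookups, not actual Caffe members:

    // Generic sketch of the recovery pattern in Forward_gpu/Backward_gpu.
    template <typename ReshapeFn, typename SizeFn>
    void ensure_workspace(WorkspaceSketch& ws, ReshapeFn reshape,
                          SizeFn needed) {
      if (!ws.try_reserve(needed())) {
        // First failure: the cached memory info was stale. Reshape()
        // re-queries cuDNN and may pick algorithms with smaller workspaces.
        reshape();
        // Re-read the size, since reshape() may have shrunk it; a second
        // failure is fatal by design (CHECK inside reserve()).
        ws.reserve(needed());
      }
    }

The size is deliberately re-read after reshape(): the whole point of the second Reshape is that the requested workspace can shrink once the allocator's view of free memory is accurate.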
diff --git a/src/caffe/util/gpu_memory.cpp b/src/caffe/util/gpu_memory.cpp
index 1eedc34e374..9a869019095 100644
--- a/src/caffe/util/gpu_memory.cpp
+++ b/src/caffe/util/gpu_memory.cpp
@@ -56,19 +56,20 @@ void GPUMemoryManager::destroy() {
   mode_ = NO_POOL;
 }
 
-void GPUMemoryManager::allocate(void** ptr, size_t size, cudaStream_t stream) {
+bool GPUMemoryManager::try_allocate(void** ptr, size_t size, cudaStream_t stream) {
   CHECK((ptr) != NULL);
+  cudaError_t status = cudaSuccess, last_err = cudaSuccess;
   switch (mode_) {
   case CUB_POOL:
-    if (cub_allocator->DeviceAllocate(ptr, size, stream) != cudaSuccess) {
+    // The clean-cache-and-retry logic now lives inside DeviceAllocate.
+    status = cub_allocator->DeviceAllocate(ptr, size, stream);
+    // If there was a retry and it succeeded, we get a good status here,
+    // but we still need to clean up the last error...
+    last_err = cudaGetLastError();
+    // ...and refresh the device info if something went wrong.
+    if (status != cudaSuccess || last_err != cudaSuccess) {
      int cur_device;
      CUDA_CHECK(cudaGetDevice(&cur_device));
-      // free all cached memory (for all devices), synchrionize
-      cudaDeviceSynchronize();
-      cudaThreadSynchronize();
-      cub_allocator->FreeAllCached();
-      cudaDeviceSynchronize();
-      cudaThreadSynchronize();
      // Refresh per-device saved values.
      for (int i = 0; i < dev_info_.size(); ++i) {
        // only query devices that were initialized
@@ -80,16 +81,13 @@ void GPUMemoryManager::allocate(void** ptr, size_t size, cudaStream_t stream) {
          }
        }
      }
-      // Retry once
-      CUDA_CHECK(cub_allocator->DeviceAllocate(ptr, size, stream));
    }
-    // If retry succeeds we need to clean up last error
-    cudaGetLastError();
    break;
  default:
-    CUDA_CHECK(cudaMalloc(ptr, size));
+    status = cudaMalloc(ptr, size);
    break;
  }
+  return status == cudaSuccess;
 }
 
 void GPUMemoryManager::deallocate(void* ptr, cudaStream_t stream) {
@@ -172,7 +170,7 @@ void GPUMemoryManager::GetInfo(size_t* free_mem, size_t* total_mem) {
    CUDA_CHECK(cudaGetDevice(&cur_device));
    *total_mem = dev_info_[cur_device].total_;
    // Free memory is initial free memory minus outstanding allocations.
-    // Assuming we only allocate via GPUMemoryManager since its constructon.
+    // Assuming we only allocate via GPUMemoryManager since its construction.
    *free_mem = dev_info_[cur_device].free_ -
        cub_allocator->cached_bytes[cur_device].live;
    break;
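One subtlety in try_allocate() above: a pooling allocator can fail an internal attempt, retry, and ultimately succeed, which leaves CUDA's sticky per-thread error set even though the returned status is good. That is why both the return status and cudaGetLastError() are consulted, and the per-device info is re-queried whenever either signal is bad. A standalone sketch of that shape, with plain cudaMalloc standing in for cub_allocator->DeviceAllocate and the cache update elided (not the actual Caffe implementation):

    #include <cuda_runtime.h>

    // Sketch of the error-handling shape in try_allocate() above.
    bool sketch_try_allocate(void** ptr, size_t size) {
      cudaError_t status = cudaMalloc(ptr, size);  // pool call in real code
      cudaError_t last_err = cudaGetLastError();   // also clears sticky error
      if (status != cudaSuccess || last_err != cudaSuccess) {
        // The cached free-memory bookkeeping can no longer be trusted, so
        // re-query the device before anyone reads the stale numbers.
        size_t free_bytes = 0, total_bytes = 0;
        if (cudaMemGetInfo(&free_bytes, &total_bytes) == cudaSuccess) {
          // ... store free_bytes/total_bytes in the per-device cache ...
        }
      }
      return status == cudaSuccess;
    }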