From 9141deb9e227312cfd8260d2995392b7ca0e9296 Mon Sep 17 00:00:00 2001
From: Sergei Nikolaev
Date: Sat, 28 May 2016 01:44:15 -0700
Subject: [PATCH] Re-evaluating device info when getting closer to zero mem

---
 include/caffe/util/gpu_memory.hpp     | 20 ++++++++++++++----
 src/caffe/layers/cudnn_conv_layer.cpp | 29 ++++++++++-----------------
 src/caffe/layers/cudnn_conv_layer.cu  | 19 +++++++++++++++---
 src/caffe/util/gpu_memory.cpp         | 26 +++++++++++-------------
 4 files changed, 55 insertions(+), 39 deletions(-)

diff --git a/include/caffe/util/gpu_memory.hpp b/include/caffe/util/gpu_memory.hpp
index 0ac24f4c25f..33c9b727673 100644
--- a/include/caffe/util/gpu_memory.hpp
+++ b/include/caffe/util/gpu_memory.hpp
@@ -58,14 +58,22 @@ class GPUMemoryManager {
   }
 
   // Memory allocation/release
-  void reserve(size_t size) {
+  bool try_reserve(size_t size) {
+    bool status = true;
     if (size > size_) {
       if (ptr_) {
         GPUMemoryManager::deallocate(ptr_, stream_);
       }
-      GPUMemoryManager::allocate(&ptr_, size, stream_);
-      size_ = size;
+      status = GPUMemoryManager::try_allocate(&ptr_, size, stream_);
+      if (status) {
+        size_ = size;
+      }
     }
+    return status;
+  }
+
+  void reserve(size_t size) {
+    CHECK(try_reserve(size));
   }
 
   /*
@@ -111,8 +119,12 @@ class GPUMemoryManager {
  public:
   typedef void* pointer;
 
-  static void allocate(pointer* ptr, size_t size, cudaStream_t stream =
+  static bool try_allocate(pointer* ptr, size_t size, cudaStream_t stream =
       cudaStreamDefault);
+  static void allocate(pointer* ptr, size_t size, cudaStream_t stream =
+      cudaStreamDefault) {
+    CHECK(try_allocate(ptr, size, stream));
+  }
   static void deallocate(pointer ptr, cudaStream_t = cudaStreamDefault);
 
   static void GetInfo(size_t* free_mem, size_t* used_mem);
diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp
index 0a6effaea6a..2e59e520cdf 100644
--- a/src/caffe/layers/cudnn_conv_layer.cpp
+++ b/src/caffe/layers/cudnn_conv_layer.cpp
@@ -116,34 +116,18 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
         filter_desc_, pad_h, pad_w, stride_h, stride_w);
 
     // Have to pass full fwd/bwd cycle before taking the rest of memory
-    if (backward_passed_ctr_ > 1) {
+    if (backward_passed_ctr_ > 0) {
       // choose forward and backward algorithms + workspace(s)
       CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(Caffe::cudnn_handle(),
           bottom_descs_[i], filter_desc_, conv_descs_[i], top_descs_[i],
           CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
           workspace_limit_bytes, &fwd_algo_[i]));
-    }
-
-    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(Caffe::cudnn_handle(),
-        bottom_descs_[i], filter_desc_, conv_descs_[i], top_descs_[i],
-        fwd_algo_[i], &(workspace_fwd_sizes_[i])));
-
-    if (backward_passed_ctr_ > 1) {
       // choose backward algorithm for filter
       CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
           Caffe::cudnn_handle(),
           bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
           CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
           workspace_limit_bytes, &bwd_filter_algo_[i]));
-    }
-
-    // get workspace for backwards filter algorithm
-    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
-        Caffe::cudnn_handle(),
-        bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
-        bwd_filter_algo_[i], &workspace_bwd_filter_sizes_[i]));
-
-    if (backward_passed_ctr_ > 1) {
       // choose backward algo for data
       CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
           Caffe::cudnn_handle(),
@@ -152,7 +136,16 @@ void CuDNNConvolutionLayer<Dtype>::Reshape(
           workspace_limit_bytes, &bwd_data_algo_[i]));
     }
 
-    // get workspace size
+    // get workspace size for forward algorithm
+    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(Caffe::cudnn_handle(),
+        bottom_descs_[i], filter_desc_, conv_descs_[i], top_descs_[i],
+        fwd_algo_[i], &(workspace_fwd_sizes_[i])));
+    // get workspace size for backward filter algorithm
+    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
+        Caffe::cudnn_handle(),
+        bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
+        bwd_filter_algo_[i], &workspace_bwd_filter_sizes_[i]));
+    // get workspace size for backward data algorithm
     CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
         Caffe::cudnn_handle(),
         filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
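Note on the header change above: each hard-failing allocation call is split into a try_ variant that reports failure through a bool, while the old name becomes a CHECK-wrapped shim, so existing callers keep their fail-fast behavior and new callers can attempt recovery. A minimal standalone sketch of that pattern, with plain malloc standing in for the CUDA-side allocator and assert for glog's CHECK (illustrative only, not the patch's actual code):

    #include <cassert>
    #include <cstddef>
    #include <cstdlib>

    // Sketch of the try_/checked split from gpu_memory.hpp above.
    struct WorkspaceSketch {
      void* ptr_ = nullptr;
      size_t size_ = 0;

      // Grows the buffer if needed; returns false instead of aborting, so
      // the caller can attempt recovery (e.g. rerunning Reshape).
      bool try_reserve(size_t size) {
        if (size > size_) {
          std::free(ptr_);           // drop the smaller buffer first
          ptr_ = std::malloc(size);  // stand-in for try_allocate()
          if (ptr_ == nullptr) {
            size_ = 0;
            return false;
          }
          size_ = size;
        }
        return true;
      }

      // Fail-fast wrapper: same operation, but failure is fatal.
      void reserve(size_t size) {
        bool ok = try_reserve(size);
        assert(ok);
        (void)ok;  // keep release builds warning-free
      }
    };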
diff --git a/src/caffe/layers/cudnn_conv_layer.cu b/src/caffe/layers/cudnn_conv_layer.cu
index 93ceea14acf..2113906eb60 100644
--- a/src/caffe/layers/cudnn_conv_layer.cu
+++ b/src/caffe/layers/cudnn_conv_layer.cu
@@ -26,7 +26,13 @@ void CuDNNConvolutionLayer<Dtype>::Forward_gpu(
       if (workspace_fwd_sizes_[i] > workspace_limit_bytes) {
         this->Reshape(bottom, top);
       }
-      workspace.reserve(workspace_fwd_sizes_[i]);
+      // When free memory gets close to zero, the cached memory info may
+      // diverge from reality. If try_reserve() fails, it refreshes the info
+      // internally and we run Reshape one more time before reserving again.
+      if (!workspace.try_reserve(workspace_fwd_sizes_[i])) {
+        this->Reshape(bottom, top);
+        workspace.reserve(workspace_fwd_sizes_[i]);
+      }
 
       // Forward through cuDNN in parallel over groups.
       for (int g = 0; g < this->group_; g++) {
@@ -85,8 +91,15 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       }
       // To remove pressure on allocator, allocate the larger of the
       // workspaces needed for the following steps
-      workspace.reserve(std::max(workspace_bwd_filter_sizes_[i],
-          workspace_bwd_data_sizes_[i]));
+      // When free memory gets close to zero, the cached memory info may
+      // diverge from reality. If try_reserve() fails, it refreshes the info
+      // internally and we run Reshape one more time before reserving again.
+      if (!workspace.try_reserve(std::max(workspace_bwd_filter_sizes_[i],
+          workspace_bwd_data_sizes_[i]))) {
+        this->Reshape(bottom, top);
+        workspace.reserve(std::max(workspace_bwd_filter_sizes_[i],
+            workspace_bwd_data_sizes_[i]));
+      }
 
       // Backward through cuDNN in parallel over groups and gradients.
       for (int g = 0; g < this->group_; g++) {
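Both call sites above follow the same recovery shape: an optimistic try_reserve, one Reshape to refresh the memory info and re-pick cuDNN algorithms, then a fail-fast reserve. A hedged sketch of that control flow, reusing WorkspaceSketch from the note above; reshape and needed are illustrative stand-ins for this->Reshape(bottom, top) and the workspace_*_sizes_[i] lookups, not actual Caffe members:

    // Generic sketch of the recovery pattern in Forward_gpu/Backward_gpu.
    template <typename ReshapeFn, typename SizeFn>
    void ensure_workspace(WorkspaceSketch& ws, ReshapeFn reshape,
                          SizeFn needed) {
      if (!ws.try_reserve(needed())) {
        // First failure: the cached memory info was stale. Reshape()
        // re-queries cuDNN and may pick algorithms with smaller workspaces.
        reshape();
        // Re-read the size, since reshape() may have shrunk it; a second
        // failure is fatal by design (CHECK inside reserve()).
        ws.reserve(needed());
      }
    }

The size is deliberately re-read after reshape(): the whole point of the second Reshape is that the requested workspace can shrink once the allocator's view of free memory is accurate.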
diff --git a/src/caffe/util/gpu_memory.cpp b/src/caffe/util/gpu_memory.cpp
index 1eedc34e374..9a869019095 100644
--- a/src/caffe/util/gpu_memory.cpp
+++ b/src/caffe/util/gpu_memory.cpp
@@ -56,19 +56,20 @@ void GPUMemoryManager::destroy() {
   mode_ = NO_POOL;
 }
 
-void GPUMemoryManager::allocate(void** ptr, size_t size, cudaStream_t stream) {
+bool GPUMemoryManager::try_allocate(void** ptr, size_t size, cudaStream_t stream) {
   CHECK((ptr) != NULL);
+  cudaError_t status = cudaSuccess, last_err = cudaSuccess;
   switch (mode_) {
   case CUB_POOL:
-    if (cub_allocator->DeviceAllocate(ptr, size, stream) != cudaSuccess) {
+    // The clean-cache-and-retry logic now lives inside DeviceAllocate.
+    status = cub_allocator->DeviceAllocate(ptr, size, stream);
+    // If there was a retry and it succeeded, we get a good status here,
+    // but we still need to clean up the last error...
+    last_err = cudaGetLastError();
+    // ...and refresh the device info if something went wrong.
+    if (status != cudaSuccess || last_err != cudaSuccess) {
      int cur_device;
      CUDA_CHECK(cudaGetDevice(&cur_device));
-      // free all cached memory (for all devices), synchrionize
-      cudaDeviceSynchronize();
-      cudaThreadSynchronize();
-      cub_allocator->FreeAllCached();
-      cudaDeviceSynchronize();
-      cudaThreadSynchronize();
      // Refresh per-device saved values.
      for (int i = 0; i < dev_info_.size(); ++i) {
        // only query devices that were initialized
@@ -80,16 +81,13 @@ void GPUMemoryManager::allocate(void** ptr, size_t size, cudaStream_t stream) {
          }
        }
      }
-      // Retry once
-      CUDA_CHECK(cub_allocator->DeviceAllocate(ptr, size, stream));
    }
-    // If retry succeeds we need to clean up last error
-    cudaGetLastError();
    break;
  default:
-    CUDA_CHECK(cudaMalloc(ptr, size));
+    status = cudaMalloc(ptr, size);
    break;
  }
+  return status == cudaSuccess;
 }
 
 void GPUMemoryManager::deallocate(void* ptr, cudaStream_t stream) {
@@ -172,7 +170,7 @@ void GPUMemoryManager::GetInfo(size_t* free_mem, size_t* total_mem) {
    CUDA_CHECK(cudaGetDevice(&cur_device));
    *total_mem = dev_info_[cur_device].total_;
    // Free memory is initial free memory minus outstanding allocations.
-    // Assuming we only allocate via GPUMemoryManager since its constructon.
+    // Assuming we only allocate via GPUMemoryManager since its construction.
    *free_mem = dev_info_[cur_device].free_ -
        cub_allocator->cached_bytes[cur_device].live;
    break;
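One subtlety in try_allocate() above: a pooling allocator can fail an internal attempt, retry, and ultimately succeed, which leaves CUDA's sticky per-thread error set even though the returned status is good. That is why both the return status and cudaGetLastError() are consulted, and the per-device info is re-queried whenever either signal is bad. A standalone sketch of that shape, with plain cudaMalloc standing in for cub_allocator->DeviceAllocate and the cache update elided (not the actual Caffe implementation):

    #include <cuda_runtime.h>

    // Sketch of the error-handling shape in try_allocate() above.
    bool sketch_try_allocate(void** ptr, size_t size) {
      cudaError_t status = cudaMalloc(ptr, size);  // pool call in real code
      cudaError_t last_err = cudaGetLastError();   // also clears sticky error
      if (status != cudaSuccess || last_err != cudaSuccess) {
        // The cached free-memory bookkeeping can no longer be trusted, so
        // re-query the device before anyone reads the stale numbers.
        size_t free_bytes = 0, total_bytes = 0;
        if (cudaMemGetInfo(&free_bytes, &total_bytes) == cudaSuccess) {
          // ... store free_bytes/total_bytes in the per-device cache ...
        }
      }
      return status == cudaSuccess;
    }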