Skip to content

Commit

Permalink
Merge pull request #101 from borisfom/caffe-0.14
Browse files Browse the repository at this point in the history
gpu_memory cleanup, fixes possible init issue
  • Loading branch information
drnikolaev committed Jan 15, 2016
2 parents db610fd + e1fe549 commit 8eebbb4
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 66 deletions.
36 changes: 13 additions & 23 deletions 3rdparty/cub/cub/util_allocator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -408,10 +408,10 @@ struct CachingDeviceAllocator
&& (block_itr->device == device)
&& (block_itr->bin == search_key.bin)) {

// use special rule for the last ("exact size") bin: set max memory overuse to 1/8th
// use special rule for the last ("exact size") bin: set max memory overuse to 1/8th
if (search_key.bin == (unsigned int) -1 && (block_itr->bytes - search_key.bytes)*8UL > search_key.bytes)
break;

cudaStream_t prev_stream = block_itr->associated_stream;
if ((active_stream == prev_stream)
|| (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) {
Expand Down Expand Up @@ -454,22 +454,12 @@ struct CachingDeviceAllocator
if (error != cudaSuccess) {
if (debug) CubLog("\tdevice %d failed to allocate %lld bytes for stream %lld",
device, (long long) search_key.bytes, (long long) search_key.associated_stream);

// if (search_key.bytes < cached_bytes[device]) {
// free all cached memory (for all devices), synchronize and retry once
cudaDeviceSynchronize();
cudaThreadSynchronize();
FreeAllCached();
cudaDeviceSynchronize();
cudaThreadSynchronize();
error = cudaMalloc(&search_key.d_ptr, search_key.bytes);
// }
}
if (CubDebug(error))
if (CubDebug(error))
return error;
if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
return error;

// Insert into live blocks
Lock(&spin_lock);
live_blocks.insert(search_key);
Expand Down Expand Up @@ -548,30 +538,30 @@ struct CachingDeviceAllocator

// Lock
Lock(&spin_lock);

// Find corresponding block descriptor
BusyBlocks::iterator block_itr = live_blocks.find(search_key);
if (block_itr != live_blocks.end()) {
// Remove from live blocks
search_key = *block_itr;
live_blocks.erase(block_itr);
live_blocks.erase(block_itr);
cached_bytes[device].busy -= search_key.bytes;

// Check if we should keep the returned allocation
if (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)
{
{
// Insert returned allocation into free blocks
cached_blocks.insert(search_key);
cached_bytes[device].free += search_key.bytes;
recached = true;
if (debug) {
if (debug) {
CubLog("\tdevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
(long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].busy);
}
}
}

Unlock(&spin_lock);

if (recached) {
Expand All @@ -585,7 +575,7 @@ struct CachingDeviceAllocator
return error;
if (CubDebug(error = cudaSetDevice(device))) return error;
}

// Actually free device memory
if (CubDebug(error = cudaFree(d_ptr))) return error;
if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -571,7 +571,7 @@ $(DYNAMIC_NAME): $(OBJS)| $(LIB_BUILD_DIR)
@ cd $(BUILD_DIR)/lib; rm -f $(DYNAMIC_SONAME_SHORT); ln -s $(DYNAMIC_VERSIONED_NAME_SHORT) $(DYNAMIC_SONAME_SHORT)
@ cd $(BUILD_DIR)/lib; rm -f $(DYNAMIC_NAME_SHORT); ln -s $(DYNAMIC_SONAME_SHORT) $(DYNAMIC_NAME_SHORT)

$(STATIC_NAME): $(OBJS) | $(LIB_BUILD_DIR)
$(STATIC_NAME): $(OBJS) | $(LIB_BUILD_DIR)
@ echo AR -o $@
$(Q)ar rcs $@ $(OBJS)

Expand Down
11 changes: 8 additions & 3 deletions include/caffe/util/gpu_memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ class gpu_memory {
cudaStream_t stream_;
size_t size_;
};
static void update_dev_info(int device);

# endif

private:
Expand All @@ -89,8 +91,12 @@ class gpu_memory {

#ifndef CPU_ONLY
struct MemInfo {
size_t free;
size_t total;
MemInfo() {
free = total = flush_count = 0;
}
size_t free;
size_t total;
unsigned flush_count;
};

static vector<MemInfo> dev_info_;
Expand All @@ -102,7 +108,6 @@ class gpu_memory {
cudaStream_t stream = cudaStreamDefault);
static void deallocate(pointer ptr, cudaStream_t = cudaStreamDefault);

static void registerStream(cudaStream_t stream);
static void getInfo(size_t *free_mem, size_t *used_mem);

private:
Expand Down
99 changes: 60 additions & 39 deletions src/caffe/util/gpu_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,12 @@ namespace caffe {
return "No GPU: CPU Only Memory";
}
#else

void gpu_memory::init(const std::vector<int>& gpus,
PoolMode m, bool debug) {
debug_ = debug;
bool debug_env = (getenv("DEBUG_GPU_MEM") != 0);
debug_ = debug || debug_env;

if (gpus.size() <= 0) {
// should we report an error here ?
m = gpu_memory::NoPool;
Expand Down Expand Up @@ -63,7 +66,29 @@ namespace caffe {
CHECK((ptr) != NULL);
switch (mode_) {
case CubPool:
CUDA_CHECK(cubAlloc->DeviceAllocate(ptr, size, stream));
if (cubAlloc->DeviceAllocate(ptr, size, stream) != cudaSuccess) {
int cur_device;
CUDA_CHECK(cudaGetDevice(&cur_device));
// free all cached memory (for all devices), synchronize
cudaDeviceSynchronize();
cudaThreadSynchronize();
cubAlloc->FreeAllCached();
cudaDeviceSynchronize();
cudaThreadSynchronize();

// Refresh per-device saved values.
for (int i = 0; i < dev_info_.size(); i++) {
// only query devices that were initialized
if (dev_info_[i].total) {
update_dev_info(i);
// record which device caused cache flush
if (i == cur_device)
dev_info_[i].flush_count++;
}
}
// retry once
CUDA_CHECK(cubAlloc->DeviceAllocate(ptr, size, stream));
}
break;
default:
CUDA_CHECK(cudaMalloc(ptr, size));
Expand All @@ -85,47 +110,42 @@ namespace caffe {
}
}

void gpu_memory::registerStream(cudaStream_t stream) {
switch (mode_) {
case CubPool:
default:
break;
}
}

void gpu_memory::initMEM(const std::vector<int>& gpus, PoolMode m) {
mode_ = m;
void gpu_memory::update_dev_info(int device) {
int initial_device;

CUDA_CHECK(cudaGetDevice(&initial_device));

for (int i = 0; i < gpus.size(); i++) {
int cur_device = gpus[i];
if (cur_device+1 > dev_info_.size())
dev_info_.resize(cur_device+1);

CUDA_CHECK(cudaSetDevice(gpus[i]));
cudaDeviceProp props;
CUDA_CHECK(cudaGetDeviceProperties(&props, cur_device));
CUDA_CHECK(cudaMemGetInfo(&dev_info_[cur_device].free,
&dev_info_[cur_device].total));

if (debug_) {
std::cout << "cudaGetDeviceProperties: Mem = "
<< props.totalGlobalMem <<std::endl;
std::cout << "cudaMemGetInfo_[" << cur_device
<<": Free= " << dev_info_[cur_device].free
<< " Total= " << dev_info_[cur_device].total << std::endl;
}

// make sure we don't ask for more than total device memory
dev_info_[i].free = std::min(dev_info_[cur_device].total,
dev_info_[cur_device].free);
dev_info_[i].free = std::min(props.totalGlobalMem,
dev_info_[cur_device].free);
if (device+1 > dev_info_.size())
dev_info_.resize(device+1);

CUDA_CHECK(cudaSetDevice(device));
cudaDeviceProp props;
CUDA_CHECK(cudaGetDeviceProperties(&props, device));
CUDA_CHECK(cudaMemGetInfo(&dev_info_[device].free,
&dev_info_[device].total));

if (debug_) {
std::cout << "cudaGetDeviceProperties: Mem = "
<< props.totalGlobalMem <<std::endl;
std::cout << "cudaMemGetInfo_[" << device
<<": Free= " << dev_info_[device].free
<< " Total= " << dev_info_[device].total << std::endl;
}

// make sure we don't have more than total device memory
dev_info_[device].total = std::min(props.totalGlobalMem,
dev_info_[device].total);

// here we are adding existing 'busy' allocations to CUDA free memory
dev_info_[device].free =
std::min(dev_info_[device].total,
dev_info_[device].free
+ cubAlloc->cached_bytes[device].busy);
CUDA_CHECK(cudaSetDevice(initial_device));
}

void gpu_memory::initMEM(const std::vector<int>& gpus, PoolMode m) {
mode_ = m;
switch ( mode_ ) {
case CubPool:
try {
Expand All @@ -141,12 +161,13 @@ namespace caffe {
}
catch (...) {}
CHECK(cubAlloc);
for (int i = 0; i < gpus.size(); i++) {
update_dev_info(gpus[i]);
}
break;
default:
break;
}

CUDA_CHECK(cudaSetDevice(initial_device));
}

const char* gpu_memory::getPoolName() {
Expand All @@ -167,7 +188,7 @@ namespace caffe {
// Free memory is initial free memory minus outstanding allocations.
// Assuming we only allocate via gpu_memory since its construction.
*free_mem = dev_info_[cur_device].free -
cubAlloc->cached_bytes[cur_device].busy;
cubAlloc->cached_bytes[cur_device].busy;
break;
default:
CUDA_CHECK(cudaMemGetInfo(free_mem, total_mem));
Expand Down

0 comments on commit 8eebbb4

Please sign in to comment.