Skip to content

Commit

Permalink
Merge pull request #101 from borisfom/caffe-0.14
Browse files Browse the repository at this point in the history
gpu_memory cleanup, fixes possible init issue
  • Loading branch information
drnikolaev committed Jan 15, 2016
2 parents db610fd + e1fe549 commit 8eebbb4
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 66 deletions.
36 changes: 13 additions & 23 deletions 3rdparty/cub/cub/util_allocator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -408,10 +408,10 @@ struct CachingDeviceAllocator
&& (block_itr->device == device)
&& (block_itr->bin == search_key.bin)) {

// use special rule for the last ("exact size") bin: set max memory overuse to 1/8th
// use special rule for the last ("exact size") bin: set max memory overuse to 1/8th
if (search_key.bin == (unsigned int) -1 && (block_itr->bytes - search_key.bytes)*8UL > search_key.bytes)
break;

cudaStream_t prev_stream = block_itr->associated_stream;
if ((active_stream == prev_stream)
|| (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) {
Expand Down Expand Up @@ -454,22 +454,12 @@ struct CachingDeviceAllocator
if (error != cudaSuccess) {
if (debug) CubLog("\tdevice %d failed to allocate %lld bytes for stream %lld",
device, (long long) search_key.bytes, (long long) search_key.associated_stream);

// if (search_key.bytes < cached_bytes[device]) {
// free all cached memory (for all devices), synchronize and retry once
cudaDeviceSynchronize();
cudaThreadSynchronize();
FreeAllCached();
cudaDeviceSynchronize();
cudaThreadSynchronize();
error = cudaMalloc(&search_key.d_ptr, search_key.bytes);
// }
}
if (CubDebug(error))
if (CubDebug(error))
return error;
if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
return error;

// Insert into live blocks
Lock(&spin_lock);
live_blocks.insert(search_key);
Expand Down Expand Up @@ -548,30 +538,30 @@ struct CachingDeviceAllocator

// Lock
Lock(&spin_lock);

// Find corresponding block descriptor
BusyBlocks::iterator block_itr = live_blocks.find(search_key);
if (block_itr != live_blocks.end()) {
// Remove from live blocks
search_key = *block_itr;
live_blocks.erase(block_itr);
live_blocks.erase(block_itr);
cached_bytes[device].busy -= search_key.bytes;

// Check if we should keep the returned allocation
if (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)
{
{
// Insert returned allocation into free blocks
cached_blocks.insert(search_key);
cached_bytes[device].free += search_key.bytes;
recached = true;
if (debug) {
if (debug) {
CubLog("\tdevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
(long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].busy);
}
}
}

Unlock(&spin_lock);

if (recached) {
Expand All @@ -585,7 +575,7 @@ struct CachingDeviceAllocator
return error;
if (CubDebug(error = cudaSetDevice(device))) return error;
}

// Actually free device memory
if (CubDebug(error = cudaFree(d_ptr))) return error;
if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -571,7 +571,7 @@ $(DYNAMIC_NAME): $(OBJS)| $(LIB_BUILD_DIR)
@ cd $(BUILD_DIR)/lib; rm -f $(DYNAMIC_SONAME_SHORT); ln -s $(DYNAMIC_VERSIONED_NAME_SHORT) $(DYNAMIC_SONAME_SHORT)
@ cd $(BUILD_DIR)/lib; rm -f $(DYNAMIC_NAME_SHORT); ln -s $(DYNAMIC_SONAME_SHORT) $(DYNAMIC_NAME_SHORT)

$(STATIC_NAME): $(OBJS) | $(LIB_BUILD_DIR)
$(STATIC_NAME): $(OBJS) | $(LIB_BUILD_DIR)
@ echo AR -o $@
$(Q)ar rcs $@ $(OBJS)

Expand Down
11 changes: 8 additions & 3 deletions include/caffe/util/gpu_memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ class gpu_memory {
cudaStream_t stream_;
size_t size_;
};
static void update_dev_info(int device);

# endif

private:
Expand All @@ -89,8 +91,12 @@ class gpu_memory {

#ifndef CPU_ONLY
struct MemInfo {
size_t free;
size_t total;
MemInfo() {
free = total = flush_count = 0;
}
size_t free;
size_t total;
unsigned flush_count;
};

static vector<MemInfo> dev_info_;
Expand All @@ -102,7 +108,6 @@ class gpu_memory {
cudaStream_t stream = cudaStreamDefault);
static void deallocate(pointer ptr, cudaStream_t = cudaStreamDefault);

static void registerStream(cudaStream_t stream);
static void getInfo(size_t *free_mem, size_t *used_mem);

private:
Expand Down
99 changes: 60 additions & 39 deletions src/caffe/util/gpu_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,12 @@ namespace caffe {
return "No GPU: CPU Only Memory";
}
#else

void gpu_memory::init(const std::vector<int>& gpus,
PoolMode m, bool debug) {
debug_ = debug;
bool debug_env = (getenv("DEBUG_GPU_MEM") != 0);
debug_ = debug || debug_env;

if (gpus.size() <= 0) {
// should we report an error here ?
m = gpu_memory::NoPool;
Expand Down Expand Up @@ -63,7 +66,29 @@ namespace caffe {
CHECK((ptr) != NULL);
switch (mode_) {
case CubPool:
CUDA_CHECK(cubAlloc->DeviceAllocate(ptr, size, stream));
if (cubAlloc->DeviceAllocate(ptr, size, stream) != cudaSuccess) {
int cur_device;
CUDA_CHECK(cudaGetDevice(&cur_device));
// free all cached memory (for all devices), synchronize
cudaDeviceSynchronize();
cudaThreadSynchronize();
cubAlloc->FreeAllCached();
cudaDeviceSynchronize();
cudaThreadSynchronize();

// Refresh per-device saved values.
for (int i = 0; i < dev_info_.size(); i++) {
// only query devices that were initialized
if (dev_info_[i].total) {
update_dev_info(i);
// record which device caused cache flush
if (i == cur_device)
dev_info_[i].flush_count++;
}
}
// retry once
CUDA_CHECK(cubAlloc->DeviceAllocate(ptr, size, stream));
}
break;
default:
CUDA_CHECK(cudaMalloc(ptr, size));
Expand All @@ -85,47 +110,42 @@ namespace caffe {
}
}

void gpu_memory::registerStream(cudaStream_t stream) {
switch (mode_) {
case CubPool:
default:
break;
}
}

void gpu_memory::initMEM(const std::vector<int>& gpus, PoolMode m) {
mode_ = m;
void gpu_memory::update_dev_info(int device) {
int initial_device;

CUDA_CHECK(cudaGetDevice(&initial_device));

for (int i = 0; i < gpus.size(); i++) {
int cur_device = gpus[i];
if (cur_device+1 > dev_info_.size())
dev_info_.resize(cur_device+1);

CUDA_CHECK(cudaSetDevice(gpus[i]));
cudaDeviceProp props;
CUDA_CHECK(cudaGetDeviceProperties(&props, cur_device));
CUDA_CHECK(cudaMemGetInfo(&dev_info_[cur_device].free,
&dev_info_[cur_device].total));

if (debug_) {
std::cout << "cudaGetDeviceProperties: Mem = "
<< props.totalGlobalMem <<std::endl;
std::cout << "cudaMemGetInfo_[" << cur_device
<<": Free= " << dev_info_[cur_device].free
<< " Total= " << dev_info_[cur_device].total << std::endl;
}

// make sure we don't ask for more than total device memory
dev_info_[i].free = std::min(dev_info_[cur_device].total,
dev_info_[cur_device].free);
dev_info_[i].free = std::min(props.totalGlobalMem,
dev_info_[cur_device].free);
if (device+1 > dev_info_.size())
dev_info_.resize(device+1);

CUDA_CHECK(cudaSetDevice(device));
cudaDeviceProp props;
CUDA_CHECK(cudaGetDeviceProperties(&props, device));
CUDA_CHECK(cudaMemGetInfo(&dev_info_[device].free,
&dev_info_[device].total));

if (debug_) {
std::cout << "cudaGetDeviceProperties: Mem = "
<< props.totalGlobalMem <<std::endl;
std::cout << "cudaMemGetInfo_[" << device
<<": Free= " << dev_info_[device].free
<< " Total= " << dev_info_[device].total << std::endl;
}

// make sure we don't have more than total device memory
dev_info_[device].total = std::min(props.totalGlobalMem,
dev_info_[device].total);

// here we are adding existing 'busy' allocations to CUDA free memory
dev_info_[device].free =
std::min(dev_info_[device].total,
dev_info_[device].free
+ cubAlloc->cached_bytes[device].busy);
CUDA_CHECK(cudaSetDevice(initial_device));
}

void gpu_memory::initMEM(const std::vector<int>& gpus, PoolMode m) {
mode_ = m;
switch ( mode_ ) {
case CubPool:
try {
Expand All @@ -141,12 +161,13 @@ namespace caffe {
}
catch (...) {}
CHECK(cubAlloc);
for (int i = 0; i < gpus.size(); i++) {
update_dev_info(gpus[i]);
}
break;
default:
break;
}

CUDA_CHECK(cudaSetDevice(initial_device));
}

const char* gpu_memory::getPoolName() {
Expand All @@ -167,7 +188,7 @@ namespace caffe {
// Free memory is initial free memory minus outstanding allocations.
// Assuming we only allocate via gpu_memory since its construction.
*free_mem = dev_info_[cur_device].free -
cubAlloc->cached_bytes[cur_device].busy;
cubAlloc->cached_bytes[cur_device].busy;
break;
default:
CUDA_CHECK(cudaMemGetInfo(free_mem, total_mem));
Expand Down

0 comments on commit 8eebbb4

Please sign in to comment.