NCCL integration
slayton58 authored and drnikolaev committed Jun 2, 2016
1 parent d988833 commit 6b4d102
Showing 26 changed files with 474 additions and 306 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -37,6 +37,7 @@ caffe_option(BUILD_python_layer "Build the Caffe Python layer" ON)
caffe_option(USE_OPENCV "Build with OpenCV support" ON)
caffe_option(USE_LEVELDB "Build with levelDB" ON)
caffe_option(USE_LMDB "Build with lmdb" ON)
caffe_option(USE_NCCL "Build with NCCL Library for multi-GPU support" ON IF NOT CPU_ONLY)
caffe_option(ALLOW_LMDB_NOLOCK "Allow MDB_NOLOCK when reading LMDB files (only if necessary)" OFF)

# ---[ Dependencies
6 changes: 6 additions & 0 deletions Makefile
@@ -334,6 +334,12 @@ ifeq ($(USE_CUDNN), 1)
COMMON_FLAGS += -DUSE_CUDNN
endif

# NCCL acceleration configuration
ifeq ($(USE_NCCL), 1)
LIBRARIES += nccl
COMMON_FLAGS += -DUSE_NCCL
endif

# configure IO libraries
ifeq ($(USE_OPENCV), 1)
COMMON_FLAGS += -DUSE_OPENCV
4 changes: 4 additions & 0 deletions Makefile.config.example
@@ -5,6 +5,10 @@
# cuDNN version 4 or higher is required.
# USE_CUDNN := 1

# NCCL acceleration switch (uncomment to build with NCCL)
# See https://github.com/NVIDIA/nccl
# USE_NCCL := 1

# CPU-only switch (uncomment to build without GPU support).
# cuDNN version 4 or higher is required.
# CPU_ONLY := 1
4 changes: 4 additions & 0 deletions cmake/ConfigGen.cmake
@@ -81,6 +81,10 @@ function(caffe_generate_export_configs)
list(APPEND Caffe_DEFINITIONS -DUSE_MKL)
endif()

if(USE_NCCL)
list(APPEND Caffe_DEFINITIONS -DUSE_NCCL)
endif()

configure_file("cmake/Templates/CaffeConfig.cmake.in" "${PROJECT_BINARY_DIR}/CaffeConfig.cmake" @ONLY)

# Add targets to the build-tree export set
8 changes: 8 additions & 0 deletions cmake/Dependencies.cmake
@@ -170,3 +170,11 @@ endif()
if(BUILD_docs)
find_package(Doxygen)
endif()

# ---[ NCCL
if(USE_NCCL)
add_definitions(-DUSE_NCCL)
find_package(NCCL REQUIRED)
include_directories(SYSTEM ${NCCL_INCLUDE})
list(APPEND Caffe_LINKER_LIBS ${NCCL_LIBRARIES})
endif()
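
With the build changes above, Caffe is compiled with -DUSE_NCCL and linked against libnccl. For reference, here is a minimal standalone sketch (not part of this commit) of the all-reduce primitive the multi-GPU code builds on, written against the single-process, one-communicator-per-GPU pattern of NCCL 1.x, the version current when this change landed; error checking is omitted and the file name in the build comment is illustrative.

// Minimal NCCL all-reduce sketch, illustrative only.
// Build (assumption): nvcc nccl_allreduce.cc -lnccl -o nccl_allreduce
#include <cuda_runtime.h>
#include <nccl.h>

#include <cstdio>
#include <vector>

int main() {
  int ndev = 0;
  cudaGetDeviceCount(&ndev);
  if (ndev == 0) return 0;

  // One NCCL communicator per visible GPU, all owned by this process.
  std::vector<int> devs(ndev);
  for (int i = 0; i < ndev; ++i) devs[i] = i;
  std::vector<ncclComm_t> comms(ndev);
  ncclCommInitAll(comms.data(), ndev, devs.data());

  const int count = 1 << 20;  // elements per GPU buffer
  std::vector<float*> grads(ndev);
  std::vector<cudaStream_t> streams(ndev);
  for (int i = 0; i < ndev; ++i) {
    cudaSetDevice(i);
    cudaMalloc(&grads[i], count * sizeof(float));
    cudaMemset(grads[i], 0, count * sizeof(float));
    cudaStreamCreate(&streams[i]);
  }

  // Sum the per-GPU buffers in place; with NCCL 2.x this loop would need to
  // be wrapped in ncclGroupStart()/ncclGroupEnd().
  for (int i = 0; i < ndev; ++i) {
    cudaSetDevice(i);
    ncclAllReduce(grads[i], grads[i], count, ncclFloat, ncclSum,
                  comms[i], streams[i]);
  }
  for (int i = 0; i < ndev; ++i) {
    cudaSetDevice(i);
    cudaStreamSynchronize(streams[i]);
  }

  for (int i = 0; i < ndev; ++i) {
    cudaSetDevice(i);
    cudaFree(grads[i]);
    cudaStreamDestroy(streams[i]);
    ncclCommDestroy(comms[i]);
  }
  std::printf("all-reduce completed on %d GPU(s)\n", ndev);
  return 0;
}
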
22 changes: 22 additions & 0 deletions cmake/Modules/FindNCCL.cmake
@@ -0,0 +1,22 @@
# Find the NCCL libraries
#
# The following variables are optionally searched for defaults
# NCCL_ROOT_DIR: Base directory where all NCCL components are found
#
# The following are set after configuration is done:
# NCCL_FOUND
# NCCL_INCLUDE_DIR
# NCCL_LIBRARIES

find_path(NCCL_INCLUDE_DIR NAMES nccl.h PATHS ${NCCL_ROOT_DIR})

find_library(NCCL_LIBRARIES NAMES nccl PATHS ${NCCL_ROOT_DIR})

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIR NCCL_LIBRARIES)

if(NCCL_FOUND)
message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIR}, library: ${NCCL_LIBRARIES})")
mark_as_advanced(NCCL_INCLUDE_DIR NCCL_LIBRARIES)
endif()

5 changes: 5 additions & 0 deletions cmake/Summary.cmake
@@ -146,6 +146,11 @@ function(caffe_print_configuration_summary)
else()
caffe_status(" cuDNN : Disabled")
endif()
if(USE_NCCL)
caffe_status(" NCCL : " NCCL_FOUND THEN "Yes" ELSE "Not found")
else()
caffe_status(" NCCL : Disabled")
endif()
caffe_status("")
endif()
if(HAVE_PYTHON)
7 changes: 4 additions & 3 deletions cmake/Templates/caffe_config.h.in
@@ -4,15 +4,16 @@
/* Binaries directory */
#define BINARY_FOLDER "${PROJECT_BINARY_DIR}"

/* NVIDA Cuda */
/* NVIDIA Cuda */
#cmakedefine HAVE_CUDA

/* NVIDA cuDNN */
/* NVIDIA cuDNN */
#cmakedefine HAVE_CUDNN
#cmakedefine USE_CUDNN

/* NVIDA cuDNN */
/* NVIDIA cuDNN */
#cmakedefine CPU_ONLY
#cmakedefine USE_NCCL

/* Test device */
#define CUDA_TEST_DEVICE ${CUDA_TEST_DEVICE}
12 changes: 12 additions & 0 deletions include/caffe/net.hpp
@@ -14,6 +14,9 @@

namespace caffe {

template <typename Dtype>
class Solver;

/**
* @brief Connects Layer%s together into a directed acyclic graph (DAG)
* specified by a NetParameter.
@@ -227,6 +230,11 @@ class Net {
static bool StateMeetsRule(const NetState& state, const NetStateRule& rule,
const string& layer_name);

/// @brief set a Solver for this net
void SetSolver(Solver<Dtype>* s) {
solver_ = s;
}

protected:
// Helpers for Init.
/// @brief Append a new top blob to the net.
@@ -278,6 +286,8 @@ class Net {
vector<int> param_owners_;
vector<string> param_display_names_;
vector<pair<int, int> > param_layer_indices_;
/// (layer, blob) -> param_id map
map<pair<int, int>, int> layer_index_params_;
map<string, int> param_names_index_;
/// blob indices for the input and the output of the net
vector<int> net_input_blob_indices_;
@@ -307,6 +317,8 @@
bool debug_info_;
/// The root net that actually holds the shared layers in data parallelism
const Net* const root_net_;
/// Pointer to the solver being used with this net
Solver<Dtype>* solver_;
DISABLE_COPY_AND_ASSIGN(Net);
};

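The matching net.cpp changes are part of this commit but not included in the excerpt above, so the following is only a hypothetical sketch of how the new solver_ pointer and the (layer, blob) -> param_id map could fit together: as soon as a layer finishes its backward pass, each of its learnable blobs is looked up and its param_id handed to the solver's callbacks, so the gradient reduction can overlap with backpropagation of the remaining layers. The control flow and the callbacks() accessor are assumptions, not the commit's actual code.

// Hypothetical sketch only; the commit's real net.cpp is not shown above.
template <typename Dtype>
void Net<Dtype>::BackwardFromTo(int start, int end) {
  for (int i = start; i >= end; --i) {
    if (!layer_need_backward_[i]) { continue; }
    layers_[i]->Backward(top_vecs_[i], bottom_need_backward_[i],
                         bottom_vecs_[i]);
    if (solver_ == NULL) { continue; }
    // Hand each finished gradient to the solver's callbacks so the
    // all-reduce overlaps with the rest of the backward pass.
    for (int j = 0; j < layers_[i]->blobs().size(); ++j) {
      std::map<std::pair<int, int>, int>::const_iterator it =
          layer_index_params_.find(std::make_pair(i, j));
      if (it == layer_index_params_.end()) { continue; }
      const vector<typename Solver<Dtype>::Callback*>& cbs =
          solver_->callbacks();
      for (int c = 0; c < cbs.size(); ++c) {
        cbs[c]->allreduce(it->second);
      }
    }
  }
}
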
41 changes: 37 additions & 4 deletions include/caffe/parallel.hpp
@@ -14,6 +14,10 @@
#include "caffe/syncedmem.hpp"
#include "caffe/util/blocking_queue.hpp"

#ifdef USE_NCCL
#include "caffe/util/nccl.hpp"
#endif

namespace caffe {

// Represents a net parameters. Once a net is created, its parameter buffers can
@@ -89,7 +93,7 @@ class P2PSync : public GPUParams<Dtype>, public Solver<Dtype>::Callback,
public InternalThread {
public:
explicit P2PSync(shared_ptr<Solver<Dtype> > root_solver,
P2PSync<Dtype>* parent, const SolverParameter& param);
int rank, int nranks, const SolverParameter& param);
virtual ~P2PSync();

inline const shared_ptr<Solver<Dtype> >& solver() const {
@@ -104,18 +108,47 @@ class P2PSync : public GPUParams<Dtype>, public Solver<Dtype>::Callback,
// Divide the batch size by the number of solvers
static void divide_batch_size(NetParameter* net);

#ifdef USE_NCCL
// set the NCCL communicator
void setNCCLComm(ncclComm_t comm);
#endif

public:
void allreduce(int param_id);
void syncCommStream();

protected:
void SetupP2PAccess();
void soft_barrier();
void on_start();
void on_gradients_ready();

void allreduce();
void syncAllStreams();
#ifndef CPU_ONLY
#ifdef USE_NCCL
ncclComm_t getNCCLComm();
#endif
cudaStream_t getCommStream();
#endif
void InternalThreadEntry();

const int rank_;
const int nranks_;
P2PSync<Dtype>* parent_;
vector<P2PSync<Dtype>*> children_;
#ifndef CPU_ONLY
#ifdef USE_NCCL
std::vector<ncclComm_t> nccl_comms_;
#endif
vector<cudaStream_t> comm_streams_;
#endif
BlockingQueue<P2PSync<Dtype>*> queue_;
const int initial_iter_;
Dtype* parent_grads_;

shared_ptr<Solver<Dtype> > solver_;
const SolverParameter& params_;

// per-parameter reduction enabled
bool per_parameter_reduce_;

using Params<Dtype>::size_;
using Params<Dtype>::data_;
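The corresponding parallel.cpp is further down in this commit and not reproduced here, so the following is a hypothetical sketch of the new per-parameter path with NCCL enabled: each rank enqueues an ncclAllReduce over one parameter's gradient on its own communicator and communication stream, and syncCommStream() later blocks until all queued reductions have completed. The data-type selection and the omission of gradient scaling are simplifications of my own, not taken from the commit.

// Hypothetical sketch only; the real implementation is in the parallel.cpp
// portion of this commit, which is not reproduced in this excerpt.
#ifdef USE_NCCL
template <typename Dtype>
void P2PSync<Dtype>::allreduce(int param_id) {
  const vector<Blob<Dtype>*>& params = solver_->net()->learnable_params();
  Dtype* diff = params[param_id]->mutable_gpu_diff();
  const int count = params[param_id]->count();
  const ncclDataType_t nccl_type =
      (sizeof(Dtype) == 4) ? ncclFloat : ncclDouble;
  // Sum this parameter's gradient across all ranks. The call only enqueues
  // work on the communication stream and returns immediately.
  ncclResult_t result = ncclAllReduce(diff, diff, count, nccl_type, ncclSum,
                                      getNCCLComm(), getCommStream());
  CHECK_EQ(result, ncclSuccess) << ncclGetErrorString(result);
}

template <typename Dtype>
void P2PSync<Dtype>::syncCommStream() {
  // Block the host until every reduction queued on the stream has finished.
  CUDA_CHECK(cudaStreamSynchronize(getCommStream()));
}
#endif  // USE_NCCL
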
62 changes: 39 additions & 23 deletions include/caffe/sgd_solvers.hpp
@@ -15,9 +15,11 @@ namespace caffe {
template <typename Dtype>
class SGDSolver : public Solver<Dtype> {
public:
explicit SGDSolver(const SolverParameter& param)
: Solver<Dtype>(param) { PreSolve(); }
explicit SGDSolver(const string& param_file)
explicit SGDSolver(const SolverParameter& param,
Solver<Dtype> *root_solver = NULL)
: Solver<Dtype>(param, root_solver) { PreSolve(); }
explicit SGDSolver(const string& param_file,
Solver<Dtype> *root_solver = NULL)
: Solver<Dtype>(param_file) { PreSolve(); }
virtual inline const char* type() const { return "SGD"; }

@@ -48,10 +50,12 @@ class SGDSolver : public Solver<Dtype> {
template <typename Dtype>
class NesterovSolver : public SGDSolver<Dtype> {
public:
explicit NesterovSolver(const SolverParameter& param)
: SGDSolver<Dtype>(param) {}
explicit NesterovSolver(const string& param_file)
: SGDSolver<Dtype>(param_file) {}
explicit NesterovSolver(const SolverParameter& param,
Solver<Dtype> *root_solver = NULL)
: SGDSolver<Dtype>(param, root_solver) {}
explicit NesterovSolver(const string& param_file,
Solver<Dtype> *root_solver = NULL)
: SGDSolver<Dtype>(param_file, root_solver) {}
virtual inline const char* type() const { return "Nesterov"; }

protected:
@@ -63,10 +67,14 @@ class NesterovSolver : public SGDSolver<Dtype> {
template <typename Dtype>
class AdaGradSolver : public SGDSolver<Dtype> {
public:
explicit AdaGradSolver(const SolverParameter& param)
: SGDSolver<Dtype>(param) { constructor_sanity_check(); }
explicit AdaGradSolver(const string& param_file)
: SGDSolver<Dtype>(param_file) { constructor_sanity_check(); }
explicit AdaGradSolver(const SolverParameter& param,
Solver<Dtype> *root_solver = NULL)
: SGDSolver<Dtype>(param, root_solver)
{ constructor_sanity_check(); }
explicit AdaGradSolver(const string& param_file,
Solver<Dtype> *root_solver = NULL)
: SGDSolver<Dtype>(param_file, root_solver)
{ constructor_sanity_check(); }
virtual inline const char* type() const { return "AdaGrad"; }

protected:
@@ -83,10 +91,14 @@ class AdaGradSolver : public SGDSolver<Dtype> {
template <typename Dtype>
class RMSPropSolver : public SGDSolver<Dtype> {
public:
explicit RMSPropSolver(const SolverParameter& param)
: SGDSolver<Dtype>(param) { constructor_sanity_check(); }
explicit RMSPropSolver(const string& param_file)
: SGDSolver<Dtype>(param_file) { constructor_sanity_check(); }
explicit RMSPropSolver(const SolverParameter& param,
Solver<Dtype> *root_solver = NULL)
: SGDSolver<Dtype>(param, root_solver)
{ constructor_sanity_check(); }
explicit RMSPropSolver(const string& param_file,
Solver<Dtype> *root_solver = NULL)
: SGDSolver<Dtype>(param_file, root_solver)
{ constructor_sanity_check(); }
virtual inline const char* type() const { return "RMSProp"; }

protected:
@@ -106,10 +118,12 @@ class RMSPropSolver : public SGDSolver<Dtype> {
template <typename Dtype>
class AdaDeltaSolver : public SGDSolver<Dtype> {
public:
explicit AdaDeltaSolver(const SolverParameter& param)
: SGDSolver<Dtype>(param) { AdaDeltaPreSolve(); }
explicit AdaDeltaSolver(const string& param_file)
: SGDSolver<Dtype>(param_file) { AdaDeltaPreSolve(); }
explicit AdaDeltaSolver(const SolverParameter& param,
Solver<Dtype> *root_solver = NULL)
: SGDSolver<Dtype>(param, root_solver) { AdaDeltaPreSolve(); }
explicit AdaDeltaSolver(const string& param_file,
Solver<Dtype> *root_solver = NULL)
: SGDSolver<Dtype>(param_file, root_solver) { AdaDeltaPreSolve(); }
virtual inline const char* type() const { return "AdaDelta"; }

protected:
@@ -130,10 +144,12 @@ class AdaDeltaSolver : public SGDSolver<Dtype> {
template <typename Dtype>
class AdamSolver : public SGDSolver<Dtype> {
public:
explicit AdamSolver(const SolverParameter& param)
: SGDSolver<Dtype>(param) { AdamPreSolve();}
explicit AdamSolver(const string& param_file)
: SGDSolver<Dtype>(param_file) { AdamPreSolve(); }
explicit AdamSolver(const SolverParameter& param,
Solver<Dtype> *root_solver = NULL)
: SGDSolver<Dtype>(param, root_solver) { AdamPreSolve();}
explicit AdamSolver(const string& param_file,
Solver<Dtype> *root_solver = NULL)
: SGDSolver<Dtype>(param_file, root_solver) { AdamPreSolve(); }
virtual inline const char* type() const { return "Adam"; }

protected:
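Every solver constructor gains an optional root_solver argument defaulting to NULL, so existing single-GPU call sites keep compiling unchanged. A small usage sketch follows; the prototxt file name and the use of ReadProtoFromTextFileOrDie are illustrative rather than taken from the commit.

#include "caffe/caffe.hpp"

// Illustrative only: build a root solver and a worker solver tied to it
// through the new optional root_solver constructor argument.
void MakeSolvers() {
  caffe::SolverParameter param;
  caffe::ReadProtoFromTextFileOrDie("solver.prototxt", &param);

  caffe::SGDSolver<float> root(param);           // root_solver defaults to NULL
  caffe::SGDSolver<float> worker(param, &root);  // worker references the root
}
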
12 changes: 11 additions & 1 deletion include/caffe/solver.hpp
@@ -6,6 +6,7 @@

#include "caffe/net.hpp"
#include "caffe/solver_factory.hpp"
#include "caffe/util/benchmark.hpp"

namespace caffe {

@@ -76,9 +77,14 @@ class Solver {

// Invoked at specific points during an iteration
class Callback {
public:
virtual void allreduce(int param_id) = 0;
virtual void syncCommStream() = 0;

protected:
virtual void on_start() = 0;
virtual void on_gradients_ready() = 0;
virtual void allreduce() = 0;
virtual void soft_barrier() = 0;

template <typename T>
friend class Solver;
@@ -129,6 +135,10 @@ class Solver {
// True iff a request to stop early was received.
bool requested_early_exit_;

// Timing information
Timer iteration_timer_;
float iterations_last_;

DISABLE_COPY_AND_ASSIGN(Solver);
};

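Taken together, the Callback changes split the old end-of-iteration reduction into per-parameter pieces that the net can trigger during backward. Below is a hypothetical sketch of the callback order inside one training iteration as seen from the solver; the method name, the no-argument Net::ForwardBackward(), and the comments are assumptions, since the commit's solver.cpp is not shown in this excerpt.

// Hypothetical sketch of one iteration's callback sequence; not the
// commit's actual solver.cpp.
template <typename Dtype>
void Solver<Dtype>::StepOneIteration() {
  for (int c = 0; c < callbacks_.size(); ++c) {
    callbacks_[c]->on_start();            // e.g. broadcast weights from rank 0
  }
  net_->ClearParamDiffs();
  const Dtype loss = net_->ForwardBackward();  // Net calls allreduce(param_id)
                                               // per parameter during backward
  for (int c = 0; c < callbacks_.size(); ++c) {
    callbacks_[c]->syncCommStream();      // wait for outstanding reductions
    callbacks_[c]->on_gradients_ready();  // e.g. final scaling or bookkeeping
  }
  ApplyUpdate();
  LOG(INFO) << "iteration loss = " << loss;
}
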
(The remaining changed files in this commit are not shown in this excerpt.)
