NCCL integration
slayton58 authored and drnikolaev committed Jun 2, 2016
1 parent d988833 commit 6b4d102
Showing 26 changed files with 474 additions and 306 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -37,6 +37,7 @@ caffe_option(BUILD_python_layer "Build the Caffe Python layer" ON)
caffe_option(USE_OPENCV "Build with OpenCV support" ON)
caffe_option(USE_LEVELDB "Build with levelDB" ON)
caffe_option(USE_LMDB "Build with lmdb" ON)
caffe_option(USE_NCCL "Build with NCCL Library for multi-GPU support" ON IF NOT CPU_ONLY)
caffe_option(ALLOW_LMDB_NOLOCK "Allow MDB_NOLOCK when reading LMDB files (only if necessary)" OFF)

# ---[ Dependencies
6 changes: 6 additions & 0 deletions Makefile
@@ -334,6 +334,12 @@ ifeq ($(USE_CUDNN), 1)
COMMON_FLAGS += -DUSE_CUDNN
endif

# NCCL acceleration configuration
ifeq ($(USE_NCCL), 1)
LIBRARIES += nccl
COMMON_FLAGS += -DUSE_NCCL
endif

# configure IO libraries
ifeq ($(USE_OPENCV), 1)
COMMON_FLAGS += -DUSE_OPENCV
4 changes: 4 additions & 0 deletions Makefile.config.example
@@ -5,6 +5,10 @@
# cuDNN version 4 or higher is required.
# USE_CUDNN := 1

# NCCL acceleration switch (uncomment to build with NCCL)
# See https://github.com/NVIDIA/nccl
# USE_NCCL := 1

# CPU-only switch (uncomment to build without GPU support).
# cuDNN version 4 or higher is required.
# CPU_ONLY := 1
4 changes: 4 additions & 0 deletions cmake/ConfigGen.cmake
@@ -81,6 +81,10 @@ function(caffe_generate_export_configs)
list(APPEND Caffe_DEFINITIONS -DUSE_MKL)
endif()

if(USE_NCCL)
list(APPEND Caffe_DEFINITIONS -DUSE_NCCL)
endif()

configure_file("cmake/Templates/CaffeConfig.cmake.in" "${PROJECT_BINARY_DIR}/CaffeConfig.cmake" @ONLY)

# Add targets to the build-tree export set
8 changes: 8 additions & 0 deletions cmake/Dependencies.cmake
@@ -170,3 +170,11 @@ endif()
if(BUILD_docs)
find_package(Doxygen)
endif()

# ---[ NCCL
if(USE_NCCL)
add_definitions(-DUSE_NCCL)
find_package(NCCL REQUIRED)
include_directories(SYSTEM ${NCCL_INCLUDE})
list(APPEND Caffe_LINKER_LIBS ${NCCL_LIBRARIES})
endif()
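
With the build changes above, Caffe is compiled with -DUSE_NCCL and linked against libnccl. For reference, here is a minimal standalone sketch (not part of this commit) of the all-reduce primitive the multi-GPU code builds on, written against the single-process, one-communicator-per-GPU pattern of NCCL 1.x, the version current when this change landed; error checking is omitted and the file name in the build comment is illustrative.

// Minimal NCCL all-reduce sketch, illustrative only.
// Build (assumption): nvcc nccl_allreduce.cc -lnccl -o nccl_allreduce
#include <cuda_runtime.h>
#include <nccl.h>

#include <cstdio>
#include <vector>

int main() {
  int ndev = 0;
  cudaGetDeviceCount(&ndev);
  if (ndev == 0) return 0;

  // One NCCL communicator per visible GPU, all owned by this process.
  std::vector<int> devs(ndev);
  for (int i = 0; i < ndev; ++i) devs[i] = i;
  std::vector<ncclComm_t> comms(ndev);
  ncclCommInitAll(comms.data(), ndev, devs.data());

  const int count = 1 << 20;  // elements per GPU buffer
  std::vector<float*> grads(ndev);
  std::vector<cudaStream_t> streams(ndev);
  for (int i = 0; i < ndev; ++i) {
    cudaSetDevice(i);
    cudaMalloc(&grads[i], count * sizeof(float));
    cudaMemset(grads[i], 0, count * sizeof(float));
    cudaStreamCreate(&streams[i]);
  }

  // Sum the per-GPU buffers in place; with NCCL 2.x this loop would need to
  // be wrapped in ncclGroupStart()/ncclGroupEnd().
  for (int i = 0; i < ndev; ++i) {
    cudaSetDevice(i);
    ncclAllReduce(grads[i], grads[i], count, ncclFloat, ncclSum,
                  comms[i], streams[i]);
  }
  for (int i = 0; i < ndev; ++i) {
    cudaSetDevice(i);
    cudaStreamSynchronize(streams[i]);
  }

  for (int i = 0; i < ndev; ++i) {
    cudaSetDevice(i);
    cudaFree(grads[i]);
    cudaStreamDestroy(streams[i]);
    ncclCommDestroy(comms[i]);
  }
  std::printf("all-reduce completed on %d GPU(s)\n", ndev);
  return 0;
}
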
22 changes: 22 additions & 0 deletions cmake/Modules/FindNCCL.cmake
@@ -0,0 +1,22 @@
# Find the NCCL libraries
#
# The following variables are optionally searched for defaults
# NCCL_ROOT_DIR: Base directory where all NCCL components are found
#
# The following are set after configuration is done:
# NCCL_FOUND
# NCCL_INCLUDE_DIR
# NCCL_LIBRARIES

find_path(NCCL_INCLUDE_DIR NAMES nccl.h PATHS ${NCCL_ROOT_DIR})

find_library(NCCL_LIBRARIES NAMES nccl PATHS ${NCCL_ROOT_DIR})

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIR NCCL_LIBRARIES)

if(NCCL_FOUND)
message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIR}, library: ${NCCL_LIBRARIES})")
mark_as_advanced(NCCL_INCLUDE_DIR NCCL_LIBRARIES)
endif()

5 changes: 5 additions & 0 deletions cmake/Summary.cmake
@@ -146,6 +146,11 @@ function(caffe_print_configuration_summary)
else()
caffe_status(" cuDNN : Disabled")
endif()
if(USE_NCCL)
caffe_status(" NCCL : " NCCL_FOUND THEN "Yes" ELSE "Not found")
else()
caffe_status(" NCCL : Disabled")
endif()
caffe_status("")
endif()
if(HAVE_PYTHON)
7 changes: 4 additions & 3 deletions cmake/Templates/caffe_config.h.in
@@ -4,15 +4,16 @@
/* Binaries directory */
#define BINARY_FOLDER "${PROJECT_BINARY_DIR}"

/* NVIDA Cuda */
/* NVIDIA Cuda */
#cmakedefine HAVE_CUDA

/* NVIDA cuDNN */
/* NVIDIA cuDNN */
#cmakedefine HAVE_CUDNN
#cmakedefine USE_CUDNN

/* NVIDA cuDNN */
/* NVIDIA cuDNN */
#cmakedefine CPU_ONLY
#cmakedefine USE_NCCL

/* Test device */
#define CUDA_TEST_DEVICE ${CUDA_TEST_DEVICE}
12 changes: 12 additions & 0 deletions include/caffe/net.hpp
@@ -14,6 +14,9 @@

namespace caffe {

template <typename Dtype>
class Solver;

/**
* @brief Connects Layer%s together into a directed acyclic graph (DAG)
* specified by a NetParameter.
@@ -227,6 +230,11 @@ class Net {
static bool StateMeetsRule(const NetState& state, const NetStateRule& rule,
const string& layer_name);

/// @brief set a Solver for this net
void SetSolver(Solver<Dtype>* s) {
solver_ = s;
}

protected:
// Helpers for Init.
/// @brief Append a new top blob to the net.
@@ -278,6 +286,8 @@ class Net {
vector<int> param_owners_;
vector<string> param_display_names_;
vector<pair<int, int> > param_layer_indices_;
/// (layer, blob) -> param_id map
map<pair<int, int>, int> layer_index_params_;
map<string, int> param_names_index_;
/// blob indices for the input and the output of the net
vector<int> net_input_blob_indices_;
@@ -307,6 +317,8 @@
bool debug_info_;
/// The root net that actually holds the shared layers in data parallelism
const Net* const root_net_;
/// Pointer to the solver being used with this net
Solver<Dtype>* solver_;
DISABLE_COPY_AND_ASSIGN(Net);
};

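The matching net.cpp changes are part of this commit but not included in the excerpt above, so the following is only a hypothetical sketch of how the new solver_ pointer and the (layer, blob) -> param_id map could fit together: as soon as a layer finishes its backward pass, each of its learnable blobs is looked up and its param_id handed to the solver's callbacks, so the gradient reduction can overlap with backpropagation of the remaining layers. The control flow and the callbacks() accessor are assumptions, not the commit's actual code.

// Hypothetical sketch only; the commit's real net.cpp is not shown above.
template <typename Dtype>
void Net<Dtype>::BackwardFromTo(int start, int end) {
  for (int i = start; i >= end; --i) {
    if (!layer_need_backward_[i]) { continue; }
    layers_[i]->Backward(top_vecs_[i], bottom_need_backward_[i],
                         bottom_vecs_[i]);
    if (solver_ == NULL) { continue; }
    // Hand each finished gradient to the solver's callbacks so the
    // all-reduce overlaps with the rest of the backward pass.
    for (int j = 0; j < layers_[i]->blobs().size(); ++j) {
      std::map<std::pair<int, int>, int>::const_iterator it =
          layer_index_params_.find(std::make_pair(i, j));
      if (it == layer_index_params_.end()) { continue; }
      const vector<typename Solver<Dtype>::Callback*>& cbs =
          solver_->callbacks();
      for (int c = 0; c < cbs.size(); ++c) {
        cbs[c]->allreduce(it->second);
      }
    }
  }
}
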
41 changes: 37 additions & 4 deletions include/caffe/parallel.hpp
@@ -14,6 +14,10 @@
#include "caffe/syncedmem.hpp"
#include "caffe/util/blocking_queue.hpp"

#ifdef USE_NCCL
#include "caffe/util/nccl.hpp"
#endif

namespace caffe {

// Represents a net parameters. Once a net is created, its parameter buffers can
@@ -89,7 +93,7 @@ class P2PSync : public GPUParams<Dtype>, public Solver<Dtype>::Callback,
public InternalThread {
public:
explicit P2PSync(shared_ptr<Solver<Dtype> > root_solver,
P2PSync<Dtype>* parent, const SolverParameter& param);
int rank, int nranks, const SolverParameter& param);
virtual ~P2PSync();

inline const shared_ptr<Solver<Dtype> >& solver() const {
@@ -104,18 +108,47 @@ class P2PSync : public GPUParams<Dtype>, public Solver<Dtype>::Callback,
// Divide the batch size by the number of solvers
static void divide_batch_size(NetParameter* net);

#ifdef USE_NCCL
// set the NCCL communicator
void setNCCLComm(ncclComm_t comm);
#endif

public:
void allreduce(int param_id);
void syncCommStream();

protected:
void SetupP2PAccess();
void soft_barrier();
void on_start();
void on_gradients_ready();

void allreduce();
void syncAllStreams();
#ifndef CPU_ONLY
#ifdef USE_NCCL
ncclComm_t getNCCLComm();
#endif
cudaStream_t getCommStream();
#endif
void InternalThreadEntry();

const int rank_;
const int nranks_;
P2PSync<Dtype>* parent_;
vector<P2PSync<Dtype>*> children_;
#ifndef CPU_ONLY
#ifdef USE_NCCL
std::vector<ncclComm_t> nccl_comms_;
#endif
vector<cudaStream_t> comm_streams_;
#endif
BlockingQueue<P2PSync<Dtype>*> queue_;
const int initial_iter_;
Dtype* parent_grads_;

shared_ptr<Solver<Dtype> > solver_;
const SolverParameter& params_;

// per-parameter reduction enabled
bool per_parameter_reduce_;

using Params<Dtype>::size_;
using Params<Dtype>::data_;
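The corresponding parallel.cpp is further down in this commit and not reproduced here, so the following is a hypothetical sketch of the new per-parameter path with NCCL enabled: each rank enqueues an ncclAllReduce over one parameter's gradient on its own communicator and communication stream, and syncCommStream() later blocks until all queued reductions have completed. The data-type selection and the omission of gradient scaling are simplifications of my own, not taken from the commit.

// Hypothetical sketch only; the real implementation is in the parallel.cpp
// portion of this commit, which is not reproduced in this excerpt.
#ifdef USE_NCCL
template <typename Dtype>
void P2PSync<Dtype>::allreduce(int param_id) {
  const vector<Blob<Dtype>*>& params = solver_->net()->learnable_params();
  Dtype* diff = params[param_id]->mutable_gpu_diff();
  const int count = params[param_id]->count();
  const ncclDataType_t nccl_type =
      (sizeof(Dtype) == 4) ? ncclFloat : ncclDouble;
  // Sum this parameter's gradient across all ranks. The call only enqueues
  // work on the communication stream and returns immediately.
  ncclResult_t result = ncclAllReduce(diff, diff, count, nccl_type, ncclSum,
                                      getNCCLComm(), getCommStream());
  CHECK_EQ(result, ncclSuccess) << ncclGetErrorString(result);
}

template <typename Dtype>
void P2PSync<Dtype>::syncCommStream() {
  // Block the host until every reduction queued on the stream has finished.
  CUDA_CHECK(cudaStreamSynchronize(getCommStream()));
}
#endif  // USE_NCCL
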
62 changes: 39 additions & 23 deletions include/caffe/sgd_solvers.hpp
@@ -15,9 +15,11 @@ namespace caffe {
template <typename Dtype>
class SGDSolver : public Solver<Dtype> {
public:
explicit SGDSolver(const SolverParameter& param)
: Solver<Dtype>(param) { PreSolve(); }
explicit SGDSolver(const string& param_file)
explicit SGDSolver(const SolverParameter& param,
Solver<Dtype> *root_solver = NULL)
: Solver<Dtype>(param, root_solver) { PreSolve(); }
explicit SGDSolver(const string& param_file,
Solver<Dtype> *root_solver = NULL)
: Solver<Dtype>(param_file) { PreSolve(); }
virtual inline const char* type() const { return "SGD"; }

@@ -48,10 +50,12 @@ class SGDSolver : public Solver<Dtype> {
template <typename Dtype>
class NesterovSolver : public SGDSolver<Dtype> {
public:
explicit NesterovSolver(const SolverParameter& param)
: SGDSolver<Dtype>(param) {}
explicit NesterovSolver(const string& param_file)
: SGDSolver<Dtype>(param_file) {}
explicit NesterovSolver(const SolverParameter& param,
Solver<Dtype> *root_solver = NULL)
: SGDSolver<Dtype>(param, root_solver) {}
explicit NesterovSolver(const string& param_file,
Solver<Dtype> *root_solver = NULL)
: SGDSolver<Dtype>(param_file, root_solver) {}
virtual inline const char* type() const { return "Nesterov"; }

protected:
@@ -63,10 +67,14 @@ class NesterovSolver : public SGDSolver<Dtype> {
template <typename Dtype>
class AdaGradSolver : public SGDSolver<Dtype> {
public:
explicit AdaGradSolver(const SolverParameter& param)
: SGDSolver<Dtype>(param) { constructor_sanity_check(); }
explicit AdaGradSolver(const string& param_file)
: SGDSolver<Dtype>(param_file) { constructor_sanity_check(); }
explicit AdaGradSolver(const SolverParameter& param,
Solver<Dtype> *root_solver = NULL)
: SGDSolver<Dtype>(param, root_solver)
{ constructor_sanity_check(); }
explicit AdaGradSolver(const string& param_file,
Solver<Dtype> *root_solver = NULL)
: SGDSolver<Dtype>(param_file, root_solver)
{ constructor_sanity_check(); }
virtual inline const char* type() const { return "AdaGrad"; }

protected:
@@ -83,10 +91,14 @@ class AdaGradSolver : public SGDSolver<Dtype> {
template <typename Dtype>
class RMSPropSolver : public SGDSolver<Dtype> {
public:
explicit RMSPropSolver(const SolverParameter& param)
: SGDSolver<Dtype>(param) { constructor_sanity_check(); }
explicit RMSPropSolver(const string& param_file)
: SGDSolver<Dtype>(param_file) { constructor_sanity_check(); }
explicit RMSPropSolver(const SolverParameter& param,
Solver<Dtype> *root_solver = NULL)
: SGDSolver<Dtype>(param, root_solver)
{ constructor_sanity_check(); }
explicit RMSPropSolver(const string& param_file,
Solver<Dtype> *root_solver = NULL)
: SGDSolver<Dtype>(param_file, root_solver)
{ constructor_sanity_check(); }
virtual inline const char* type() const { return "RMSProp"; }

protected:
@@ -106,10 +118,12 @@ class RMSPropSolver : public SGDSolver<Dtype> {
template <typename Dtype>
class AdaDeltaSolver : public SGDSolver<Dtype> {
public:
explicit AdaDeltaSolver(const SolverParameter& param)
: SGDSolver<Dtype>(param) { AdaDeltaPreSolve(); }
explicit AdaDeltaSolver(const string& param_file)
: SGDSolver<Dtype>(param_file) { AdaDeltaPreSolve(); }
explicit AdaDeltaSolver(const SolverParameter& param,
Solver<Dtype> *root_solver = NULL)
: SGDSolver<Dtype>(param, root_solver) { AdaDeltaPreSolve(); }
explicit AdaDeltaSolver(const string& param_file,
Solver<Dtype> *root_solver = NULL)
: SGDSolver<Dtype>(param_file, root_solver) { AdaDeltaPreSolve(); }
virtual inline const char* type() const { return "AdaDelta"; }

protected:
@@ -130,10 +144,12 @@ class AdaDeltaSolver : public SGDSolver<Dtype> {
template <typename Dtype>
class AdamSolver : public SGDSolver<Dtype> {
public:
explicit AdamSolver(const SolverParameter& param)
: SGDSolver<Dtype>(param) { AdamPreSolve();}
explicit AdamSolver(const string& param_file)
: SGDSolver<Dtype>(param_file) { AdamPreSolve(); }
explicit AdamSolver(const SolverParameter& param,
Solver<Dtype> *root_solver = NULL)
: SGDSolver<Dtype>(param, root_solver) { AdamPreSolve();}
explicit AdamSolver(const string& param_file,
Solver<Dtype> *root_solver = NULL)
: SGDSolver<Dtype>(param_file, root_solver) { AdamPreSolve(); }
virtual inline const char* type() const { return "Adam"; }

protected:
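Every solver constructor gains an optional root_solver argument defaulting to NULL, so existing single-GPU call sites keep compiling unchanged. A small usage sketch follows; the prototxt file name and the use of ReadProtoFromTextFileOrDie are illustrative rather than taken from the commit.

#include "caffe/caffe.hpp"

// Illustrative only: build a root solver and a worker solver tied to it
// through the new optional root_solver constructor argument.
void MakeSolvers() {
  caffe::SolverParameter param;
  caffe::ReadProtoFromTextFileOrDie("solver.prototxt", &param);

  caffe::SGDSolver<float> root(param);           // root_solver defaults to NULL
  caffe::SGDSolver<float> worker(param, &root);  // worker references the root
}
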
12 changes: 11 additions & 1 deletion include/caffe/solver.hpp
@@ -6,6 +6,7 @@

#include "caffe/net.hpp"
#include "caffe/solver_factory.hpp"
#include "caffe/util/benchmark.hpp"

namespace caffe {

@@ -76,9 +77,14 @@ class Solver {

// Invoked at specific points during an iteration
class Callback {
public:
virtual void allreduce(int param_id) = 0;
virtual void syncCommStream() = 0;

protected:
virtual void on_start() = 0;
virtual void on_gradients_ready() = 0;
virtual void allreduce() = 0;
virtual void soft_barrier() = 0;

template <typename T>
friend class Solver;
@@ -129,6 +135,10 @@ class Solver {
// True iff a request to stop early was received.
bool requested_early_exit_;

// Timing information
Timer iteration_timer_;
float iterations_last_;

DISABLE_COPY_AND_ASSIGN(Solver);
};

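Taken together, the Callback changes split the old end-of-iteration reduction into per-parameter pieces that the net can trigger during backward. Below is a hypothetical sketch of the callback order inside one training iteration as seen from the solver; the method name, the no-argument Net::ForwardBackward(), and the comments are assumptions, since the commit's solver.cpp is not shown in this excerpt.

// Hypothetical sketch of one iteration's callback sequence; not the
// commit's actual solver.cpp.
template <typename Dtype>
void Solver<Dtype>::StepOneIteration() {
  for (int c = 0; c < callbacks_.size(); ++c) {
    callbacks_[c]->on_start();            // e.g. broadcast weights from rank 0
  }
  net_->ClearParamDiffs();
  const Dtype loss = net_->ForwardBackward();  // Net calls allreduce(param_id)
                                               // per parameter during backward
  for (int c = 0; c < callbacks_.size(); ++c) {
    callbacks_[c]->syncCommStream();      // wait for outstanding reductions
    callbacks_[c]->on_gradients_ready();  // e.g. final scaling or bookkeeping
  }
  ApplyUpdate();
  LOG(INFO) << "iteration loss = " << loss;
}
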
(The remaining changed files in this commit are not shown in this excerpt.)
