From 2b7c2e4f6fb050e2257f165a150bf4c912deb56b Mon Sep 17 00:00:00 2001 From: Ronghang Hu Date: Sun, 9 Aug 2015 14:43:47 -0700 Subject: [PATCH] rebase & clean up HDF5DataLayer Prefetch Adapt HDF5DataLayer Prefetch to #2836 --- include/caffe/util/hdf5.hpp | 36 ++++++-- include/caffe/util/io.hpp | 28 ------ src/caffe/layers/hdf5_data_layer.cpp | 8 +- src/caffe/test/test_hdf5data_layer.cpp | 2 +- src/caffe/util/hdf5.cpp | 102 +++++++++++++++++++-- src/caffe/util/io.cpp | 117 ------------------------- 6 files changed, 128 insertions(+), 165 deletions(-) diff --git a/include/caffe/util/hdf5.hpp b/include/caffe/util/hdf5.hpp index ce568c5eb0d..4df8213bbd9 100644 --- a/include/caffe/util/hdf5.hpp +++ b/include/caffe/util/hdf5.hpp @@ -10,21 +10,43 @@ namespace caffe { +/** + * @brief Shapes a Blob to read "num" rows of HDF5 data. If num == -1, take + * the num of the HDF5 dataset. + * + * @param file_id the HDF5 file handle + * @param dataset_name the name of the HDF5 dataset to read + * @param num the number of rows to read: either num >= 0, + * or num == -1 for the number of rows in the HDF5 dataset + * @param blob the Blob to shape + * + * The HDF5 dataset could be N(>=1) dimensions as long as N doesn't exceed + * Blob's maximum dimension. + */ template -void hdf5_load_nd_dataset_helper( - hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, +void HDF5PrepareBlob(hid_t file_id, const char* dataset_name, int num, Blob* blob); +/** + * @brief Reads rows [offset, offset + data->num() - 1] into Blob* data, which + * must have been pre-shaped using HDF5PrepareBlob (or otherwise). 
+ */ template -void hdf5_load_nd_dataset( - hid_t file_id, const char* dataset_name_, int min_dim, int max_dim, - Blob* blob); +int HDF5ReadRowsToBlob(hid_t file_id, const char* dataset_name, int h5_offset, + int blob_offset, Blob* blob); + +template +void hdf5_load_nd_dataset_helper(hid_t file_id, const char* dataset_name_, + int min_dim, int max_dim, Blob* blob); + +template +void hdf5_load_nd_dataset(hid_t file_id, const char* dataset_name_, int min_dim, + int max_dim, Blob* blob); template void hdf5_save_nd_dataset( - const hid_t file_id, const string& dataset_name, const Blob& blob, + const hid_t file_id, const string& dataset_name, const Blob& blob, bool write_diff = false); - int hdf5_load_int(hid_t loc_id, const string& dataset_name); void hdf5_save_int(hid_t loc_id, const string& dataset_name, int i); string hdf5_load_string(hid_t loc_id, const string& dataset_name); diff --git a/include/caffe/util/io.hpp b/include/caffe/util/io.hpp index 3ea24d1bd85..c0938ad0625 100644 --- a/include/caffe/util/io.hpp +++ b/include/caffe/util/io.hpp @@ -136,34 +136,6 @@ cv::Mat DecodeDatumToCVMat(const Datum& datum, bool is_color); void CVMatToDatum(const cv::Mat& cv_img, Datum* datum); -/** - * @brief Shapes a Blob to read "num" rows of HDF5 data. If num == -1, take - * the num of the HDF5 dataset. - * - * @param file_id the HDF5 file handle - * @param dataset_name the name of the HDF5 dataset to read - * @param num the number of rows to read: either num >= 0, - * or num == -1 for the number of rows in the HDF5 dataset - * @param blob the Blob to shape - * - * The HDF5 dataset could be N(>=1) dimensions as long as N doesn't exceed Blob's maximum dimension. - */ -template -void HDF5PrepareBlob(hid_t file_id, const char* dataset_name, int num, - Blob* blob); - -/** - * @brief Reads rows [offset, offset + data->num() - 1] into Blob* data, which - * must have been pre-shaped using HDF5PrepareBlob (or otherwise). 
- */ -template -int HDF5ReadRowsToBlob(hid_t file_id, const char* dataset_name, - int h5_offset, int blob_offset, Blob* blob); - -template -void hdf5_save_nd_dataset( - const hid_t file_id, const string& dataset_name, const Blob& blob); - } // namespace caffe #endif // CAFFE_UTIL_IO_H_ diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index 5aea2932f16..0167fd6a260 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -8,8 +8,6 @@ #include #include -#include "hdf5.h" -#include "hdf5_hl.h" #include "stdint.h" #include "caffe/data_layers.hpp" @@ -77,7 +75,7 @@ void HDF5DataLayer::FillHDF5FileData() { template void HDF5DataLayer::DataLayerSetUp(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { // Refuse transformation parameters since HDF5 is totally generic. CHECK(!this->layer_param_.has_transform_param()) << this->type() << " does not transform data."; @@ -151,14 +149,12 @@ void HDF5DataLayer::InternalThreadEntry() { template void HDF5DataLayer::Forward_cpu(const vector*>& bottom, - const vector*>& top) { + const vector*>& top) { this->JoinPrefetchThread(); - for (int i = 0; i < top.size(); ++i) { const int count = top[i]->count(); caffe_copy(count, hdf_blobs_[i]->cpu_data(), top[i]->mutable_cpu_data()); } - this->CreatePrefetchThread(); } diff --git a/src/caffe/test/test_hdf5data_layer.cpp b/src/caffe/test/test_hdf5data_layer.cpp index e2f24d1bb79..a1225f04f28 100644 --- a/src/caffe/test/test_hdf5data_layer.cpp +++ b/src/caffe/test/test_hdf5data_layer.cpp @@ -117,7 +117,7 @@ class HDF5DataLayerTest : public MultiDeviceTest { } } } -}; +}; TYPED_TEST_CASE(HDF5DataLayerTest, TestDtypesAndDevices); diff --git a/src/caffe/util/hdf5.cpp b/src/caffe/util/hdf5.cpp index d0d05f70f8f..53ba45dbc6e 100644 --- a/src/caffe/util/hdf5.cpp +++ b/src/caffe/util/hdf5.cpp @@ -1,10 +1,102 @@ #include "caffe/util/hdf5.hpp" +#include #include #include namespace caffe { +template +void 
HDF5PrepareBlob(hid_t file_id, const char* dataset_name, int num, + Blob* blob) { + // Verify that the dataset exists. + CHECK(H5LTfind_dataset(file_id, dataset_name)) + << "Failed to find HDF5 dataset " << dataset_name; + herr_t status; + int ndims; + CHECK_LE(0, H5LTget_dataset_ndims(file_id, dataset_name, &ndims)) + << "Failed to get dataset ndims for " << dataset_name; + CHECK_GE(ndims, 1) << "HDF5 dataset must have at least 1 dimension."; + CHECK_LE(ndims, kMaxBlobAxes) + << "HDF5 dataset must have at most " + << kMaxBlobAxes << " dimensions, to fit in a Blob."; + + // Verify that the data format is what we expect: float or double. + std::vector dims(ndims); + H5T_class_t h5_class; + status = H5LTget_dataset_info( + file_id, dataset_name, dims.data(), &h5_class, NULL); + CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name; + CHECK_EQ(h5_class, H5T_FLOAT) << "Expected float or double data"; + CHECK_GE(num, -1) << "num must be -1 (to indicate the number of rows " + "in the dataset) or non-negative."; + + vector blob_dims(dims.size()); + blob_dims[0] = (num == -1) ? 
dims[0] : num; + for (int i = 1; i < dims.size(); ++i) { + blob_dims[i] = dims[i]; + } + blob->Reshape(blob_dims); +} + +template +void HDF5PrepareBlob(hid_t file_id, const char* dataset_name, int num, + Blob* blob); + +template +void HDF5PrepareBlob(hid_t file_id, const char* dataset_name, int num, + Blob* blob); + +template +int HDF5ReadRowsToBlob(hid_t file_id, const char* dataset_name, + int h5_offset, int blob_offset, Blob* blob) { + int ndims; + CHECK_LE(0, H5LTget_dataset_ndims(file_id, dataset_name, &ndims)) + << "Failed to get dataset ndims for " << dataset_name; + std::vector dims(ndims); + H5T_class_t h5_class; + herr_t status = H5LTget_dataset_info( + file_id, dataset_name, dims.data(), &h5_class, NULL); + CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name; + CHECK_EQ(h5_class, H5T_FLOAT) << "Expected float or double data"; + const int num_rows_available = dims[0] - h5_offset; + const int num_rows = std::min(blob->num() - blob_offset, num_rows_available); + if (num_rows <= 0) {  // nothing to read; no HDF5 handles are open yet + return 0; + } + hid_t dataset = H5Dopen2(file_id, dataset_name, H5P_DEFAULT); + hid_t dataspace = H5Dget_space(dataset); + vector slab_start(ndims, 0); + slab_start[0] = h5_offset; + vector slab_count(ndims, num_rows); + for (int i = 1; i < ndims; ++i) { + slab_count[i] = dims[i]; + } + status = H5Sselect_hyperslab(dataspace, H5S_SELECT_SET, + slab_start.data(), NULL, slab_count.data(), NULL); + CHECK_GE(status, 0) << "Failed to select slab."; + hid_t memspace = H5Screate_simple(ndims, slab_count.data(), NULL); + const int data_size = blob->count() / blob->num(); + // separate multiplication to avoid a possible overflow + const int blob_offset_size = blob_offset * data_size; + hid_t type = (sizeof(Dtype) == 4) ? 
H5T_NATIVE_FLOAT : H5T_NATIVE_DOUBLE; + status = H5Dread(dataset, type, memspace, dataspace, H5P_DEFAULT, + blob->mutable_cpu_data() + blob_offset_size); + CHECK_GE(status, 0) << "Failed to read dataset " << dataset_name; + H5Dclose(dataset); + H5Sclose(dataspace); + H5Sclose(memspace); + return num_rows; +} + +template +int HDF5ReadRowsToBlob(hid_t file_id, const char* dataset_name, + int h5_offset, int blob_offset, Blob* data); + +template +int HDF5ReadRowsToBlob(hid_t file_id, const char* dataset_name, + int h5_offset, int blob_offset, Blob* data); + // Verifies format of data stored in HDF5 file and reshapes blob accordingly. template void hdf5_load_nd_dataset_helper( @@ -59,7 +151,7 @@ void hdf5_save_nd_dataset( const hid_t file_id, const string& dataset_name, const Blob& blob, bool write_diff) { int num_axes = blob.num_axes(); - hsize_t *dims = new hsize_t[num_axes]; + std::vector dims(num_axes); for (int i = 0; i < num_axes; ++i) { dims[i] = blob.shape(i); } @@ -70,9 +162,8 @@ void hdf5_save_nd_dataset( data = blob.cpu_data(); } herr_t status = H5LTmake_dataset_float( - file_id, dataset_name.c_str(), num_axes, dims, data); + file_id, dataset_name.c_str(), num_axes, dims.data(), data); CHECK_GE(status, 0) << "Failed to make float dataset " << dataset_name; - delete[] dims; } template <> @@ -80,7 +171,7 @@ void hdf5_save_nd_dataset( hid_t file_id, const string& dataset_name, const Blob& blob, bool write_diff) { int num_axes = blob.num_axes(); - hsize_t *dims = new hsize_t[num_axes]; + std::vector dims(num_axes); for (int i = 0; i < num_axes; ++i) { dims[i] = blob.shape(i); } @@ -91,9 +182,8 @@ void hdf5_save_nd_dataset( data = blob.cpu_data(); } herr_t status = H5LTmake_dataset_double( - file_id, dataset_name.c_str(), num_axes, dims, data); + file_id, dataset_name.c_str(), num_axes, dims.data(), data); CHECK_GE(status, 0) << "Failed to make double dataset " << dataset_name; - delete[] dims; } string hdf5_load_string(hid_t loc_id, const string& dataset_name) { 
diff --git a/src/caffe/util/io.cpp b/src/caffe/util/io.cpp index 7954166460f..6f03314202c 100644 --- a/src/caffe/util/io.cpp +++ b/src/caffe/util/io.cpp @@ -228,122 +228,5 @@ void CVMatToDatum(const cv::Mat& cv_img, Datum* datum) { datum->set_data(buffer); } -// Verifies format of data stored in HDF5 file and reshapes blob accordingly. -template -void HDF5PrepareBlob(hid_t file_id, const char* dataset_name, int num, - Blob* blob) { - // Verify that the dataset exists. - CHECK(H5LTfind_dataset(file_id, dataset_name)) - << "Failed to find HDF5 dataset " << dataset_name; - herr_t status; - int ndims; - CHECK_LE(0, H5LTget_dataset_ndims(file_id, dataset_name, &ndims)) - << "Failed to get dataset ndims for " << dataset_name; - CHECK_GE(ndims, 1) << "HDF5 dataset must have at least 1 dimension."; - CHECK_LE(ndims, kMaxBlobAxes) - << "HDF5 dataset must have at most " - << kMaxBlobAxes << " dimensions, to fit in a Blob."; - - // Verify that the data format is what we expect: float or double. - std::vector dims(ndims); - H5T_class_t h5_class; - status = H5LTget_dataset_info( - file_id, dataset_name, dims.data(), &h5_class, NULL); - CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name; - CHECK_EQ(h5_class, H5T_FLOAT) << "Expected float or double data"; - CHECK_GE(num, -1) << "num must be -1 (to indicate the number of rows" - "in the dataset) or non-negative."; - - vector blob_dims(dims.size()); - blob_dims[0] = (num == -1) ? 
dims[0] : num; - for (int i = 1; i < dims.size(); ++i) { - blob_dims[i] = dims[i]; - } - blob->Reshape(blob_dims); -} - -template -void HDF5PrepareBlob(hid_t file_id, const char* dataset_name, int num, - Blob* blob); - -template -void HDF5PrepareBlob(hid_t file_id, const char* dataset_name, int num, - Blob* blob); - -template -int HDF5ReadRowsToBlob(hid_t file_id, const char* dataset_name, - int h5_offset, int blob_offset, Blob* blob) { - int ndims; - CHECK_LE(0, H5LTget_dataset_ndims(file_id, dataset_name, &ndims)) - << "Failed to get dataset ndims for " << dataset_name; - std::vector dims(ndims); - H5T_class_t h5_class; - herr_t status = H5LTget_dataset_info( - file_id, dataset_name, dims.data(), &h5_class, NULL); - CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name; - CHECK_EQ(h5_class, H5T_FLOAT) << "Expected float or double data"; - hid_t dataset = H5Dopen2(file_id, dataset_name, H5P_DEFAULT); - hid_t dataspace = H5Dget_space(dataset); - vector slab_start(ndims, 0); - slab_start[0] = h5_offset; - const int num_rows_available = dims[0] - h5_offset; - const int num_rows = std::min(blob->num() - blob_offset, num_rows_available); - if (num_rows <= 0) { - return 0; - } - vector slab_count(ndims, num_rows); - for (int i = 1; i < ndims; ++i) { - slab_count[i] = dims[i]; - } - status = H5Sselect_hyperslab(dataspace, H5S_SELECT_SET, - slab_start.data(), NULL, slab_count.data(), NULL); - CHECK_GE(status, 0) << "Failed to select slab."; - hid_t memspace = H5Screate_simple(ndims, slab_count.data(), NULL); - const int data_size = blob->count() / blob->num(); - // separate multiplication to avoid a possible overflow - const int blob_offset_size = blob_offset * data_size; - hid_t type = (sizeof(Dtype) == 4) ? 
H5T_NATIVE_FLOAT : H5T_NATIVE_DOUBLE; - status = H5Dread(dataset, type, memspace, dataspace, H5P_DEFAULT, - blob->mutable_cpu_data() + blob_offset_size); - CHECK_GE(status, 0) << "Failed to read dataset " << dataset_name; - H5Dclose(dataset); - H5Sclose(dataspace); - H5Sclose(memspace); - return num_rows; -} - -template -int HDF5ReadRowsToBlob(hid_t file_id, const char* dataset_name, - int h5_offset, int blob_offset, Blob* data); - -template -int HDF5ReadRowsToBlob(hid_t file_id, const char* dataset_name, - int h5_offset, int blob_offset, Blob* data); - -template <> -void hdf5_save_nd_dataset( - const hid_t file_id, const string& dataset_name, const Blob& blob) { - hsize_t dims[HDF5_NUM_DIMS]; - dims[0] = blob.num(); - dims[1] = blob.channels(); - dims[2] = blob.height(); - dims[3] = blob.width(); - herr_t status = H5LTmake_dataset_float( - file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data()); - CHECK_GE(status, 0) << "Failed to make float dataset " << dataset_name; -} - -template <> -void hdf5_save_nd_dataset( - const hid_t file_id, const string& dataset_name, const Blob& blob) { - hsize_t dims[HDF5_NUM_DIMS]; - dims[0] = blob.num(); - dims[1] = blob.channels(); - dims[2] = blob.height(); - dims[3] = blob.width(); - herr_t status = H5LTmake_dataset_double( - file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data()); - CHECK_GE(status, 0) << "Failed to make double dataset " << dataset_name; -} } // namespace caffe