rebase & clean up HDF5DataLayer Prefetch

Adapt HDF5DataLayer Prefetch to #2836
ronghanghu committed Aug 9, 2015
1 parent 87b27d1 commit 2b7c2e4
Showing 6 changed files with 128 additions and 165 deletions.
36 changes: 29 additions & 7 deletions include/caffe/util/hdf5.hpp
@@ -10,21 +10,43 @@

namespace caffe {

/**
 * @brief Shapes a Blob to hold "num" rows of HDF5 data. If num == -1, use
 *        the number of rows in the HDF5 dataset.
 *
 * @param file_id the HDF5 file handle
 * @param dataset_name the name of the HDF5 dataset to read
 * @param num the number of rows to read: either num >= 0,
 *        or num == -1 for the number of rows in the HDF5 dataset
 * @param blob the Blob to shape
 *
 * The HDF5 dataset may have any number N (N >= 1) of dimensions, as long as
 * N does not exceed the Blob's maximum number of axes (kMaxBlobAxes).
 */
template <typename Dtype>
void hdf5_load_nd_dataset_helper(
hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
void HDF5PrepareBlob(hid_t file_id, const char* dataset_name, int num,
Blob<Dtype>* blob);

/**
 * @brief Reads rows of the HDF5 dataset, starting at dataset row h5_offset,
 * into Blob* blob starting at blob row blob_offset. The blob must have been
 * pre-shaped beforehand (e.g., by HDF5PrepareBlob). Returns the number of
 * rows actually read, which is 0 once the dataset is exhausted.
 */
template <typename Dtype>
void hdf5_load_nd_dataset(
hid_t file_id, const char* dataset_name_, int min_dim, int max_dim,
Blob<Dtype>* blob);
int HDF5ReadRowsToBlob(hid_t file_id, const char* dataset_name, int h5_offset,
int blob_offset, Blob<Dtype>* blob);

template <typename Dtype>
void hdf5_load_nd_dataset_helper(hid_t file_id, const char* dataset_name_,
int min_dim, int max_dim, Blob<Dtype>* blob);

template <typename Dtype>
void hdf5_load_nd_dataset(hid_t file_id, const char* dataset_name_, int min_dim,
int max_dim, Blob<Dtype>* blob);

template <typename Dtype>
void hdf5_save_nd_dataset(
const hid_t file_id, const string& dataset_name, const Blob<Dtype>& blob,
const hid_t file_id, const string& dataset_name, const Blob<Dtype>& blob,
bool write_diff = false);

int hdf5_load_int(hid_t loc_id, const string& dataset_name);
void hdf5_save_int(hid_t loc_id, const string& dataset_name, int i);
string hdf5_load_string(hid_t loc_id, const string& dataset_name);
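For orientation, here is a minimal sketch of how the two new helpers compose in a prefetch loop: HDF5PrepareBlob shapes the blob once per batch, and HDF5ReadRowsToBlob fills it, possibly across HDF5 file boundaries. This is an illustration only, not code from this commit; the dataset name "data" and the batch_size parameter are assumptions.

// Illustrative sketch -- not part of this commit.
#include "caffe/blob.hpp"
#include "caffe/util/hdf5.hpp"

template <typename Dtype>
void FillBatch(hid_t file_id, int batch_size, int h5_row,
    caffe::Blob<Dtype>* blob) {
  // Shape blob to batch_size rows; trailing axes are copied from the dataset.
  caffe::HDF5PrepareBlob(file_id, "data", batch_size, blob);
  int filled = 0;
  while (filled < batch_size) {
    // Returns the number of rows actually read (0 when the dataset ends).
    int n = caffe::HDF5ReadRowsToBlob(file_id, "data", h5_row, filled, blob);
    if (n == 0) break;  // end of this file; a real loop would open the next
    filled += n;
    h5_row += n;
  }
}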
28 changes: 0 additions & 28 deletions include/caffe/util/io.hpp
@@ -136,34 +136,6 @@ cv::Mat DecodeDatumToCVMat(const Datum& datum, bool is_color);

void CVMatToDatum(const cv::Mat& cv_img, Datum* datum);

/**
* @brief Shapes a Blob to read "num" rows of HDF5 data. If num == -1, take
* the num of the HDF5 dataset.
*
* @param file_id the HDF5 file handle
* @param dataset_name the name of the HDF5 dataset to read
* @param num the number of rows to read: either num >= 0,
* or num == -1 for the number of rows in the HDF5 dataset
* @param blob the Blob to shape
*
* The HDF5 dataset could be N(>=1) dimensions as long as N doesn't exceed Blob's maximum dimension.
*/
template <typename Dtype>
void HDF5PrepareBlob(hid_t file_id, const char* dataset_name, int num,
Blob<Dtype>* blob);

/**
* @brief Reads rows [offset, offset + data->num() - 1] into Blob* data, which
* must have been pre-shaped using HDF5PrepareBlob (or otherwise).
*/
template <typename Dtype>
int HDF5ReadRowsToBlob(hid_t file_id, const char* dataset_name,
int h5_offset, int blob_offset, Blob<Dtype>* blob);

template <typename Dtype>
void hdf5_save_nd_dataset(
const hid_t file_id, const string& dataset_name, const Blob<Dtype>& blob);

} // namespace caffe

#endif // CAFFE_UTIL_IO_H_
8 changes: 2 additions & 6 deletions src/caffe/layers/hdf5_data_layer.cpp
@@ -8,8 +8,6 @@
#include <string>
#include <vector>

#include "hdf5.h"
#include "hdf5_hl.h"
#include "stdint.h"

#include "caffe/data_layers.hpp"
@@ -77,7 +75,7 @@ void HDF5DataLayer<Dtype>::FillHDF5FileData() {

template <typename Dtype>
void HDF5DataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
const vector<Blob<Dtype>*>& top) {
// Refuse transformation parameters since HDF5 is totally generic.
CHECK(!this->layer_param_.has_transform_param()) <<
this->type() << " does not transform data.";
@@ -151,14 +149,12 @@ void HDF5DataLayer<Dtype>::InternalThreadEntry() {

template <typename Dtype>
void HDF5DataLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
const vector<Blob<Dtype>*>& top) {
this->JoinPrefetchThread();

for (int i = 0; i < top.size(); ++i) {
const int count = top[i]->count();
caffe_copy(count, hdf_blobs_[i]->cpu_data(), top[i]->mutable_cpu_data());
}

this->CreatePrefetchThread();
}

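The Forward_cpu change above keeps Caffe's usual prefetch discipline: join the background thread so the buffers are complete, copy them to the top blobs, then restart prefetching so the next batch loads while the network computes. A generic sketch of this double-buffering pattern with std::thread follows (stand-in names, not Caffe's actual API):

// Double-buffering sketch -- illustrative stand-in, not Caffe code.
#include <thread>
#include <vector>

class Prefetcher {
 public:
  // Call Start() once before the first Next().
  void Start() { worker_ = std::thread(&Prefetcher::LoadBatch, this); }
  // Consumer side: wait for the in-flight load, copy it out, relaunch.
  std::vector<float> Next() {
    worker_.join();                    // ~ JoinPrefetchThread()
    std::vector<float> out = buffer_;  // ~ caffe_copy to top blobs
    Start();                           // ~ CreatePrefetchThread()
    return out;
  }
 private:
  void LoadBatch() { /* fill buffer_ from disk */ }
  std::vector<float> buffer_;
  std::thread worker_;
};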
2 changes: 1 addition & 1 deletion src/caffe/test/test_hdf5data_layer.cpp
@@ -117,7 +117,7 @@ class HDF5DataLayerTest : public MultiDeviceTest<TypeParam> {
}
}
}
};
}

TYPED_TEST_CASE(HDF5DataLayerTest, TestDtypesAndDevices);

102 changes: 96 additions & 6 deletions src/caffe/util/hdf5.cpp
@@ -1,10 +1,102 @@
#include "caffe/util/hdf5.hpp"

#include <algorithm>
#include <string>
#include <vector>

namespace caffe {

template <typename Dtype>
void HDF5PrepareBlob(hid_t file_id, const char* dataset_name, int num,
Blob<Dtype>* blob) {
// Verify that the dataset exists.
CHECK(H5LTfind_dataset(file_id, dataset_name))
<< "Failed to find HDF5 dataset " << dataset_name;
herr_t status;
int ndims;
CHECK_LE(0, H5LTget_dataset_ndims(file_id, dataset_name, &ndims))
<< "Failed to get dataset ndims for " << dataset_name;
CHECK_GE(ndims, 1) << "HDF5 dataset must have at least 1 dimension.";
CHECK_LE(ndims, kMaxBlobAxes)
<< "HDF5 dataset must have at most "
<< kMaxBlobAxes << " dimensions, to fit in a Blob.";

// Verify that the data format is what we expect: float or double.
std::vector<hsize_t> dims(ndims);
H5T_class_t h5_class;
status = H5LTget_dataset_info(
file_id, dataset_name, dims.data(), &h5_class, NULL);
CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name;
CHECK_EQ(h5_class, H5T_FLOAT) << "Expected float or double data";
CHECK_GE(num, -1) << "num must be -1 (to indicate the number of rows "
"in the dataset) or non-negative.";

vector<int> blob_dims(dims.size());
blob_dims[0] = (num == -1) ? dims[0] : num;
for (int i = 1; i < dims.size(); ++i) {
blob_dims[i] = dims[i];
}
blob->Reshape(blob_dims);
}

template
void HDF5PrepareBlob<float>(hid_t file_id, const char* dataset_name, int num,
Blob<float>* blob);

template
void HDF5PrepareBlob<double>(hid_t file_id, const char* dataset_name, int num,
Blob<double>* blob);

template <typename Dtype>
int HDF5ReadRowsToBlob(hid_t file_id, const char* dataset_name,
int h5_offset, int blob_offset, Blob<Dtype>* blob) {
int ndims;
CHECK_LE(0, H5LTget_dataset_ndims(file_id, dataset_name, &ndims))
<< "Failed to get dataset ndims for " << dataset_name;
std::vector<hsize_t> dims(ndims);
H5T_class_t h5_class;
herr_t status = H5LTget_dataset_info(
file_id, dataset_name, dims.data(), &h5_class, NULL);
CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name;
CHECK_EQ(h5_class, H5T_FLOAT) << "Expected float or double data";
hid_t dataset = H5Dopen2(file_id, dataset_name, H5P_DEFAULT);
hid_t dataspace = H5Dget_space(dataset);
vector<hsize_t> slab_start(ndims, 0);
slab_start[0] = h5_offset;
const int num_rows_available = dims[0] - h5_offset;
const int num_rows = std::min(blob->num() - blob_offset, num_rows_available);
if (num_rows <= 0) {
return 0;
}
vector<hsize_t> slab_count(ndims, num_rows);
for (int i = 1; i < ndims; ++i) {
slab_count[i] = dims[i];
}
status = H5Sselect_hyperslab(dataspace, H5S_SELECT_SET,
slab_start.data(), NULL, slab_count.data(), NULL);
CHECK_GE(status, 0) << "Failed to select slab.";
hid_t memspace = H5Screate_simple(ndims, slab_count.data(), NULL);
const int data_size = blob->count() / blob->num();
// separate multiplication to avoid a possible overflow
const int blob_offset_size = blob_offset * data_size;
hid_t type = (sizeof(Dtype) == 4) ? H5T_NATIVE_FLOAT : H5T_NATIVE_DOUBLE;
status = H5Dread(dataset, type, memspace, dataspace, H5P_DEFAULT,
blob->mutable_cpu_data() + blob_offset_size);
CHECK_GE(status, 0) << "Failed to read dataset " << dataset_name;
H5Dclose(dataset);
H5Sclose(dataspace);
H5Sclose(memspace);
return num_rows;
}

template
int HDF5ReadRowsToBlob<float>(hid_t file_id, const char* dataset_name,
int h5_offset, int blob_offset, Blob<float>* data);

template
int HDF5ReadRowsToBlob<double>(hid_t file_id, const char* dataset_name,
int h5_offset, int blob_offset, Blob<double>* data);
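To make the hyperslab arithmetic above concrete (numbers illustrative, not from this commit): for a dataset of shape 100 x 3 x 32 x 32, a blob pre-shaped to 20 rows, h5_offset == 90, and blob_offset == 0, num_rows = min(20 - 0, 100 - 90) = 10, so slab_start = {90, 0, 0, 0} and slab_count = {10, 3, 32, 32}; the call reads those 10 rows into the front of the blob and returns 10. The caller can then open the next HDF5 file and call again with h5_offset == 0 and blob_offset == 10 to fill the remaining 10 rows.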

// Verifies format of data stored in HDF5 file and reshapes blob accordingly.
template <typename Dtype>
void hdf5_load_nd_dataset_helper(
@@ -59,7 +151,7 @@ void hdf5_save_nd_dataset<float>(
const hid_t file_id, const string& dataset_name, const Blob<float>& blob,
bool write_diff) {
int num_axes = blob.num_axes();
hsize_t *dims = new hsize_t[num_axes];
std::vector<hsize_t> dims(num_axes);
for (int i = 0; i < num_axes; ++i) {
dims[i] = blob.shape(i);
}
@@ -70,17 +162,16 @@ void hdf5_save_nd_dataset<float>(
data = blob.cpu_data();
}
herr_t status = H5LTmake_dataset_float(
file_id, dataset_name.c_str(), num_axes, dims, data);
file_id, dataset_name.c_str(), num_axes, dims.data(), data);
CHECK_GE(status, 0) << "Failed to make float dataset " << dataset_name;
delete[] dims;
}

template <>
void hdf5_save_nd_dataset<double>(
hid_t file_id, const string& dataset_name, const Blob<double>& blob,
bool write_diff) {
int num_axes = blob.num_axes();
hsize_t *dims = new hsize_t[num_axes];
std::vector<hsize_t> dims(num_axes);
for (int i = 0; i < num_axes; ++i) {
dims[i] = blob.shape(i);
}
@@ -91,9 +182,8 @@ void hdf5_save_nd_dataset<double>(
data = blob.cpu_data();
}
herr_t status = H5LTmake_dataset_double(
file_id, dataset_name.c_str(), num_axes, dims, data);
file_id, dataset_name.c_str(), num_axes, dims.data(), data);
CHECK_GE(status, 0) << "Failed to make double dataset " << dataset_name;
delete[] dims;
}

string hdf5_load_string(hid_t loc_id, const string& dataset_name) {
117 changes: 0 additions & 117 deletions src/caffe/util/io.cpp
@@ -228,122 +228,5 @@ void CVMatToDatum(const cv::Mat& cv_img, Datum* datum) {
datum->set_data(buffer);
}

// Verifies format of data stored in HDF5 file and reshapes blob accordingly.
template <typename Dtype>
void HDF5PrepareBlob(hid_t file_id, const char* dataset_name, int num,
Blob<Dtype>* blob) {
// Verify that the dataset exists.
CHECK(H5LTfind_dataset(file_id, dataset_name))
<< "Failed to find HDF5 dataset " << dataset_name;
herr_t status;
int ndims;
CHECK_LE(0, H5LTget_dataset_ndims(file_id, dataset_name, &ndims))
<< "Failed to get dataset ndims for " << dataset_name;
CHECK_GE(ndims, 1) << "HDF5 dataset must have at least 1 dimension.";
CHECK_LE(ndims, kMaxBlobAxes)
<< "HDF5 dataset must have at most "
<< kMaxBlobAxes << " dimensions, to fit in a Blob.";

// Verify that the data format is what we expect: float or double.
std::vector<hsize_t> dims(ndims);
H5T_class_t h5_class;
status = H5LTget_dataset_info(
file_id, dataset_name, dims.data(), &h5_class, NULL);
CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name;
CHECK_EQ(h5_class, H5T_FLOAT) << "Expected float or double data";
CHECK_GE(num, -1) << "num must be -1 (to indicate the number of rows"
"in the dataset) or non-negative.";

vector<int> blob_dims(dims.size());
blob_dims[0] = (num == -1) ? dims[0] : num;
for (int i = 1; i < dims.size(); ++i) {
blob_dims[i] = dims[i];
}
blob->Reshape(blob_dims);
}

template
void HDF5PrepareBlob<float>(hid_t file_id, const char* dataset_name, int num,
Blob<float>* blob);

template
void HDF5PrepareBlob<double>(hid_t file_id, const char* dataset_name, int num,
Blob<double>* blob);

template <typename Dtype>
int HDF5ReadRowsToBlob(hid_t file_id, const char* dataset_name,
int h5_offset, int blob_offset, Blob<Dtype>* blob) {
int ndims;
CHECK_LE(0, H5LTget_dataset_ndims(file_id, dataset_name, &ndims))
<< "Failed to get dataset ndims for " << dataset_name;
std::vector<hsize_t> dims(ndims);
H5T_class_t h5_class;
herr_t status = H5LTget_dataset_info(
file_id, dataset_name, dims.data(), &h5_class, NULL);
CHECK_GE(status, 0) << "Failed to get dataset info for " << dataset_name;
CHECK_EQ(h5_class, H5T_FLOAT) << "Expected float or double data";
hid_t dataset = H5Dopen2(file_id, dataset_name, H5P_DEFAULT);
hid_t dataspace = H5Dget_space(dataset);
vector<hsize_t> slab_start(ndims, 0);
slab_start[0] = h5_offset;
const int num_rows_available = dims[0] - h5_offset;
const int num_rows = std::min(blob->num() - blob_offset, num_rows_available);
if (num_rows <= 0) {
return 0;
}
vector<hsize_t> slab_count(ndims, num_rows);
for (int i = 1; i < ndims; ++i) {
slab_count[i] = dims[i];
}
status = H5Sselect_hyperslab(dataspace, H5S_SELECT_SET,
slab_start.data(), NULL, slab_count.data(), NULL);
CHECK_GE(status, 0) << "Failed to select slab.";
hid_t memspace = H5Screate_simple(ndims, slab_count.data(), NULL);
const int data_size = blob->count() / blob->num();
// separate multiplication to avoid a possible overflow
const int blob_offset_size = blob_offset * data_size;
hid_t type = (sizeof(Dtype) == 4) ? H5T_NATIVE_FLOAT : H5T_NATIVE_DOUBLE;
status = H5Dread(dataset, type, memspace, dataspace, H5P_DEFAULT,
blob->mutable_cpu_data() + blob_offset_size);
CHECK_GE(status, 0) << "Failed to read dataset " << dataset_name;
H5Dclose(dataset);
H5Sclose(dataspace);
H5Sclose(memspace);
return num_rows;
}

template
int HDF5ReadRowsToBlob<float>(hid_t file_id, const char* dataset_name,
int h5_offset, int blob_offset, Blob<float>* data);

template
int HDF5ReadRowsToBlob<double>(hid_t file_id, const char* dataset_name,
int h5_offset, int blob_offset, Blob<double>* data);

template <>
void hdf5_save_nd_dataset<float>(
const hid_t file_id, const string& dataset_name, const Blob<float>& blob) {
hsize_t dims[HDF5_NUM_DIMS];
dims[0] = blob.num();
dims[1] = blob.channels();
dims[2] = blob.height();
dims[3] = blob.width();
herr_t status = H5LTmake_dataset_float(
file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data());
CHECK_GE(status, 0) << "Failed to make float dataset " << dataset_name;
}

template <>
void hdf5_save_nd_dataset<double>(
const hid_t file_id, const string& dataset_name, const Blob<double>& blob) {
hsize_t dims[HDF5_NUM_DIMS];
dims[0] = blob.num();
dims[1] = blob.channels();
dims[2] = blob.height();
dims[3] = blob.width();
herr_t status = H5LTmake_dataset_double(
file_id, dataset_name.c_str(), HDF5_NUM_DIMS, dims, blob.cpu_data());
CHECK_GE(status, 0) << "Failed to make double dataset " << dataset_name;
}

} // namespace caffe
