diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc index 2c2b62f5d12f6..551f62798e3b5 100644 --- a/cpp/src/parquet/reader_test.cc +++ b/cpp/src/parquet/reader_test.cc @@ -120,11 +120,27 @@ std::string concatenated_gzip_members() { return data_file("concatenated_gzip_members.parquet"); } +std::string byte_stream_split() { return data_file("byte_stream_split.zstd.parquet"); } + +template +std::vector ReadColumnValues(ParquetFileReader* file_reader, int row_group, + int column, int64_t expected_values_read) { + auto column_reader = checked_pointer_cast>( + file_reader->RowGroup(row_group)->Column(column)); + std::vector values(expected_values_read); + int64_t values_read; + auto levels_read = column_reader->ReadBatch(expected_values_read, nullptr, nullptr, + values.data(), &values_read); + EXPECT_EQ(expected_values_read, levels_read); + EXPECT_EQ(expected_values_read, values_read); + return values; +} + // TODO: Assert on definition and repetition levels -template +template void AssertColumnValues(std::shared_ptr> col, int64_t batch_size, int64_t expected_levels_read, - std::vector& expected_values, + const std::vector& expected_values, int64_t expected_values_read) { std::vector values(batch_size); int64_t values_read; @@ -1412,7 +1428,6 @@ TEST_P(TestCodec, LargeFileValues) { // column 0 ("a") auto col = checked_pointer_cast(group->Column(0)); - std::vector values(kNumRows); int64_t values_read; auto levels_read = @@ -1474,6 +1489,38 @@ TEST(TestFileReader, TestOverflowInt16PageOrdinal) { } } +#ifdef ARROW_WITH_ZSTD +TEST(TestByteStreamSplit, FloatIntegrationFile) { + auto file_path = byte_stream_split(); + auto file = ParquetFileReader::OpenFile(file_path); + + const int64_t kNumRows = 300; + + ASSERT_EQ(kNumRows, file->metadata()->num_rows()); + ASSERT_EQ(2, file->metadata()->num_columns()); + ASSERT_EQ(1, file->metadata()->num_row_groups()); + + // column 0 ("f32") + { + auto values = + ReadColumnValues(file.get(), /*row_group=*/0, /*column=*/0, kNumRows); + ASSERT_EQ(values[0], 1.7640524f); + ASSERT_EQ(values[1], 0.4001572f); + ASSERT_EQ(values[kNumRows - 2], -0.39944902f); + ASSERT_EQ(values[kNumRows - 1], 0.37005588f); + } + // column 1 ("f64") + { + auto values = + ReadColumnValues(file.get(), /*row_group=*/0, /*column=*/1, kNumRows); + ASSERT_EQ(values[0], -1.3065268517353166); + ASSERT_EQ(values[1], 1.658130679618188); + ASSERT_EQ(values[kNumRows - 2], -0.9301565025243212); + ASSERT_EQ(values[kNumRows - 1], -0.17858909208732915); + } +} +#endif // ARROW_WITH_ZSTD + struct PageIndexReaderParam { std::vector row_group_indices; std::vector column_indices; diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index d69d979223e88..4cb3cff24c965 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit d69d979223e883faef9dc6fe3cf573087243c28a +Subproject commit 4cb3cff24c965fb329cdae763eabce47395a68a0