# Source code for plateau.io.testing.utils

import math

import numpy as np
import pandas as pd
from pyarrow.parquet import ParquetFile

from plateau.io.eager import store_dataframes_as_dataset


def create_dataset(dataset_uuid, store_factory, metadata_version):
    """Write a small fixture dataset and return the result of the store call.

    Two deep copies of the same 10-row frame (columns ``P``, ``L`` and
    ``TARGET``) are handed to ``store_dataframes_as_dataset`` with a
    secondary index requested on column ``P``.

    Parameters
    ----------
    dataset_uuid
        Forwarded unchanged as ``dataset_uuid``.
    store_factory
        Forwarded unchanged as ``store``.
    metadata_version
        Forwarded unchanged as ``metadata_version``.

    Returns
    -------
    Whatever ``store_dataframes_as_dataset`` returns.
    """
    template = pd.DataFrame(
        {
            "P": np.arange(0, 10),
            "L": np.arange(0, 10),
            "TARGET": np.arange(10, 20),
        }
    )
    # Independent deep copies so the two inputs never share underlying buffers.
    frames = [template.copy(deep=True) for _ in range(2)]

    return store_dataframes_as_dataset(
        dfs=frames,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=metadata_version,
        secondary_indices="P",
    )
def assert_num_row_groups(store, dataset, part_num_rows, part_chunk_size):
    """Assert that each partition file has the expected number of row groups.

    The expectation is derived from the number of rows and the chunk size
    configured for the index value the partition belongs to.

    Parameters
    ----------
    store
        Object whose ``open(key)`` returns a file-like handle that
        ``ParquetFile`` can read (presumably a key-value store instance --
        confirm against callers).
    dataset
        Dataset object exposing ``indices["p"].index_dct`` and
        ``partitions[label].files["table"]``.
    part_num_rows
        Mapping from index value to the number of rows per partition.
    part_chunk_size
        Mapping from index value to the chunk size used when writing, or
        ``None`` if the partition is expected to hold a single row group.

    Raises
    ------
    AssertionError
        If a partition file's row-group count differs from the expectation.
    """
    # Iterate over the partitions of each value of the "p" index.
    for index, partitions in dataset.indices["p"].index_dct.items():
        for part_key in partitions:
            key = dataset.partitions[part_key].files["table"]

            # Only the footer metadata is needed; release the handle right
            # away instead of leaking one open file per partition.
            handle = store.open(key)
            try:
                num_row_groups = ParquetFile(handle).num_row_groups
            finally:
                handle.close()

            chunk_size = part_chunk_size[index]
            if chunk_size is None:
                # No chunking requested: everything lands in one row group.
                expected = 1
            else:
                expected = math.ceil(part_num_rows[index] / chunk_size)

            assert num_row_groups == expected, (
                f"partition {part_key!r} (index value {index!r}): expected "
                f"{expected} row group(s), found {num_row_groups}"
            )