# Source code for nested_pandas.datasets.generation

import numpy as np

from nested_pandas import NestedFrame


def generate_data(n_base, n_layer, seed=None) -> NestedFrame:
    """Generate a toy dataset as a NestedFrame.

    The base layer holds two random columns, "a" and "b". Each nested layer
    holds "t", "flux", "flux_error" and "band" columns and is joined onto the
    base frame through a generated integer index.

    Parameters
    ----------
    n_base : int
        The number of rows to generate for the base layer.
    n_layer : int, or dict
        The number of rows per n_base row to generate for a nested layer.
        Alternatively, a dictionary of layer label, layer_size pairs may be
        specified to create multiple nested columns with custom sizing.
        NumPy integer scalars are accepted wherever an int is.
    seed : int
        A seed to use for random generation of data.

    Returns
    -------
    NestedFrame
        The constructed NestedFrame.

    Raises
    ------
    TypeError
        If `n_layer` is neither an integer nor a dict. This is checked
        before any data is generated.

    Examples
    --------
    >>> from nested_pandas.datasets import generate_data
    >>> nf1 = generate_data(10,100)
    >>> nf2 = generate_data(10, {"nested_a": 100, "nested_b": 200})
    """
    # Validate n_layer first so that bad input fails before any work is done.
    if isinstance(n_layer, (int, np.integer)):
        # In case of an integer, create a single nested layer called "nested"
        n_layer = {"nested": int(n_layer)}
    elif not isinstance(n_layer, dict):
        raise TypeError("Input to n_layer is not an int or dict.")

    # use provided seed, "None" acts as if no seed is provided
    randomstate = np.random.RandomState(seed=seed)

    # Generate base data
    base_data = {"a": randomstate.random(n_base), "b": randomstate.random(n_base) * 2}
    base_nf = NestedFrame(data=base_data)

    for key, layer_size in n_layer.items():
        # Total number of nested rows for this layer across all base rows
        n_rows = layer_size * n_base
        # NOTE: the order of the random draws below determines the output for
        # a given seed; keep "t", "flux", "band" in this order.
        layer_data = {
            "t": randomstate.random(n_rows) * 20,
            "flux": randomstate.random(n_rows) * 100,
            "flux_error": np.full(n_rows, 1.0),
            "band": randomstate.choice(["r", "g"], size=n_rows),
            # Cycle through the base row positions so each base row receives
            # layer_size nested rows
            "index": np.arange(n_rows) % n_base,
        }
        layer_nf = NestedFrame(data=layer_data).set_index("index")
        base_nf = base_nf.join_nested(layer_nf, key)
    return base_nf


def generate_parquet_file(n_base, n_layer, path, seed=None):
    """Build a toy dataset and write it out as a parquet file.

    The dataset is produced by `generate_data` with the same `n_base`,
    `n_layer` and `seed` arguments, then written via its `to_parquet` method.

    Parameters
    ----------
    n_base : int
        The number of rows to generate for the base layer.
    n_layer : int, or dict
        The number of rows per n_base row to generate for a nested layer.
        Alternatively, a dictionary of layer label, layer_size pairs may be
        specified to create multiple nested columns with custom sizing.
    path : str
        The path to the parquet file to write.
    seed : int, default=None
        A seed to use for random generation of data.

    Returns
    -------
    None
    """
    # Generate the frame and hand it straight to the parquet writer.
    generate_data(n_base, n_layer, seed).to_parquet(path)