Source code for nested_pandas.utils.utils

import numpy as np
import pandas as pd
import pyarrow as pa

from nested_pandas import NestedFrame


def count_nested(df, nested, by=None, join=True) -> NestedFrame:
    """Count the rows stored in a nested column, per row of the frame.

    Parameters
    ----------
    df: NestedFrame
        The NestedFrame holding the `nested` series to be counted.
    nested: str
        Label of the nested series whose rows are counted.
    by: str, optional
        A column inside `nested` to group the count by. When given, one
        count column is produced for every unique value found in `by`.
    join: bool, optional
        If True, the count column(s) are joined onto `df` and the joined
        frame is returned. If False, only the count column(s) are returned
        as a NestedFrame.

    Returns
    -------
    NestedFrame

    Examples
    --------

    >>> import pandas as pd
    >>> # Show all columns
    >>> pd.set_option("display.width", 200)
    >>> pd.set_option("display.max_columns", None)
    >>> from nested_pandas.datasets.generation import generate_data
    >>> nf = generate_data(5, 10, seed=1)

    >>> from nested_pandas.utils import count_nested
    >>> count_nested(nf, "nested")
              a         b                                             nested  n_nested
    0  0.417022  0.184677  [{t: 8.38389, flux: 10.233443, flux_error: 1.0...        10
    1  0.720324  0.372520  [{t: 13.70439, flux: 41.405599, flux_error: 1....        10
    2  0.000114  0.691121  [{t: 4.089045, flux: 69.440016, flux_error: 1....        10
    3  0.302333  0.793535  [{t: 17.562349, flux: 41.417927, flux_error: 1...        10
    4  0.146756  1.077633  [{t: 0.547752, flux: 4.995346, flux_error: 1.0...        10

    `count_nested` also allows counting by a given subcolumn, for example we
    can count by "band" label:

    >>> # join=False, allows the result to be kept separate from the original nf
    >>> count_nested(nf, "nested", by="band", join=False)
       n_nested_g  n_nested_r
    0           8           2
    1           5           5
    2           5           5
    3           6           4
    4           6           4
    """
    # Both branches store their counts as Arrow-backed 32-bit integers.
    count_dtype = pd.ArrowDtype(pa.int32())

    if by is None:
        # One total per row, labelled after the nested column.
        lengths = pd.Series(df[nested].nest.len(), name=f"n_{nested}", index=df.index)
        counts = lengths.astype(count_dtype)
    else:

        def _tally(values):
            # Map each distinct value in this row's sub-column to its frequency.
            labels, occurrences = np.unique(values, return_counts=True)
            return dict(zip(labels, occurrences, strict=False))

        per_value = df.map_rows(
            _tally,
            columns=f"{nested}.{by}",
            row_container="args",
        )
        per_value = per_value.astype(count_dtype)
        prefixed = per_value.rename(
            columns={label: f"n_{nested}_{label}" for label in per_value.columns}
        )
        # Order the columns by name, then fill missing entries (no count
        # reported for that value in that row) with zero.
        counts = prefixed.reindex(sorted(prefixed.columns), axis=1).fillna(0)

    if join:
        return df.join(counts)

    # Not joining: hand back only the counts, promoting the by=None Series
    # to a single-column NestedFrame.
    if isinstance(counts, pd.Series):
        return NestedFrame(counts.to_frame())
    return counts