Source code for nested_pandas.nestedframe.core

# typing.Self and "|" union syntax don't exist in Python 3.9
from __future__ import annotations

from collections import defaultdict
from collections.abc import Callable
from typing import Literal

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from deprecated import deprecated
from pandas._libs import lib
from pandas._typing import Any, AnyAll, Axis, Hashable, IndexLabel, Mapping
from pandas.api.extensions import no_default
from pandas.core.computation.eval import Expr, ensure_scope
from pandas.core.dtypes.common import is_bool_dtype
from pandas.core.dtypes.inference import is_list_like

from nested_pandas.nestedframe.expr import (
    _identify_aliases,
    _NestResolver,
    _SeriesFromNest,
    _subexprs_by_nest,
)
from nested_pandas.series.dtype import NestedDtype
from nested_pandas.series.ext_array import NestedExtensionArray
from nested_pandas.series.nestedseries import NestedSeries
from nested_pandas.series.packer import pack, pack_lists, pack_sorted_df_into_struct

pd.set_option("display.max_rows", 30)
pd.set_option("display.min_rows", 5)


[docs] class NestedFrame(pd.DataFrame): """A Pandas Dataframe extension with support for nested structure. See https://pandas.pydata.org/docs/development/extending.html#subclassing-pandas-data-structures """ # https://pandas.pydata.org/docs/development/extending.html#arithmetic-with-3rd-party-types # The __pandas_priority__ of DataFrame is 4000, so give NestedFrame a # higher priority, so that binary operations involving this class and # Series produce instances of this class, preserving the type and origin. __pandas_priority__ = 4500 # The "_aliases" attribute is usually None or not even present, but when it is present, # it indicates that an evaluation is in progress, and that columns and fields with names # that are not identifier-like have been aliases to cleaned names, and this attribute # contains those aliases, keyed by the cleaned name. _metadata = ["_aliases"]
[docs] def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self._cast_cols_to_nested(struct_list=False)
def _cast_cols_to_nested(self, *, struct_list: bool) -> None: """Cast arrow columns to nested. Parameters ---------- struct_list : bool If `False` cast list-struct columns only. If `True`, also try to cast struct-list columns validating if they have valid nested structure. """ for column, dtype in self.dtypes.items(): if not isinstance(dtype, pd.ArrowDtype): continue pa_type = dtype.pyarrow_dtype if pa.types.is_struct(pa_type) and not struct_list: continue if not NestedExtensionArray.is_input_pa_type_supported(pa_type): continue self[column] = NestedExtensionArray(pa.array(self[column])) @property def _constructor(self) -> Self: # type: ignore[name-defined] # noqa: F821 return NestedFrame @property def _constructor_expanddim(self) -> Self: # type: ignore[name-defined] # noqa: F821 return NestedFrame @property def all_columns(self) -> dict: """returns a dictionary of columns for each base/nested dataframe""" all_columns = {"base": self.columns} for column in self.columns: if isinstance(self.dtypes[column], NestedDtype): nest_cols = self[column].columns all_columns[column] = nest_cols return all_columns @property def nested_columns(self) -> list: """retrieves the base column names for all nested dataframes""" nested_mask = self.dtypes.apply(lambda dtype: isinstance(dtype, NestedDtype)) return self.columns[nested_mask].tolist() @property def base_columns(self) -> list[str]: """Returns the list of base (non-nested) column names""" nested_mask = self.dtypes.apply(lambda dtype: not isinstance(dtype, NestedDtype)) return self.columns[nested_mask].tolist() def _repr_html_(self) -> str | None: """Override html representation""" # Without nested columns (or empty), just do representation as normal if len(self.nested_columns) == 0 or len(self) == 0: # This mimics pandas behavior if pd.get_option("display.max_rows") is None: # If max_rows is None, just show the header return super().to_html(max_rows=None, show_dimensions=True) if self.shape[0] > pd.get_option("display.max_rows"): return super().to_html(max_rows=pd.get_option("display.min_rows"), show_dimensions=True) else: return super().to_html(max_rows=pd.get_option("display.max_rows"), show_dimensions=True) # Nested Column Formatting # Display nested columns as small html dataframes with a single row def repack_row(chunk, header=True): # If the chunk is None or empty, return None (displayed same as Null) if chunk is None or len(chunk) == 0: return None n_rows = len(chunk) if n_rows <= 2: # For 1 or 2 rows, show all rows without a footer chunk = chunk.round(8) max_rows_html = n_rows else: # For 3+ rows, show first row and a "+N rows" footer chunk = chunk.head(1).round(8) chunk.astype({col: object for col in chunk.columns}) # cast to string for info row len_row = pd.DataFrame( { col: [f"<i>+{n_rows - 1} rows</i>"] if i == 0 else ["..."] for i, col in enumerate(chunk.columns) } ) chunk = pd.concat([chunk, len_row], ignore_index=True) max_rows_html = 2 # Estimate width and resize html_res = chunk.to_html( max_rows=max_rows_html, max_cols=5, show_dimensions=False, index=False, header=header, escape=False, ) return html_res # Handle sizing, trim html dataframe if output will be truncated df_shape = self.shape # grab original shape information for later if pd.get_option("display.max_rows") is None: html_df = self.copy() elif df_shape[0] > pd.get_option("display.max_rows"): html_df = self.head(pd.get_option("display.min_rows") + 1) else: html_df = self.copy() # replace index to ensure proper behavior for duplicate index values index_values = html_df.index html_df = html_df.reset_index(drop=True) repr = html_df.style.format({col: repack_row for col in self.nested_columns}) # Create a mapping function to retrieve original index def map_true_index(index): return index_values[index] repr = repr.format_index(map_true_index, axis=0) # Recover some truncation formatting, limited to head truncation if pd.get_option("display.max_rows") is None: # Just display header return repr.to_html(max_rows=0) elif df_shape[0] > pd.get_option("display.max_rows"): # when over the max_rows threshold, display with truncation ("..." row at the end) html_repr = repr.to_html(max_rows=pd.get_option("display.min_rows")) else: # when under the max_rows threshold, display all rows (behavior of 0 here) html_repr = repr.to_html(max_rows=0) # Manually append dimensionality to a styler output html_repr += f"{df_shape[0]} rows x {df_shape[1]} columns" return html_repr def _parse_hierarchical_components(self, delimited_path: str, delimiter: str = ".") -> list[str]: """ Given a string that may be a delimited path, parse it into its components, respecting backticks that are used to protect component names that may contain the delimiter. """ aliases = getattr(self, "_aliases", None) if aliases is None: delimited_path, aliases = _identify_aliases(delimited_path) return [aliases.get(x, x) for x in delimited_path.split(delimiter)] def _is_known_hierarchical_column(self, components: list[str] | str) -> bool: """Determine whether a string is a known hierarchical column name""" if isinstance(components, str): components = self._parse_hierarchical_components(components) if len(components) < 2: return False base_name = components[0] if self._is_nested_column(base_name): nested_name = ".".join(components[1:]) return nested_name in self.dtypes[base_name].column_dtypes return False def _is_nested_column(self, col: str): return col in self.columns and isinstance(self.dtypes[col], NestedDtype) def _is_known_column(self, components: list[str] | str) -> bool: """Determine whether a list of field components describes a known column name""" if isinstance(components, str): components = self._parse_hierarchical_components(components) if ".".join(components) in self.columns: return True return self._is_known_hierarchical_column(components) def __getitem__(self, item): """Adds custom __getitem__ behavior for nested columns""" if isinstance(item, str): return self._getitem_str(item) elif self._is_key_list(item): return self._getitem_list(item) return super().__getitem__(item) def _getitem_str(self, item): if self._is_nested_column(item): return NestedSeries(super().__getitem__(item)) # Preempt the nested check if the item is a base column, with or without # dots and backticks. if item in self.columns: return super().__getitem__(item) components = self._parse_hierarchical_components(item) # One more check on the entirety of the item name, in case backticks were used # (even if they weren't necessary). cleaned_item = ".".join(components) if cleaned_item in self.columns: return super().__getitem__(cleaned_item) # If a nested column name is passed, return a flat series for that column # flat series is chosen over list series for utility # e.g. native ability to do something like ndf["nested.a"] + 3 if self._is_known_hierarchical_column(components): nested = components[0] field = ".".join(components[1:]) return self[nested].nest.to_flat(columns=[field])[field] else: raise KeyError(f"Column '{cleaned_item}' not found in nested columns or base columns") @staticmethod def _is_key_list(item): if not is_list_like(item): return False if is_bool_dtype(item): return False return all(isinstance(k, str) for k in item) def _getitem_list(self, item): unknown_cols = [k for k in item if not self._is_known_column(k)] if unknown_cols: raise KeyError(f"{unknown_cols} not in index") non_nested_keys = [k for k in item if k in self.columns] result = super().__getitem__(non_nested_keys).copy() components = [self._parse_hierarchical_components(k) for k in item] nested_components = [c for c in components if self._is_known_hierarchical_column(c)] nested_columns = defaultdict(list) for comps in nested_components: nested_columns[comps[0]].append(".".join(comps[1:])) for c in nested_columns: result[c] = self[c].nest[nested_columns[c]] return result def __setitem__(self, key, value): """Custom __setitem__ for NestedFrame: auto-nest DataFrame assignment to new columns.""" # If assigning a DataFrame to a new column, auto-nest it # Special handling paths for assignment of dataframes to nested columns if isinstance(key, str) and isinstance(value, pd.DataFrame | NestedFrame): # if all columns are NestedDtype, combine them into a single nested column if np.array([isinstance(dtype, NestedDtype) for dtype in value.dtypes]).all(): for i, col in enumerate(value.columns): if i == 0: new_nested = value[col] else: # there must be a better way than through list columns list_cols = value[col].to_lists() for column in value[col].columns: new_nested = new_nested.nest.set_list_column(column, list_cols[column]) value = new_nested # Assign a DataFrame as a new column, auto-nesting it elif key not in self.columns: # Note this uses the default approach for join_nested, which is a left join on index new_df = self.join_nested(value, name=key) self._update_inplace(new_df) return components = self._parse_hierarchical_components(key) # Replacing or adding columns to a nested structure # Allows statements like ndf["nested.t"] = ndf["nested.t"] - 5 # Or ndf["nested.base_t"] = ndf["nested.t"] - 5 # Performance note: This requires building a new nested structure # TODO: Support assignment of a new column to an existing nested col from a list series if self._is_known_hierarchical_column(components) or ( len(components) > 1 and components[0] in self.nested_columns ): if len(components) != 2: raise ValueError(f"Only one level of nesting is supported; given {key}") nested, field = components # Support a special case of embedding a base column into a nested column, with values being # repeated in each nested list-array. if isinstance(value, pd.Series) and self.index.equals(value.index): new_nested_series = self[nested].nest.set_filled_column(field, value) else: new_nested_series = self[nested].nest.set_flat_column(field, value) return super().__setitem__(nested, new_nested_series) # Adding a new nested structure from a column # Allows statements like ndf["new_nested.t"] = ndf["nested.t"] - 5 if len(components) > 1: new_nested, field = components if isinstance(value, pd.Series): value.name = field value = value.to_frame() new_df = self.join_nested(value, name=new_nested) self._update_inplace(new_df) return None super().__setitem__(key, value) self._cast_cols_to_nested(struct_list=False) def __delitem__(self, key): """Delete a column or a nested field using dot notation (e.g., del nf['nested.x'])""" self.drop([key], axis=1, inplace=True)
[docs] def get_subcolumns(self, nested_columns="all") -> list[str]: """Returns a set of all subcolumn names from a set of nested columns, including dot notation Parameters ---------- nested_columns : 'all' or str or list of str, optional The nested columns to get subcolumns from. Default is 'all', which means all nested columns. Returns ------- list of str A list of subcolumn names in dot notation, e.g. 'nested.a' Examples -------- >>> from nested_pandas.datasets import generate_data >>> nf = generate_data(5,10, seed=1) >>> nf["nested2"] = nf["nested"] # create a second nested column for demonstration >>> nf.get_subcolumns() # doctest: +NORMALIZE_WHITESPACE ['nested.t', 'nested.flux', 'nested.flux_error', 'nested.band', 'nested2.t', 'nested2.flux', 'nested2.flux_error', 'nested2.band'] >>> nf.get_subcolumns("nested") ['nested.t', 'nested.flux', 'nested.flux_error', 'nested.band'] """ # By default, get all subcolumns from all nested columns if nested_columns == "all": nested_columns = self.nested_columns if isinstance(nested_columns, str): nested_columns = [nested_columns] subcols = [] for nested_column in nested_columns: subcols += [f"{nested_column}.{col}" for col in self[nested_column].columns] # I don't believe we need an error if we don't find any, as upstream errors will always trigger # on wrong column names return subcols
@deprecated( version="0.6.0", reason="`add_nested` will be removed in version 0.7.0, use `join_nested` instead.", ) def add_nested( self, obj, name: str, *, how: str = "left", on: None | str | list[str] = None, dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None, ) -> Self: # type: ignore[name-defined] # noqa: F821 """Packs input object to a nested column and adds it to the NestedFrame This method returns a new NestedFrame with the added nested column. Parameters ---------- obj : pd.DataFrame or a sequence of items convertible to nested structures The object to be packed into nested pd.Series and added to the NestedFrame. If a DataFrame is passed, it must have non-unique index values, which are used to pack the DataFrame. If a sequence of elements is passed, it is packed into a nested pd.Series. Sequence elements may be individual pd.DataFrames, dictionaries (keys are nested column names, values are arrays of the same length), or any other object convertible to pa.StructArray. Additionally, None and pd.NA are allowed as elements to represent missing values. name : str The name of the nested column to be added to the NestedFrame. how : {'left', 'right', 'outer', 'inner'}, default: 'left' How to handle the operation of the two objects: - left: use calling frame's index. - right: use the calling frame's index and order but drop values not in the other frame's index. - outer: form union of calling frame's index with other frame's index, and sort it lexicographically. - inner: form intersection of calling frame's index with other frame's index, preserving the order of the calling index. on : str or list of str, default: None Column(s) in the calling frame to join on instead of the index. The original index is always preserved. The column(s) are used only as join keys and are dropped from the nested structure. dtype : dtype or None NestedDtype to use for the nested column; pd.ArrowDtype or pa.DataType can also be used to specify the nested dtype. If None, the dtype is inferred from the input object. Returns ------- NestedFrame A new NestedFrame with the added nested column. Examples -------- >>> import nested_pandas as npd >>> nf = npd.NestedFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, ... index=[0,1,2]) >>> nf2 = npd.NestedFrame({"c":[1,2,3,4,5,6,7,8,9]}, ... index=[0,0,0,1,1,1,2,2,2]) >>> # By default, aligns on the index >>> nf.add_nested(nf2, "nested") a b nested 0 1 4 [{c: 1}; …] (3 rows) 1 2 5 [{c: 4}; …] (3 rows) 2 3 6 [{c: 7}; …] (3 rows) >>> # We can also align on columns. The index is preserved. >>> nf = npd.NestedFrame({"a": [1,2,2,3], "b": [4,4,5,6]}).set_index(["a", "b"]) >>> nf2 = npd.NestedFrame({"a": [1,2,2,2], "b": [4,4,4,5], "c": [1,2,3,4]}) >>> nf.join_nested(nf2, "nested", on=["a", "b"]) # doctest: +NORMALIZE_WHITESPACE nested a b 1 4 [{c: 1}] 2 4 [{c: 2}; …] (2 rows) 5 [{c: 4}] 3 6 None """ return self.join_nested(obj, name, how=how, on=on, dtype=dtype)
[docs] def join_nested( self, obj, name: str, *, how: str = "left", on: None | str | list[str] = None, dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None, ) -> Self: # type: ignore[name-defined] # noqa: F821 """Packs input object to a nested column and adds it to the NestedFrame This method returns a new NestedFrame with the added nested column. Parameters ---------- obj : pd.DataFrame or a sequence of items convertible to nested structures The object to be packed into nested pd.Series and added to the NestedFrame. If a DataFrame is passed, it must have non-unique index values, which are used to pack the DataFrame. If a sequence of elements is passed, it is packed into a nested pd.Series. Sequence elements may be individual pd.DataFrames, dictionaries (keys are nested column names, values are arrays of the same length), or any other object convertible to pa.StructArray. Additionally, None and pd.NA are allowed as elements to represent missing values. name : str The name of the nested column to be joined to the NestedFrame. how : {'left', 'right', 'outer', 'inner'}, default: 'left' How to handle the operation of the two objects: - left: use calling frame's index. - right: use the calling frame's index and order but drop values not in the other frame's index. - outer: form union of calling frame's index with other frame's index, and sort it lexicographically. - inner: form intersection of calling frame's index with other frame's index, preserving the order of the calling index. on : str or list of str, default: None Column(s) in the calling frame to join on instead of the index. The original index is always preserved. The column(s) are used only as join keys and are dropped from the nested structure. dtype : dtype or None NestedDtype to use for the nested column; pd.ArrowDtype or pa.DataType can also be used to specify the nested dtype. If None, the dtype is inferred from the input object. Returns ------- NestedFrame A new NestedFrame with the joined nested column. Examples -------- >>> import nested_pandas as npd >>> nf = npd.NestedFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, ... index=[0,1,2]) >>> nf2 = npd.NestedFrame({"c":[1,2,3,4,5,6,7,8,9]}, ... index=[0,0,0,1,1,1,2,2,2]) >>> # By default, aligns on the index >>> nf.join_nested(nf2, "nested") a b nested 0 1 4 [{c: 1}; …] (3 rows) 1 2 5 [{c: 4}; …] (3 rows) 2 3 6 [{c: 7}; …] (3 rows) >>> # We can also align on columns. The index is preserved. >>> nf = npd.NestedFrame({"a": [1,2,2,3], "b": [4,4,5,6]}).set_index(["a", "b"]) >>> nf2 = npd.NestedFrame({"a": [1,2,2,2], "b": [4,4,4,5], "c": [1,2,3,4]}) >>> nf.join_nested(nf2, "nested", on=["a", "b"]) # doctest: +NORMALIZE_WHITESPACE nested a b 1 4 [{c: 1}] 2 4 [{c: 2}; …] (2 rows) 5 [{c: 4}] 3 6 None """ # Add sources to objects packed = pack(obj, name=name, on=on, dtype=dtype) new_df = self.copy() res = new_df.join(packed, how=how, on=on) # In some cases join returns a DataFrame, so convert back to NestedFrame # For example, with empty dataframes if not isinstance(res, NestedFrame): res = NestedFrame(res) return res
[docs] def nest_lists(self, columns: list[str], name: str) -> NestedFrame: """Creates a new NestedFrame where the specified list-value columns are packed into a nested column. Parameters ---------- columns : list[str] The list-value columns that should be packed into a nested column. All columns in the list will attempt to be packed into a single nested column with the name provided in `nested_name`. name : str The column name of the new nested column which we will pack the list-value columns into. This column will be added to the NestedFrame. Returns ------- NestedFrame A new NestedFrame with the added nested columns Examples -------- >>> import nested_pandas as npd >>> nf = npd.NestedFrame({"c":[1,2,3], "d":[2,4,6], ... "e":[[1,2,3], [4,5,6], [7,8,9]]}, ... index=[0,1,2]) >>> nf.nest_lists(columns=["e"], name="nested") c d nested 0 1 2 [{e: 1}; …] (3 rows) 1 2 4 [{e: 4}; …] (3 rows) 2 3 6 [{e: 7}; …] (3 rows) """ return NestedFrame.from_lists(self.copy(), list_columns=columns, name=name)
[docs] @classmethod def from_flat(cls, df, base_columns, nested_columns=None, on: str | None = None, name="nested"): """Creates a NestedFrame with base and nested columns from a flat dataframe. Parameters ---------- df: pd.DataFrame or NestedFrame A flat dataframe. base_columns: list-like The columns that should be used as base (flat) columns in the output dataframe. nested_columns: list-like, or None The columns that should be packed into a nested column. All columns in the list will attempt to be packed into a single nested column with the name provided in `nested_name`. If None, is defined as all columns not in `base_columns`. on: str or None The name of a column to use as the new index. Typically, the index should have a unique value per row for base columns, and should repeat for nested columns. For example, a dataframe with two columns; a=[1,1,1,2,2,2] and b=[5,10,15,20,25,30] would want an index like [0,0,0,1,1,1] if a is chosen as a base column. If not provided the current index will be used. name: The name of the output column the `nested_columns` are packed into. Returns ------- NestedFrame A NestedFrame with the specified nesting structure. Examples -------- >>> import nested_pandas as npd >>> nf = npd.NestedFrame({"a":[1,1,1,2,2], "b":[2,2,2,4,4], ... "c":[1,2,3,4,5], "d":[2,4,6,8,10]}, ... index=[0,0,0,1,1]) >>> npd.NestedFrame.from_flat(nf, base_columns=["a","b"]) a b nested 0 1 2 [{c: 1, d: 2}; …] (3 rows) 1 2 4 [{c: 4, d: 8}; …] (2 rows) """ # Resolve new index if on is not None: # if a base column is chosen remove it if on in base_columns: base_columns = [col for col in base_columns if col != on] df = df.set_index(on) # drop duplicates on index out_df = df[base_columns][~df.index.duplicated(keep="first")] # Convert df to NestedFrame if needed if not isinstance(out_df, NestedFrame): out_df = NestedFrame(out_df) # add nested if nested_columns is None: nested_columns = [col for col in df.columns if col not in base_columns] return out_df.join_nested(df[nested_columns], name=name)
[docs] @classmethod def from_lists(cls, df, base_columns=None, list_columns=None, name="nested"): """Creates a NestedFrame with base and nested columns from a flat dataframe. Parameters ---------- df: pd.DataFrame or NestedFrame A dataframe with list columns. base_columns: list-like, or None Any columns that have non-list values in the input df. These will simply be kept as identical columns in the result list_columns: list-like, or None The list-value columns that should be packed into a nested column. All columns in the list will attempt to be packed into a single nested column with the name provided in `nested_name`. If None, is defined as all columns not in `base_columns`. name: The name of the output column the `nested_columns` are packed into. Returns ------- NestedFrame A NestedFrame with the specified nesting structure. Examples -------- >>> import nested_pandas as npd >>> nf = npd.NestedFrame({"c":[1,2,3], "d":[2,4,6], ... "e":[[1,2,3], [4,5,6], [7,8,9]]}, ... index=[0,1,2]) >>> npd.NestedFrame.from_lists(nf, base_columns=["c","d"]) c d nested 0 1 2 [{e: 1}; …] (3 rows) 1 2 4 [{e: 4}; …] (3 rows) 2 3 6 [{e: 7}; …] (3 rows) """ # Resolve base and list columns if base_columns is None: if list_columns is None: # with no inputs, assume all columns are list-valued list_columns = df.columns else: # if list_columns are defined, assume everything else is base base_columns = [col for col in df.columns if col not in list_columns] else: if list_columns is None: # with defined base_columns, assume everything else is list list_columns = [col for col in df.columns if col not in base_columns] if len(list_columns) == 0: raise ValueError("No columns were assigned as list columns.") # Pack list columns into a nested column if len(df) == 0: # if the dataframe is empty, just return an empty nested column # since there are no iterable values to pack packed_df = NestedFrame().join_nested(df[list_columns], name=name) packed_df.index.name = df.index.name else: # Check that each column has iterable elements for col in list_columns: # Check if the column is iterable based on its first value. # This is a simple heuristic but infers more than its dtype # which will probably be an object. sample_val = df[col].iloc[0] if not hasattr(sample_val, "__iter__") and not isinstance(sample_val, str | bytes): raise ValueError( f"Cannot pack column {col} which does not contain an iterable list based " "on its first value, {sample_val}." ) packed_df = pack_lists(df[list_columns]) packed_df.name = name # concat the nested column to the base_column df if base_columns is not None: return pd.concat([df[base_columns], packed_df], axis=1) # or just return the packed_df as a nestedframe if no base cols else: return NestedFrame(packed_df.to_frame())
[docs] def drop( self, labels=None, *, axis=0, index=None, columns=None, level=None, inplace=False, errors="raise", ): """Drop specified labels from rows or columns. Remove rows or columns by specifying label names and corresponding axis, or by directly specifying index or column names. When using a multi-index, labels on different levels can be removed by specifying the level. See the `user guide <https://pandas.pydata.org/docs/user_guide /advanced.html#advanced-shown-levels>`_ for more information about the now unused levels. Parameters ---------- labels: single label or list-like Index or column labels to drop. A tuple will be used as a single label and not treated as a list-like. Nested sub-columns are accessed using dot notation (e.g. "nested.col1"). axis: {0 or ‘index’, 1 or ‘columns’}, default 0 Whether to drop labels from the index (0 or ‘index’) or columns (1 or ‘columns’). index: single label or list-like Alternative to specifying axis (labels, axis=0 is equivalent to index=labels). columns: single label or list-like Alternative to specifying axis (labels, axis=1 is equivalent to columns=labels). level: int or level name, optional For MultiIndex, level from which the labels will be removed. inplace: bool, default False If False, return a copy. Otherwise, do operation in place and return None. errors: {‘ignore’, ‘raise’}, default ‘raise’ If ‘ignore’, suppress error and only existing labels are dropped. Returns ------- DataFrame or None Returns DataFrame or None DataFrame with the specified index or column labels removed or None if inplace=True. Examples -------- >>> from nested_pandas.datasets.generation import generate_data >>> nf = generate_data(5,5, seed=1) >>> # drop the "t" column from "nested" >>> nf = nf.drop(["nested.t"], axis=1) >>> nf a b nested 0 0.417022 0.184677 [{flux: 31.551563, flux_error: 1.0, band: 'r'}... 1 0.720324 0.372520 [{flux: 68.650093, flux_error: 1.0, band: 'g'}... 2 0.000114 0.691121 [{flux: 83.462567, flux_error: 1.0, band: 'g'}... 3 0.302333 0.793535 [{flux: 1.828828, flux_error: 1.0, band: 'g'};... 4 0.146756 1.077633 [{flux: 75.014431, flux_error: 1.0, band: 'g'}... """ # axis 1 requires special handling for nested columns if axis == 1 or columns is not None: # label convergence if isinstance(labels, str): labels = [labels] elif columns is not None: labels = [columns] if isinstance(columns, str) else columns columns = None axis = 1 nested_labels = [label for label in labels if self._is_known_hierarchical_column(label)] base_labels = [label for label in labels if not self._is_known_hierarchical_column(label)] # split nested_labels by nested column if len(nested_labels) > 0: nested_cols = set([label.split(".")[0] for label in nested_labels]) # drop targeted sub-columns for each nested column for col in nested_cols: sub_cols = [label.split(".")[1] for label in nested_labels if label.split(".")[0] == col] if inplace: self[col] = self[col].nest.drop(sub_cols) else: self = self.assign(**{f"{col}": self[col].nest.drop(sub_cols)}) # drop remaining base columns if len(base_labels) > 0: return super().drop( labels=base_labels, axis=axis, index=index, columns=columns, level=level, inplace=inplace, errors=errors, ) else: return self if not inplace else None # Otherwise just drop like pandas return super().drop( labels=labels, axis=axis, index=index, columns=columns, level=level, inplace=inplace, errors=errors, )
def split( self, nested_col: str, by: str, values=None, drop_by_col: bool = False, drop_nested: bool = False, ) -> NestedFrame: """Split a nested column into multiple nested columns by a categorical sub-column. Parameters ---------- nested_col : str The name of the nested column to split. by : str The name of the sub-column within nested_col to split on. values : list or str or None, optional The specific values to split on. If None, all unique values are used. If a string is provided, it is iterated as a list of characters. drop_by_col : bool, default False If True, the sub-column specified by `by` is dropped from each new nested column. drop_nested : bool, default False If True, the original nested column is dropped from the result. Returns ------- NestedFrame A new NestedFrame with one new nested column per unique value in `by`, named ``{nested_col}_{value}``. Examples -------- >>> from nested_pandas.datasets.generation import generate_data >>> nf = generate_data(5, 5, seed=1) >>> nf.split("nested", by="band")[["a", "b", "nested_r"]] # doctest: +SKIP a b nested_r 0 0.417022 0.184677 [{t: 8.38389, flux: 31.551563, band: 'r'}; …] (2 rows) 1 0.720324 0.372520 [{t: 19.365232, flux: 90.85955, band: 'r'}; …] (2 rows) 2 0.000114 0.691121 [{t: 11.173797, flux: 28.044399, band: 'r'}; …] (3 rows) 3 0.302333 0.793535 [{t: 2.807739, flux: 78.927933, band: 'r'}] 4 0.146756 1.077633 [{t: 17.527783, flux: 13.002857, band: 'r'}; …] (2 rows) """ if nested_col not in self.nested_columns: raise ValueError( f"'{nested_col}' is not a nested column. Available nested columns: {self.nested_columns}" ) if by not in self[nested_col].nest.columns: raise ValueError( f"'{by}' is not a sub-column of '{nested_col}'. " f"Available sub-columns: {list(self[nested_col].nest.columns)}" ) has_values = values is not None split_values = self[f"{nested_col}.{by}"].unique() if not has_values else list(values) if len(self) == 0: result = self.copy() if has_values: for val in split_values: result[f"{nested_col}_{val}"] = None if drop_nested: result = result.drop(labels=[nested_col], axis=1) return result is_str = pd.api.types.is_string_dtype(self[f"{nested_col}.{by}"]) result = self.copy() for val in split_values: val_repr = f"'{val}'" if is_str else val queried = self.query(f"{nested_col}.{by}=={val_repr}") if queried is None or len(queried) == 0: if has_values: result[f"{nested_col}_{val}"] = None continue filtered = queried[nested_col] if drop_by_col: filtered = filtered.nest.drop(by) result[f"{nested_col}_{val}"] = filtered if drop_nested: result = result.drop(labels=[nested_col], axis=1) return result
[docs] def min(self, exclude_nest: bool = False, numeric_only: bool = False, **kwargs): """ Return the minimum value of each column as a series, including nested columns with prefix to indicate the source column. This computes the column-wise minimum (axis=0) across base and nested columns. Row-wise minimum (axis=1) are not supported, as reductions along columns are the primary intended behavior for NestedFrame. By default, missing values (NaNs) will be skipped in the computation. For non-numeric columns (e.g., strings), the method returns the lexicographically smallest value when `numeric_only=False` (default). Parameters ---------- exclude_nest : bool, default False If set to True, will exclude the nested structure and only computes the minimum over the base columns numeric_only : bool, default False Include only float, int, boolean columns. **kwargs See the documentation for :meth:`pandas.DataFrame.min` for complete details on the keyword arguments accepted by :meth:`min`. Returns ------- pandas.Series Examples -------- >>> from nested_pandas.datasets.generation import generate_data >>> nf = generate_data(5,5, seed=1) >>> nf_min = nf.min() >>> nf_min a 0.000114 b 0.184677 nested.t 0.547752 nested.flux 1.828828 nested.flux_error 1.0 nested.band g dtype: object See Also -------- :meth:`pandas.DataFrame.min` """ if not self.nested_columns: return super().min(numeric_only=numeric_only, **kwargs) # handle base columns base_col = [col for col in self.columns if col not in self.nested_columns] base_min = super().__getitem__(base_col).min(numeric_only=numeric_only, **kwargs) if exclude_nest: return base_min # handle nested columns nested_mins = [] for nest_col in self.nested_columns: nested_df = self[nest_col].explode() nested_df.columns = [f"{nest_col}.{col}" for col in nested_df.columns] nested_mins.append(nested_df.min(numeric_only=numeric_only, **kwargs)) # Combine base and nested min values into a single Series if applicable and return if base_min.empty: return pd.concat(nested_mins) else: return pd.concat([base_min] + nested_mins)
[docs] def max(self, exclude_nest: bool = False, numeric_only: bool = False, **kwargs): """ Return the maximum value of each column as a series, including nested columns with prefix to indicate the source column. This computes the column-wise maximum (axis=0) across base and nested columns. Row-wise maximum (axis=1) are not supported, as reductions along columns are the primary intended behavior for NestedFrame. By default, missing values (NaNs) will be skipped in the computation. For non-numeric columns (e.g., strings), the method returns the lexicographically largest value when `numeric_only=False` (default). Parameters ---------- exclude_nest : bool, default False If set to True, will exclude the nested structure and only computes the maximum over the base columns numeric_only : bool, default False Include only float, int, boolean columns. **kwargs See the documentation for :meth:`pandas.DataFrame.max` for complete details on the keyword arguments accepted by :meth:`max`. Returns ------- pandas.Series Examples -------- >>> from nested_pandas.datasets.generation import generate_data >>> nf = generate_data(5,5, seed=1) >>> nf_max = nf.max() >>> nf_max a 0.720324 b 1.077633 nested.t 19.365232 nested.flux 98.886109 nested.flux_error 1.0 nested.band r dtype: object See Also -------- :meth:`pandas.DataFrame.max` """ if not self.nested_columns: return super().max(numeric_only=numeric_only, **kwargs) # handle base columns base_col = [col for col in self.columns if col not in self.nested_columns] base_max = super().__getitem__(base_col).max(numeric_only=numeric_only, **kwargs) if exclude_nest: return base_max # handle nested columns nested_maxs = [] for nest_col in self.nested_columns: nested_df = self[nest_col].explode() nested_df.columns = [f"{nest_col}.{col}" for col in nested_df.columns] nested_maxs.append(nested_df.max(numeric_only=numeric_only, **kwargs)) # Combine base and nested max values into a single Series if applicable and return if base_max.empty: return pd.concat(nested_maxs) else: return pd.concat([base_max] + nested_maxs)
[docs] def describe(self, exclude_nest: bool = False, percentiles=None, include=None, exclude=None): """ Generate descriptive statistics, including nested columns with prefix to indicate the source. Descriptive statistics include those that summarize the central tendency, dispersion and shape of a dataset's distribution, excluding NaN values, similar to the behavior of `pandas.DataFrame.describe()`. Nested columns use `pyarrow` data types for efficiency, which are not always directly compatible with pandas' type-based filtering. - pyarrow strings are not viewed as object type. - numerical types from pyarrow (i.e., int, double) are still matched by pandas' `np.number`, so filtering with `include=[np.number]` will include numeric nested columns. Parameters ---------- exclude_nest : bool, default False If set to True, will exclude the nested structure and only computes the statistics over the base columns percentiles : list-like of numbers, optional The percentiles to include in the output. All should fall between 0 and 1. Defaults to [.25, .5, .75]. include : 'all', list-like of dtypes or None (default), optional A white list of data types to include in the output. exclude : list-like of dtypes or None (default), optional A black list of data types to exclude from the output. Returns ------- NestedFrame A NestedFrame with the summary statistics. Raises ------ ValueError If no statistics can be generated from the columns. A combined error message will be given. Examples -------- >>> from nested_pandas.datasets.generation import generate_data >>> nf = generate_data(5,5, seed=1) >>> nf_desc = nf.describe() >>> nf_desc a b nested.t nested.flux nested.flux_error count 5.000000 5.000000 25.0 25.0 25.0 mean 0.317310 0.623897 10.095623 45.252724 1.0 std 0.274904 0.351880 6.434858 30.152261 0.0 min 0.000114 0.184677 0.547752 1.828828 1.0 25% 0.146756 0.372520 3.96203 21.162812 1.0 50% 0.302333 0.691121 10.663306 44.789353 1.0 75% 0.417022 0.793535 16.014891 69.975836 1.0 max 0.720324 1.077633 19.365232 98.886109 1.0 -See Also -------- -:meth:`pandas.DataFrame.describe` """ result = [] errors = [] check = ["_base"] # a list of all possible columns to call describe() if not exclude_nest: check.extend(self.nested_columns) if not self.nested_columns: return NestedFrame(super().describe(percentiles=percentiles, include=include, exclude=exclude)) for checkable in check: # check the base columns if checkable == "_base": try: base_col = [col for col in self.columns if col not in self.nested_columns] base_desc = ( super() .__getitem__(base_col) .describe( percentiles=percentiles, include=include, exclude=exclude, ) ) except ValueError as err: # continue if value error caused by no matching type or empty base columns errors.append(f"Base columns: {err}") continue result.append(base_desc) # check the nested columns else: nested_df = self[checkable].explode() nested_df.columns = [f"{checkable}.{col}" for col in nested_df.columns] try: nested_desc = nested_df.describe( percentiles=percentiles, include=include, exclude=exclude, ) except ValueError as err: # continue if value error caused by no matching type for nested columns errors.append(f"Nested column '{checkable}': {err}") continue result.append(nested_desc) if not result: raise ValueError(f"All columns in {check} failed.\n" + "\n".join(errors)) if include is None and exclude is None: # try only get the numeric columns and drop the others numeric_dtypes = [r.select_dtypes(include=[np.number]) for r in result] non_empty_numeric_dtypes = [r for r in numeric_dtypes if not r.empty] if non_empty_numeric_dtypes: result = non_empty_numeric_dtypes return NestedFrame(pd.concat(result, axis=1))
[docs] def explode(self, column: IndexLabel, ignore_index: bool = False): """ Transform each element of a list-like base column to a row, replicating index values. Parameters ---------- column : IndexLabel Column(s) to explode. For multiple columns, specify a non-empty list with each element be str or tuple, and all specified columns their list-like data on same row of the frame must have matching length. ignore_index : bool, default False If True, the resulting index will be labeled 0, 1, ..., n - 1. Returns ------- NestedFrame Exploded lists and to rows of the subset columns; index will be duplicated for these rows. Raises ------ ValueError It raises if: 1) columns of the frame are not unique, 2) specified columns to explode is an empty list, 3) specified columns to explode do not have matching counts of elements rowwise in the frame. See Also -------- :meth:`pandas.DataFrame.explode` Examples -------- >>> from nested_pandas.datasets.generation import generate_data >>> nf = generate_data(3,3, seed=1) >>> nf_explode = nf.explode(column="nested") >>> nf_explode a b t flux flux_error band 0 0.417022 0.604665 3.725204 67.046751 1.0 g 0 0.417022 0.604665 10.776335 14.038694 1.0 g 0 0.417022 0.604665 4.089045 96.826158 1.0 g 1 0.720324 0.293512 6.911215 41.73048 1.0 r 1 0.720324 0.293512 8.38389 19.810149 1.0 r 1 0.720324 0.293512 17.562349 31.342418 1.0 g 2 0.000114 0.184677 7.935349 55.868983 1.0 r 2 0.000114 0.184677 13.70439 80.074457 1.0 r 2 0.000114 0.184677 0.547752 69.232262 1.0 g """ if isinstance(column, str): columns = [column] elif isinstance(column, list): columns = column if len(columns) == 0: raise ValueError("`column` must not be empty") if len(set(columns)) != len(columns): raise ValueError("`column` must have unique elements") else: raise ValueError("`column` must be str or list") if len(extra_cols := set(columns) - set(self.columns)) > 0: if len(extra_cols) == 1: raise ValueError( f"column {extra_cols.pop()} not found, available columns: {list(self.columns)}" ) raise ValueError( f"columns {sorted(extra_cols)} not found, available columns: {list(self.columns)}" ) nested_columns = [col for col in columns if col in self.nested_columns] base_columns = [col for col in columns if col not in nested_columns] # Shortcut for the base-column-only case if len(nested_columns) == 0: return NestedFrame(super().explode(columns, ignore_index=ignore_index)) # Handle duplicated index use-case: use "ordinal" index, but keep the original one as a column to # restore it later. default_index_name = "__index_" index_col_name = self.index.name or default_index_name w_ordinal_idx = self.reset_index(drop=False, names=index_col_name) # Call pandas.DataFrame.explode for non-nested columns all_but_requested_nested_columns = [col for col in w_ordinal_idx.columns if col not in nested_columns] base_exploded = w_ordinal_idx[all_but_requested_nested_columns] if len(all_but_requested_nested_columns) > 0 and len(base_columns) > 0: base_exploded = super(NestedFrame, base_exploded).explode(base_columns, ignore_index=False) base_exploded = NestedFrame(base_exploded) # Check if it was actually exploded, or no list-columns were there. # This could fail in the case when all lists had one element only, we ignore that edge-case here. is_base_exploded = not w_ordinal_idx.index.equals(base_exploded.index) # Unnest each requested nested column and store as a "flat" dataframe. flat_frames: list[Self] = [] # type: ignore[name-defined] # noqa: F821 for nested_col in nested_columns: # Check if counts (lengths) in nested columns mismatch if len(flat_frames) > 0 and np.any( w_ordinal_idx[nested_col].nest.len() != w_ordinal_idx[nested_columns[0]].nest.len() ): raise ValueError( f"One or few rows of {nested_col} have different element counts from {nested_columns[0]}" ) flat = w_ordinal_idx[nested_col].explode() # Check if counts (lengths) of this nested column mismatch with one of the list columns. if is_base_exploded and not base_exploded.index.equals(flat.index): raise ValueError( f"One or few rows of {nested_col} have different element counts " f"from one or few of these columns: {base_columns}" ) flat_frames.append(flat) if is_base_exploded: result = pd.concat([base_exploded] + flat_frames, axis=1) else: # Join works here, because we used the ordinal index before exploding result = base_exploded.join(pd.concat(flat_frames, axis=1)) if ignore_index: return result.drop(index_col_name, axis=1).reset_index(drop=True) # Restore original index result = result.set_index(index_col_name, drop=True) if result.index.name == default_index_name: result.index.name = None return result
[docs] def fillna( self, value: Hashable | Mapping | pd.Series | pd.DataFrame | None = None, *, axis: Axis | None = None, inplace: bool = False, limit: int | None = None, ) -> NestedFrame | None: """ Fill NA/NaN values using the specified method for base and nested columns. Parameters ---------- value : scalar, dict, Series, or DataFrame Value to use to fill holes (e.g. 0), alternately a dict/Series/DataFrame of values specifying which value to use for each column. Values not in the dict/Series/DataFrame will not be filled. This value cannot be a list. axis : {axes_single_arg}, default None Axis along which to fill missing values. inplace : bool, default False If True, fill in-place. Note: this will modify any other views on this object (e.g., a no-copy slice for a column in a NestedFrame). limit : int, default None The maximum number of entries along the entire axis where NaNs will be filled. Must be greater than 0 if not None. Currently, limit on nested columns is not supported, meaning that all Nans will be filled (if there is a value specified) regardless of the input. Returns ------- NestedFrame or None NestedFrame with missing values filled or None if ``inplace=True``. See Also -------- :meth:`pandas.DataFrame.fillna` Examples -------- >>> import nested_pandas as npd >>> nf = npd.NestedFrame( ... data={"a": [np.nan, 20, np.nan], "b": [np.nan, np.nan, 30], "c": [10, np.nan, np.nan]}, ... index=[0, 1, 2] ... ) >>> nested = pd.DataFrame( ... data={"d": [np.nan, np.nan, np.nan], "e": [np.nan, 1, np.nan]}, ... index=[0, 1, 2] ... ) >>> nf = nf.join_nested(nested, "nested") >>> nf.fillna(0) a b c nested 0 0.0 0.0 10.0 [{d: 0.0, e: 0.0}] 1 20.0 0.0 0.0 [{d: 0.0, e: 1.0}] 2 0.0 30.0 0.0 [{d: 0.0, e: 0.0}] """ if not self.nested_columns: return super().fillna(value=value, axis=axis, inplace=inplace, limit=limit) base_cols = [col for col in self.columns if col not in self.nested_columns] filled_df = super().__getitem__(base_cols).fillna(value=value, axis=axis, inplace=False, limit=limit) for nest_col in self.nested_columns: nested_df = self[nest_col].explode() nested_value: Any if isinstance(value, Mapping): nested_value = {} for k, v in value.items(): if k.startswith(f"{nest_col}."): subcol = k.split(".", 1)[1] # strip prefix nested_value[subcol] = v else: nested_value = value nested_df = nested_df.fillna(value=nested_value, axis=axis, inplace=False, limit=None) filled_df = filled_df.join_nested(nested_df, nest_col) if inplace: self._update_inplace(filled_df) return None return filled_df
[docs] def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: """Evaluate a string describing operations on NestedFrame columns. Operates on columns only, not specific rows or elements. This allows `eval` to run arbitrary code, which can make you vulnerable to code injection if you pass user input to this function. Works the same way as `pd.DataFrame.eval`, except that this method will also automatically unpack nested columns into NestedSeries, and the resulting expression will have the dimensions of the unpacked series. Parameters ---------- expr : str The expression string to evaluate. inplace : bool, default False If the expression contains an assignment, whether to perform the operation inplace and mutate the existing NestedFrame. Otherwise, a new NestedFrame is returned. **kwargs See the documentation for :meth:`pandas.DataFrame.eval` for complete details on the keyword arguments accepted by :meth:`eval`. Returns ------- ndarray, scalar, pandas object, nested-pandas object, or None The result of the evaluation or None if ``inplace=True``. See Also -------- :meth:`pandas.DataFrame.eval` """ _, aliases = _identify_aliases(expr) self._aliases: dict[str, str] | None = aliases kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + (_NestResolver(self),) kwargs["inplace"] = inplace kwargs["parser"] = "nested-pandas" answer = super().eval(expr, **kwargs) # If the result is a _SeriesFromNest, set the metadata manually # This is a bit of a hack, as it's a backstop for super().eval() # not propagating the metadata correctly, `for some reason`. # Furthermore, it relies on the assumption that the first resolver # is the only one that matters. Because we disallow multi-layer # queries, this is potentially safe, though eval statements that target # multiple nests may have strange behavior. if isinstance(answer, _SeriesFromNest) and not hasattr(answer, "nest_name"): nest_key = list(kwargs["resolvers"][0].keys())[0] answer.nest_name = kwargs["resolvers"][0][nest_key]._nest_name answer.flat_nest = kwargs["resolvers"][0][nest_key]._flat_nest self._aliases = None return answer
def extract_nest_names( self, expr: str, local_dict=None, global_dict=None, resolvers=(), level: int = 0, target=None, **kwargs, ) -> set[str]: """ Given a string expression, parse it and visit the resulting expression tree, surfacing the nesting types. The purpose is to identify expressions that attempt to mix base and nested columns, or columns from two different nests. """ index_resolvers = self._get_index_resolvers() column_resolvers = self._get_cleaned_column_resolvers() resolvers = resolvers + (_NestResolver(self), column_resolvers, index_resolvers) # Parser needs to be the "nested-pandas" parser. # We also need the same variable context that eval() will have, so that # backtick-quoted names are substituted as expected. env = ensure_scope( level + 1, global_dict=global_dict, local_dict=local_dict, resolvers=resolvers, target=target, ) parsed_expr = Expr(expr, parser="nested-pandas", env=env) expr_tree = parsed_expr.terms separable = _subexprs_by_nest([], expr_tree) return set(separable.keys())
[docs] def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame | None: """Query the columns of a NestedFrame with a boolean expression. Specified queries can target nested columns in addition to the typical column set Parameters ---------- expr : str The query string to evaluate. Access nested columns using `nested_df.nested_col` (where `nested_df` refers to a particular nested dataframe and `nested_col` is a column of that nested dataframe). You can refer to variables in the environment by prefixing them with an '@' character like ``@a + b``. You can refer to column names that are not valid Python variable names by surrounding them in backticks. Thus, column names containing spaces or punctuations (besides underscores) or starting with digits must be surrounded by backticks. (For example, a column named "Area (cm^2)" would be referenced as ```Area (cm^2)```). Column names which are Python keywords (like "list", "for", "import", etc) cannot be used. For example, if one of your columns is called ``a a`` and you want to sum it with ``b``, your query should be ```a a` + b``. inplace : bool Whether to modify the DataFrame rather than creating a new one. **kwargs See the documentation for :meth:`pandas.DataFrame.query` for complete details on the keyword arguments accepted by :meth:`query`. Returns ------- NestedFrame NestedFrame resulting from the provided query expression. Examples -------- >>> from nested_pandas.datasets.generation import generate_data >>> nf = generate_data(5,5, seed=1) >>> nf = nf.query("nested.t > 10") >>> nf a b nested 0 0.417022 0.184677 [{t: 13.40935, flux: 98.886109, flux_error: 1.... 1 0.720324 0.372520 [{t: 13.70439, flux: 68.650093, flux_error: 1.... 2 0.000114 0.691121 [{t: 11.173797, flux: 28.044399, flux_error: 1... 3 0.302333 0.793535 [{t: 17.562349, flux: 1.828828, flux_error: 1.... 4 0.146756 1.077633 [{t: 17.527783, flux: 13.002857, flux_error: 1... Most of the Series and NestedSeries attibutes and methods are available through the query interface. For example, to query based on the length of the nested frames, you can do: >>> nf = nf.query("nested.len() > 2") >>> nf a b nested 0 0.417022 0.184677 [{t: 13.40935, flux: 98.886109, flux_error: 1.... 3 0.302333 0.793535 [{t: 17.562349, flux: 1.828828, flux_error: 1.... 4 0.146756 1.077633 [{t: 17.527783, flux: 13.002857, flux_error: 1... See Also -------- :meth:`pandas.DataFrame.query` Notes ----- Queries that target a particular nested structure return a dataframe with rows of that particular nested structure filtered. For example, querying the NestedFrame "df" with nested structure "my_nested" as below will return all rows of df, but with mynested filtered by the condition: `nf.query("mynested.a > 2")` """ if not isinstance(expr, str): msg = f"expr must be a string to be evaluated, {type(expr)} given" raise ValueError(msg) kwargs["level"] = kwargs.pop("level", 0) + 1 kwargs["target"] = None # At present, the query expression must be either entirely within a # single nest, or have nothing but base columns. Mixed structures are not # supported, so preflight the expression. nest_names = self.extract_nest_names(expr, **kwargs) if len(nest_names) > 1: raise ValueError("Queries cannot target multiple structs/layers, write a separate query for each") result = self.eval(expr, **kwargs) # If the result is a _SeriesFromNest, then the evaluation has caused unpacking, # which means that a nested attribute was referenced. Apply this result # to the nest and repack. Otherwise, apply it to this instance as usual, # since it operated on the base attributes. if isinstance(result, _SeriesFromNest): nest_name, flat_nest = result.nest_name, result.flat_nest # Reset index to "ordinal" like [0, 0, 0, 1, 1, 2, 2, 2] list_index = self[nest_name].array.get_list_index() flat_nest = flat_nest.set_index(list_index) query_result = result.set_axis(list_index) # Selecting flat values matching the query result new_flat_nest = flat_nest[query_result] new_df = self._set_filtered_flat_df(nest_name, new_flat_nest) else: new_df = self.loc[result] if inplace: self._update_inplace(new_df) return None else: return new_df
def _set_filtered_flat_df(self, nest_name, flat_df): """Set a filtered flat dataframe for a nested column Here we assume that flat_df has filtered "ordinal" index, e.g. flat_df.index == [0, 2, 2, 2], while self.index is arbitrary (e.g. ["a", "b", "a"]), and self[nest_name].array.list_index is [0, 0, 1, 1, 1, 2, 2, 2, 2]. """ new_df = self.reset_index(drop=True) new_df[nest_name] = pack_sorted_df_into_struct(flat_df, name=nest_name) return new_df.set_index(self.index) def _resolve_dropna_target(self, on_nested, subset): """resolves the target layer for a given set of dropna kwargs""" nested_cols = self.nested_columns # first check the subset kwarg input subset_target = [] if subset: if isinstance(subset, str): subset = [subset] for col in subset: # Without a ".", always assume base layer if "." not in col: subset_target.append("base") else: layer, col = col.split(".") if layer in nested_cols: subset_target.append(layer) else: raise ValueError(f"layer '{layer}' not found in the base columns") # Check for 1 target subset_target = np.unique(subset_target) if len(subset_target) > 1: # prohibit multi-target operations raise ValueError( f"Targeted multiple nested structures ({subset_target}), write one command per target dataframe" # noqa ) subset_target = str(subset_target[0]) # Next check the on_nested kwarg input if on_nested and on_nested not in nested_cols: raise ValueError("Provided nested layer not found in nested dataframes") # Resolve target layer target = "base" if on_nested and subset_target: if on_nested != subset_target: raise ValueError( f"Provided on_nested={on_nested}, but subset columns are from {subset_target}. Make sure these are aligned or just use subset." # noqa ) else: target = subset_target elif on_nested: target = str(on_nested) elif subset_target: target = str(subset_target) return target, subset
[docs] def dropna( self, *, axis: Axis = 0, how: AnyAll | lib.NoDefault = no_default, thresh: int | lib.NoDefault = no_default, on_nested: bool = False, subset: IndexLabel | None = None, inplace: bool = False, ignore_index: bool = False, ) -> NestedFrame | None: """ Remove missing values for one layer of the NestedFrame. Parameters ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 Determine if rows or columns which contain missing values are removed. * 0, or 'index' : Drop rows which contain missing values. * 1, or 'columns' : Drop columns which contain missing value. Only a single axis is allowed. how : {'any', 'all'}, default 'any' Determine if row or column is removed from DataFrame, when we have at least one NA or all NA. * 'any' : If any NA values are present, drop that row or column. * 'all' : If all values are NA, drop that row or column. thresh : int, optional Require that many non-NA values. Cannot be combined with how. on_nested : str or bool, optional If not False, applies the call to the nested dataframe in the column with label equal to the provided string. If specified, the nested dataframe should align with any columns given in `subset`. subset : column label or sequence of labels, optional Labels along other axis to consider, e.g. if you are dropping rows these would be a list of columns to include. Access nested columns using `nested_df.nested_col` (where `nested_df` refers to a particular nested dataframe and `nested_col` is a column of that nested dataframe). inplace : bool, default False Whether to modify the DataFrame rather than creating a new one. ignore_index : bool, default ``False`` If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. .. versionadded:: 2.0.0 Returns ------- DataFrame or None DataFrame with NA entries dropped from it or None if ``inplace=True``. Examples -------- A common usecase for `dropna` is to remove empty nested rows: >>> from nested_pandas.datasets.generation import generate_data >>> nf = generate_data(5,5, seed=1) >>> # this query empties several of the nested dataframes >>> nf = nf.query("nested.t > 19") >>> nf a b nested 0 0.417022 0.184677 None 1 0.720324 0.372520 [{t: 19.365232, flux: 90.85955, flux_error: 1.... 2 0.000114 0.691121 [{t: 19.157791, flux: 14.672857, flux_error: 1... 3 0.302333 0.793535 None 4 0.146756 1.077633 None >>> # dropna removes rows with those emptied dataframes >>> nf.dropna(subset="nested") a b nested 1 0.720324 0.372520 [{t: 19.365232, flux: 90.85955, flux_error: 1.... 2 0.000114 0.691121 [{t: 19.157791, flux: 14.672857, flux_error: 1... `dropna` can also be used on nested columns: >>> nf = generate_data(5,5, seed=1) >>> # Either on the whole dataframe >>> nf.dropna(on_nested="nested") a b nested 0 0.417022 0.184677 [{t: 8.38389, flux: 31.551563, flux_error: 1.0... 1 0.720324 0.372520 [{t: 13.70439, flux: 68.650093, flux_error: 1.... 2 0.000114 0.691121 [{t: 4.089045, flux: 83.462567, flux_error: 1.... 3 0.302333 0.793535 [{t: 17.562349, flux: 1.828828, flux_error: 1.... 4 0.146756 1.077633 [{t: 0.547752, flux: 75.014431, flux_error: 1.... >>> # or on a specific nested column >>> nf.dropna(subset="nested.t") a b nested 0 0.417022 0.184677 [{t: 8.38389, flux: 31.551563, flux_error: 1.0... 1 0.720324 0.372520 [{t: 13.70439, flux: 68.650093, flux_error: 1.... 2 0.000114 0.691121 [{t: 4.089045, flux: 83.462567, flux_error: 1.... 3 0.302333 0.793535 [{t: 17.562349, flux: 1.828828, flux_error: 1.... 4 0.146756 1.077633 [{t: 0.547752, flux: 75.014431, flux_error: 1.... Notes ----- Operations that target a particular nested structure return a dataframe with rows of that particular nested structure affected. Values for `on_nested` and `subset` should be consistent in pointing to a single layer, multi-layer operations are not supported. """ # determine target dataframe target, subset = self._resolve_dropna_target(on_nested, subset) if target == "base": return super().dropna( axis=axis, how=how, thresh=thresh, subset=subset, inplace=inplace, ignore_index=ignore_index, ) if ignore_index: raise ValueError("ignore_index is not supported for nested columns") if subset is not None: subset = [col.split(".")[-1] for col in subset] target_flat = self[target].explode() target_flat = target_flat.set_index(self[target].array.get_list_index()) if inplace: target_flat.dropna( axis=axis, how=how, thresh=thresh, subset=subset, inplace=True, ) else: target_flat = target_flat.dropna( axis=axis, how=how, thresh=thresh, subset=subset, inplace=False, ) new_df = self._set_filtered_flat_df(nest_name=target, flat_df=target_flat) if inplace: self._update_inplace(new_df) return None return new_df
[docs] def sort_values( self, by, *, axis=0, ascending=True, inplace=False, kind="quicksort", na_position="last", ignore_index=False, key=None, ): """ Sort by the values along either axis. Parameters ---------- by : str or list of str Name or list of names to sort by. Access nested columns using `nested_df.nested_col` (where `nested_df` refers to a particular nested dataframe and `nested_col` is a column of that nested dataframe). axis : {0 or 'index', 1 or 'columns'}, default 0 Axis to be sorted. ascending : bool or list of bool, default True Sort ascending vs. descending. Specify list for multiple sort orders. If this is a list of bools, must match the length of the by. inplace : bool, default False If True, perform operation in-place. kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort' Choice of sorting algorithm. See also ndarray.np.sort for more information. mergesort is the only stable algorithm. For DataFrames, this option is only applied when sorting on a single column or label. na_position : {'first', 'last'}, default 'last' Puts NaNs at the beginning if first; last puts NaNs at the end. ignore_index : bool, default False If True, the resulting axis will be labeled 0, 1, …, n - 1. Always False when applied to nested layers. key : callable, optional Apply the key function to the values before sorting. Returns ------- DataFrame or None DataFrame with sorted values if inplace=False, None otherwise. Examples --------- >>> from nested_pandas.datasets.generation import generate_data >>> nf = generate_data(5,5, seed=1) >>> # Sort nested values >>> nf.sort_values(by="nested.band") a b nested 0 0.417022 0.184677 [{t: 13.40935, flux: 98.886109, flux_error: 1.... 1 0.720324 0.372520 [{t: 13.70439, flux: 68.650093, flux_error: 1.... 2 0.000114 0.691121 [{t: 4.089045, flux: 83.462567, flux_error: 1.... 3 0.302333 0.793535 [{t: 17.562349, flux: 1.828828, flux_error: 1.... 4 0.146756 1.077633 [{t: 0.547752, flux: 75.014431, flux_error: 1.... """ # Resolve target layer target = [] if isinstance(by, str): by = [by] # Check "by" columns for hierarchical references for col in by: if self._is_known_hierarchical_column(col): target.append(col.split(".")[0]) else: target.append("base") # Ensure one target layer, preventing multi-layer operations target = np.unique(target) if len(target) > 1: raise ValueError("Queries cannot target multiple structs/layers, write a separate query for each") target = str(target[0]) # Apply pandas sort_values if target == "base": return super().sort_values( by=by, axis=axis, ascending=ascending, inplace=inplace, kind=kind, na_position=na_position, ignore_index=ignore_index, key=key, ) else: # target is a nested column target_flat = self[target].explode() target_flat = target_flat.set_index(self[target].array.get_list_index()) if target_flat.index.name is None: # set name if not present target_flat.index.name = "index" # Index must always be the first sort key for nested columns nested_by = [target_flat.index.name] + [col.split(".")[-1] for col in by] # Augment the ascending kwarg to include the index if isinstance(ascending, bool): ascending = [True] + [ascending] * len(by) elif isinstance(ascending, list): ascending = [True] + ascending target_flat = target_flat.sort_values( by=nested_by, axis=axis, ascending=ascending, kind=kind, na_position=na_position, ignore_index=False, key=key, inplace=False, ) # Could be optimized, as number of rows doesn't change new_df = self._set_filtered_flat_df(nest_name=target, flat_df=target_flat) if inplace: self._update_inplace(new_df) return None return new_df
@deprecated( version="0.6.0", reason="`reduce` will be removed in version 0.7.0, use `map_rows` instead.", ) def reduce(self, func, *args, infer_nesting=True, append_columns=False, **kwargs) -> NestedFrame: # type: ignore[override] """ Takes a function and applies it to each top-level row of the NestedFrame. The user may specify which columns the function is applied to, with columns from the 'base' layer being passed to the function as scalars and columns from the nested layers being passed as numpy arrays. Parameters ---------- func : callable Function to apply to each nested dataframe. The first arguments to `func` should be which columns to apply the function to. See the Notes for recommendations on writing func outputs. args : positional arguments A list of string column names to pull from the NestedFrame to pass along to the function. If the function has additional arguments, pass them as keyword arguments (e.g. `arg_name=value`). infer_nesting : bool, default True If True, the function will pack output columns into nested structures based on column names adhering to a nested naming scheme. E.g. "nested.b" and "nested.c" will be packed into a column called "nested" with columns "b" and "c". If False, all outputs will be returned as base columns. append_columns : bool, default False if True, the output columns should be appended to those in the original NestedFrame. kwargs : keyword arguments, optional Keyword arguments to pass to the function. Returns ------- `NestedFrame` `NestedFrame` with the results of the function applied to the columns of the frame. Examples -------- >>> from nested_pandas.datasets.generation import generate_data >>> import numpy as np >>> nf = generate_data(5,5, seed=1) >>> >>> # define a custom user function >>> # reduce will return a NestedFrame with two columns >>> def example_func(base_col, nested_col): ... return { ... "mean": np.mean(nested_col), ... "mean_minus_base": np.mean(nested_col) - base_col, ... } >>> >>> # apply the function >>> nf.reduce(example_func, "a", "nested.t") mean mean_minus_base 0 11.533440 11.116418 1 10.307751 9.587426 2 8.294042 8.293928 3 9.655291 9.352958 4 10.687591 10.540836 You may want the result of a `reduce` call to have nested structure, we can achieve this by using the `infer_nesting` kwarg: >>> # define a custom user function that returns nested structure >>> def example_func(base_col1, base_col2, nested_col): ... '''reduce will return a NestedFrame with nested structure''' ... return {"offsets.t_a": nested_col - base_col1, ... "offsets.t_b": nested_col - base_col2} By giving both output columns the prefix "offsets.", we signal to reduce to infer that these should be packed into a nested column called "offsets". >>> # apply the function with `infer_nesting` (True by default) >>> nf.reduce(example_func, "a", "b", "nested.t") offsets 0 [{t_a: 7.966868, t_b: 8.199213}; …] (5 rows) 1 [{t_a: 12.984066, t_b: 13.33187}; …] (5 rows) 2 [{t_a: 4.088931, t_b: 3.397924}; …] (5 rows) 3 [{t_a: 17.260016, t_b: 16.768814}; …] (5 rows) 4 [{t_a: 0.400996, t_b: -0.529882}; …] (5 rows) Notes ----- By default, `reduce` will produce a `NestedFrame` with enumerated column names for each returned value of the function. For more useful naming, it's recommended to have `func` return a dictionary where each key is an output column of the dataframe returned by `reduce` (as shown above). """ # Parse through the initial args to determine the columns to apply the function to requested_columns = [] for arg in args: # Stop when we reach an argument that is not a valid column, as we assume # that the remaining args are extra arguments to the function if not isinstance(arg, str): raise TypeError( f"Received an argument '{arg}' that is not a string. " "All arguments to `reduce` must be strings corresponding to" " column names to pass along to the function. If your function" " has additional arguments, pass them as kwargs (arg_name=value)." ) components = self._parse_hierarchical_components(arg) if not self._is_known_column(components): raise ValueError( f"Received a string argument '{arg}' that was not found in the columns list. " "All arguments to `reduce` must be strings corresponding to" " column names to pass along to the function. If your function" " has additional arguments, pass them as kwargs (arg_name=value)." ) layer = "base" if len(components) < 2 else components[0] col = components[-1] requested_columns.append((layer, col)) # We require the first *args to be the columns to apply the function to if not requested_columns: raise ValueError("No columns in `*args` specified to apply function to") # The remaining args are the extra arguments to the function other than columns extra_args: tuple[Any, ...] = () # empty tuple to make mypy happy if len(requested_columns) < len(args): extra_args = args[len(requested_columns) :] iterators = [] for layer, col in requested_columns: if layer == "base": iterators.append(self[col]) else: iterators.append(self[layer].array.iter_field_lists(col)) results = [func(*cols, *extra_args, **kwargs) for cols in zip(*iterators, strict=True)] results_nf = NestedFrame(results, index=self.index) if infer_nesting: # find potential nested structures from columns nested_cols = list( np.unique( [ column.split(".", 1)[0] for column in results_nf.columns if isinstance(column, str) and "." in column ] ) ) # pack results into nested structures for layer in nested_cols: layer_cols = [col for col in results_nf.columns if col.startswith(f"{layer}.")] rename_df = results_nf[layer_cols].rename(columns=lambda x: x.split(".", 1)[1]) nested_col = pack_lists(rename_df, name=layer) results_nf = results_nf[ [col for col in results_nf.columns if not col.startswith(f"{layer}.")] ] results_nf[layer] = nested_col if append_columns: # Append the results to the original NestedFrame return pd.concat([self, results_nf], axis=1) # Otherwise, return the results as a new NestedFrame return results_nf def _apply_njit_map_rows(self, requested_columns, func): """ Apply njit map_rows to njit custom function with requested_columns. Currently only supports 1 or 2 arguments custom function. """ try: import numba # noqa except ImportError as err: raise ImportError( "njit=True requires numba, please install with pip install numba" "or conda install conda-forge::numba" ) from err from . import njit_funcs if len(requested_columns) == 1: layer, col_name = requested_columns[0] if layer == "base": base_col = np.asarray(self[col_name]) results = njit_funcs._map_rows_njit1_base(func, base_col) else: nested_array = self[layer] offsets = np.asarray(nested_array.array.list_offsets) nested_col = np.asarray(nested_array[col_name]) results = njit_funcs._map_rows_njit1_nested(func, offsets, nested_col) else: # 2 requested columns for 2-arg custom function layer1, col1_name = requested_columns[0] layer2, col2_name = requested_columns[1] if layer1 == "base" and layer2 == "base": base_col1 = np.asarray(self[col1_name]) base_col2 = np.asarray(self[col2_name]) results = njit_funcs._map_rows_njit2_base_base(func, base_col1, base_col2) elif layer1 == "base": base_col1 = np.asarray(self[col1_name]) nested_array2 = self[layer2] offsets = np.asarray(nested_array2.array.list_offsets) col2 = np.asarray(nested_array2[col2_name]) results = njit_funcs._map_rows_njit2_base_nest(func, base_col1, offsets, col2) elif layer2 == "base": nested_array1 = self[layer1] offsets = np.asarray(nested_array1.array.list_offsets) col1 = np.asarray(nested_array1[col1_name]) base_col2 = np.asarray(self[col2_name]) results = njit_funcs._map_rows_njit2_nest_base(func, offsets, col1, base_col2) else: nested_array1 = self[layer1] nested_array2 = self[layer2] offsets1 = np.asarray(nested_array1.array.list_offsets) offsets2 = np.asarray(nested_array2.array.list_offsets) col1 = np.asarray(nested_array1[col1_name]) col2 = np.asarray(nested_array2[col2_name]) results = njit_funcs._map_rows_njit2_nest_nest(func, offsets1, offsets2, col1, col2) return results.tolist()
[docs] def map_rows( self, func: Callable[..., Any], columns: None | str | list[str] = None, *, row_container: Literal["dict"] | Literal["args"] = "dict", output_names: None | str | list[str] = None, infer_nesting: bool = True, append_columns: bool = False, njit: bool = False, **kwargs, ) -> NestedFrame: # type: ignore[override] """ Takes a function and applies it to each top-level row of the NestedFrame. Nested columns are packaged alongside base columns and available for function use, where base columns are passed as scalars and nested columns are passed as numpy arrays. The way in which the row data is packaged is configurable (by default, a dictionary) and controlled by the `row_container` argument. Parameters ---------- func : callable Function to apply to each nested dataframe. The first arguments to `func` should be which columns to apply the function to. See the Notes for recommendations on writing func outputs. columns : None | str | list of str Specifies which columns to pass to the function in the row_container format. If None, all columns are passed. If list of str, those columns are passed. If str, a single column is passed or if the string is a nested column, then all nested sub-columns are passed (e.g. columns="nested" passes all columns of the nested dataframe "nested"). To pass individual nested sub-columns, use the hierarchical column name (e.g. columns=["nested.t",...]). row_container : 'dict' or 'args', default 'dict' Specifies how the row data will be packaged when passed as an input to the function. If 'dict', the function will be called as `func({"col1": value, ...}, **kwargs)`, so func should expect a single dictionary input with keys corresponding to column names. If 'args', the function will be called as `func(value, ..., **kwargs)`, so func should expect positional arguments corresponding to the columns specified in `args`. output_names : None | str | list of str Specifies the names of the output columns in the resulting NestedFrame. If None, the function will return whatever names the user function returns. If specified will override any names returned by the user function provided the number of names matches the number of outputs. When not specified and the user function returns values without names (e.g. a list or tuple), the output columns will be enumerated (e.g. "0", "1", ...). infer_nesting : bool, default True If True, the function will pack output columns into nested structures based on column names adhering to a nested naming scheme. E.g. "nested.b" and "nested.c" will be packed into a column called "nested" with columns "b" and "c". If False, all outputs will be returned as base columns. Note that this will trigger off of names specified in `output_names` in addition to names returned by the user function. append_columns : bool, default False If True, the output columns are appended to those in the original NestedFrame. The output columns can contain nested sub-columns, which should be specified using their hierarchical column name (e.g. "nested.x"). If their base nested column exists in the original NestedFrame, the new output sub-columns will be added into the frame of the existing nested column. See an example below. njit : bool, default False If Ture, the function will try to use numba's njit to speed up the execution. This will only work if the custom function is compatible with njit and the requested columns are at most two. Note that using njit will disable support for `row_container="dict"`. kwargs : keyword arguments, optional Keyword arguments to pass to the function. Returns ------- `NestedFrame` `NestedFrame` with the results of the function applied to the columns of the frame. Examples -------- >>> from nested_pandas.datasets.generation import generate_data >>> import numpy as np >>> nf = generate_data(5,5, seed=1) >>> # define a custom user function >>> # map_rows will return a NestedFrame with two columns >>> def example_func(row): ... return np.mean(row["nested.t"]), np.mean(row["nested.t"]) - row["a"] >>> # apply the function >>> nf.map_rows(example_func, output_names=["mean", "mean_minus_base"]) mean mean_minus_base 0 11.533440 11.116418 1 10.307751 9.587426 2 8.294042 8.293928 3 9.655291 9.352958 4 10.687591 10.540836 We can pass along only the columns we need for the function using the `columns` argument, which removes the performance overhead of packaging all columns for each row: >>> nf.map_rows(example_func, columns=["a", "nested.t"], output_names=["mean", "mean_minus_base"]) mean mean_minus_base 0 11.533440 11.116418 1 10.307751 9.587426 2 8.294042 8.293928 3 9.655291 9.352958 4 10.687591 10.540836 Alternatively, we can pass along the row data as positional arguments instead of a dictionary by setting `row_container="args"` and adjusting our function signature accordingly: >>> def example_func(a, time): ... return np.mean(time), np.mean(time) - a >>> nf.map_rows(example_func, ... columns=["a", "nested.t"], ... output_names=["mean", "mean_minus_base"], ... row_container="args") mean mean_minus_base 0 11.533440 11.116418 1 10.307751 9.587426 2 8.294042 8.293928 3 9.655291 9.352958 4 10.687591 10.540836 Additional arguments that don't depend on row data can be passed as kwargs: >>> def example_func(row, scale): ... return np.mean(row["nested.t"]) * scale >>> nf.map_rows(example_func, columns=["nested.t"], output_names="mean", scale=1) mean 0 11.533440 1 10.307751 2 8.294042 3 9.655291 4 10.687591 Functions that target a single nested structure can just pass along the nested column name and all sub-columns will be available: >>> def first_val(row): ... return {"first_"+key.split(".")[1]:row[key][0] for key in row.keys()} >>> nf.map_rows(first_val, columns="nested") first_t first_flux first_flux_error first_band 0 8.383890 31.551563 1.0 r 1 13.704390 68.650093 1.0 g 2 4.089045 83.462567 1.0 g 3 17.562349 1.828828 1.0 g 4 0.547752 75.014431 1.0 g You may want the result of a `map_rows` call to have nested structure, we can achieve this by using the `infer_nesting` kwarg: >>> # define a custom user function that returns nested structure >>> def example_func(row): ... '''map_rows will return a NestedFrame with nested structure''' ... return {"offsets.t_a": row["nested.t"] - row["a"], ... "offsets.t_b": row["nested.t"] - row["b"]} By giving both output columns the prefix "offsets.", we signal to map_rows to infer that these should be packed into a nested column called "offsets". >>> # apply the function with `infer_nesting` (True by default) >>> nf.map_rows(example_func, columns=["a", "b", "nested.t"], infer_nesting=True) offsets 0 [{t_a: 7.966868, t_b: 8.199213}; …] (5 rows) 1 [{t_a: 12.984066, t_b: 13.33187}; …] (5 rows) 2 [{t_a: 4.088931, t_b: 3.397924}; …] (5 rows) 3 [{t_a: 17.260016, t_b: 16.768814}; …] (5 rows) 4 [{t_a: 0.400996, t_b: -0.529882}; …] (5 rows) You may also want to append the output columns to the original NestedFrame. We can achieve this by using the `append_columns` kwarg: >>> # define a custom user function that creates a nested sub-column >>> def example_func(row): ... '''map_rows will return a sub-column for the existing 'nested' column''' ... return row["nested.t"] - row["a"] >>> # apply the function with `append_columns` (False by default) >>> nf.map_rows(example_func, ... columns=["a", "nested.t"], ... output_names=["nested.t_a"], ... append_columns=True) a b nested 0 0.417022 0.184677 [{t: 8.38389, flux: 31.551563, flux_error: 1.0... 1 0.720324 0.372520 [{t: 13.70439, flux: 68.650093, flux_error: 1.... 2 0.000114 0.691121 [{t: 4.089045, flux: 83.462567, flux_error: 1.... 3 0.302333 0.793535 [{t: 17.562349, flux: 1.828828, flux_error: 1.... 4 0.146756 1.077633 [{t: 0.547752, flux: 75.014431, flux_error: 1.... Notes ----- If concerned about performance, specify `columns` to only include the columns needed for the function, as this will avoid the overhead of packaging all columns for each row. By default, `map_rows` will produce a `NestedFrame` with enumerated column names for each returned value of the function. It's recommended to either specify `output_names` or have `func` return a dictionary where each key is an output column of the dataframe returned by `map_rows` (as shown above). >>> def example_func(row): ... return np.mean(row["nested.t"]), np.mean(row["nested.t"]) - row["a"] >>> # first output column will be named "0", second "1" >>> nf.map_rows(example_func) 0 1 0 11.533440 11.116418 1 10.307751 9.587426 2 8.294042 8.293928 3 9.655291 9.352958 4 10.687591 10.540836 """ # Determine args if columns is None: # If None, pass all columns, with nested columns expanded to sub-columns columns = self.base_columns + self.get_subcolumns(nested_columns="all") elif isinstance(columns, str): # If it's a nested column, grab all sub-columns columns = self.get_subcolumns(columns) if columns in self.nested_columns else [columns] # Check arg validity requested_columns = [] for arg in columns: if not isinstance(arg, str): raise TypeError( f"Received an argument '{arg}' that is not a string. " "All arguments to `map_rows` must be strings corresponding to" " column names to pass along to the function." ) components = self._parse_hierarchical_components(arg) if not self._is_known_column(components): raise ValueError( f"Received a string argument '{arg}' that was not found in the columns list. " "All arguments to `map_rows` must be strings corresponding to" " column names to pass along to the function." ) layer = "base" if len(components) < 2 else components[0] col = components[-1] requested_columns.append((layer, col)) # Construct row containers and apply results = [] if row_container == "dict": if njit: raise ValueError( "njit execution is not supported for `row_container='dict'`, " "use `row_container='args'` instead." ) arg_dict = {} for layer, col in requested_columns: if layer == "base": arg_dict[col] = self[col] else: arg_dict[".".join([layer, col])] = self[layer].array.iter_field_lists(col) results = [ func( {col: val for col, val in zip(arg_dict.keys(), row, strict=True)}, **kwargs, ) for row in zip(*arg_dict.values(), strict=True) ] elif row_container == "args": if njit: try: results = self._apply_njit_map_rows(requested_columns, func) # except Exception as err: except Exception as err: raise ValueError( "njit execution for map_rowsis only supported for " "numba.jit decorated functions with at most 2 arguments" ) from err else: # Default python execution iterators = [] for layer, col in requested_columns: if layer == "base": iterators.append(self[col]) else: iterators.append(self[layer].array.iter_field_lists(col)) results = [func(*cols, **kwargs) for cols in zip(*iterators, strict=True)] # If the func returns a single array per row wrap results in a `NestedSeries`. # Otherwise, Pandas will try to expand array elements into separate columns. if results and isinstance(results[0], np.ndarray): results_nf = NestedFrame(NestedSeries(results, index=self.index)) else: results_nf = NestedFrame(results, index=self.index) # Override output names if specified if output_names is not None: if isinstance(output_names, str): output_names = [output_names] if len(output_names) != len(results_nf.columns): raise ValueError( f"Number of output names ({len(output_names)}) does not match " f"the number of outputs from the function ({len(results_nf.columns)})" ) results_nf.columns = output_names if infer_nesting: # find potential nested structures from columns nested_cols = list( np.unique( [ column.split(".", 1)[0] for column in results_nf.columns if isinstance(column, str) and "." in column ] ) ) # pack results into nested structures for layer in nested_cols: layer_cols = [col for col in results_nf.columns if col.startswith(f"{layer}.")] rename_df = results_nf[layer_cols].rename(columns=lambda x: x.split(".", 1)[1]) nested_col = pack_lists(rename_df, name=layer) results_nf = results_nf[ [col for col in results_nf.columns if not col.startswith(f"{layer}.")] ] results_nf[layer] = nested_col if append_columns: # Append sub-columns to existing nested columns self_nested_cols = [col for col in results_nf.nested_columns if col in self.nested_columns] for col in self_nested_cols: sub_columns = results_nf.get_subcolumns(col) for sub_col in sub_columns: self = self.assign(**{f"{sub_col}": results_nf[sub_col]}) # Append other base and nested columns base_results_nf = results_nf.drop(columns=self_nested_cols) return pd.concat([self, base_results_nf], axis=1) # Otherwise, return the results as a new NestedFrame return results_nf
[docs] def to_pandas(self, list_struct=False, large_list=False) -> pd.DataFrame: """Convert to an ordinal pandas DataFrame, with no NestedDtype series. NestedDtype is cast to pd.ArrowDtype Parameters ---------- list_struct: bool If True, cast nested columns to pandas struct-list arrow extension array columns. If False (default), cast nested columns to list-struct array columns. large_list : bool If False (default), use regular ``list_`` (int32 offsets). Set to True to use ``large_list`` (int64 offsets), which is required when the total number of nested elements across all rows exceeds ``2**31 - 1``. Returns ------- pd.DataFrame Ordinal pandas DataFrame. Examples -------- >>> from nested_pandas.datasets.generation import generate_data >>> nf = generate_data(5,5, seed=1) >>> nf.to_pandas() a b nested 0 0.417022 0.184677 {'t': array([ 8.38389029, 13.4093502 , 16.0148... 1 0.720324 0.372520 {'t': array([13.70439001, 8.34609605, 19.3652... 2 0.000114 0.691121 {'t': array([ 4.08904499, 11.17379657, 6.2684... 3 0.302333 0.793535 {'t': array([17.56234873, 2.80773877, 13.8464... 4 0.146756 1.077633 {'t': array([ 0.54775186, 3.96202978, 17.5277... """ df = pd.DataFrame(self) for col in self.nested_columns: df[col] = df[col].array.to_arrow_ext_array(list_struct=list_struct, large_list=large_list) return df
[docs] def to_parquet(self, path, large_list=False, **kwargs) -> None: """Creates parquet file(s) with the data of a NestedFrame, either as a single parquet file where each nested dataset is packed into its own column or as an individual parquet file for each layer. Note that here we always opt to use the pyarrow engine for writing parquet files. Parameters ---------- path : str The path to the parquet file large_list : bool If False (default), use regular ``list_`` (int32 offsets). Set to True to use ``large_list`` (int64 offsets), which is required when the total number of nested elements across all rows exceeds ``2**31 - 1``. kwargs : keyword arguments, optional Keyword arguments to pass to `pyarrow.parquet.write_table <https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html>`_ Returns ------- None Examples -------- >>> from nested_pandas.datasets.generation import generate_data >>> nf = generate_data(5,5, seed=1) >>> nf.to_parquet("nestedframe.parquet") # doctest: +SKIP """ df = self.to_pandas(list_struct=False, large_list=large_list) # Write through pyarrow # This is potentially not zero-copy # Note: Without pandas metadata, index writing is not as robust set # preserve_index=None for best behavior but index will generally # need to be set manually on load table = pa.Table.from_pandas(df, preserve_index=None) # Drop pandas metadata to make sure nesteddtypes are not preserved # Do this by rebuilding the schema table = table.cast(pa.schema([field for field in table.schema])) return pq.write_table(table, path, **kwargs)