# typing.Self and "|" union syntax don't exist in Python 3.9
from __future__ import annotations
from collections import defaultdict
from collections.abc import Callable
from typing import Literal
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from deprecated import deprecated
from pandas._libs import lib
from pandas._typing import Any, AnyAll, Axis, Hashable, IndexLabel, Mapping
from pandas.api.extensions import no_default
from pandas.core.computation.eval import Expr, ensure_scope
from pandas.core.dtypes.common import is_bool_dtype
from pandas.core.dtypes.inference import is_list_like
from nested_pandas.nestedframe.expr import (
_identify_aliases,
_NestResolver,
_SeriesFromNest,
_subexprs_by_nest,
)
from nested_pandas.series.dtype import NestedDtype
from nested_pandas.series.ext_array import NestedExtensionArray
from nested_pandas.series.nestedseries import NestedSeries
from nested_pandas.series.packer import pack, pack_lists, pack_sorted_df_into_struct
# NOTE: these calls run at import time and change pandas' *global* display options for the
# whole process, not just for NestedFrame. `NestedFrame._repr_html_` reads both options:
# "display.max_rows" is the row count above which the HTML output is truncated, and
# "display.min_rows" is the number of rows rendered once truncation kicks in.
pd.set_option("display.max_rows", 30)
pd.set_option("display.min_rows", 5)
[docs]
class NestedFrame(pd.DataFrame):
"""A Pandas Dataframe extension with support for nested structure.
See https://pandas.pydata.org/docs/development/extending.html#subclassing-pandas-data-structures
"""
# https://pandas.pydata.org/docs/development/extending.html#arithmetic-with-3rd-party-types
# The __pandas_priority__ of DataFrame is 4000, so give NestedFrame a
# higher priority, so that binary operations involving this class and
# Series produce instances of this class, preserving the type and origin.
__pandas_priority__ = 4500
# The "_aliases" attribute is usually None or not even present, but when it is present,
# it indicates that an evaluation is in progress, and that columns and fields with names
# that are not identifier-like have been aliased to cleaned names. In that case this
# attribute contains those aliases, keyed by the cleaned name.
_metadata = ["_aliases"]
[docs]
def __init__(self, *args, **kwargs) -> None:
    """Build the frame like a ``pandas.DataFrame``, then convert eligible arrow columns to nested.

    Parameters
    ----------
    *args, **kwargs
        Forwarded unchanged to the parent ``pandas.DataFrame`` constructor.
    """
    super().__init__(*args, **kwargs)
    # struct_list=False: only list-struct arrow columns are cast here; struct-typed
    # arrow columns are left as they are (see `_cast_cols_to_nested`).
    self._cast_cols_to_nested(struct_list=False)
def _cast_cols_to_nested(self, *, struct_list: bool) -> None:
    """Convert supported arrow-typed columns of this frame to nested columns, in place.

    Columns whose dtype is not a ``pd.ArrowDtype``, or whose arrow type is rejected by
    ``NestedExtensionArray.is_input_pa_type_supported``, are left untouched.

    Parameters
    ----------
    struct_list : bool
        If `False` cast list-struct columns only. If `True`, also
        try to cast struct-list columns validating if they have
        valid nested structure.
    """
    for col_name, col_dtype in self.dtypes.items():
        if isinstance(col_dtype, pd.ArrowDtype):
            arrow_type = col_dtype.pyarrow_dtype
            # Struct-typed columns are only considered when the caller opted in.
            skip_struct = (not struct_list) and pa.types.is_struct(arrow_type)
            if not skip_struct and NestedExtensionArray.is_input_pa_type_supported(arrow_type):
                self[col_name] = NestedExtensionArray(pa.array(self[col_name]))
@property
def _constructor(self) -> type[NestedFrame]:
    """Return the class used to build new frames from this one.

    This is one of the subclassing hooks described in the pandas "extending" guide linked in
    the class docstring. It always returns ``NestedFrame`` itself (the class, not an instance).
    """
    return NestedFrame
@property
def _constructor_expanddim(self) -> type[NestedFrame]:
    """Return the class for the "expanded dimension" subclassing hook.

    Like `_constructor`, this always returns ``NestedFrame`` itself (the class, not an instance).
    NOTE(review): pandas documents this hook for results with one more dimension than the
    caller; confirm that answering with the same class is intended.
    """
    return NestedFrame
@property
def all_columns(self) -> dict:
    """Map "base" and every nested column name to the column labels it contains.

    Returns
    -------
    dict
        ``"base"`` maps to ``self.columns`` (all top-level labels); each nested column
        name maps to the ``columns`` of that nested column.
    """
    column_map = {"base": self.columns}
    column_map.update(
        {name: self[name].columns for name in self.columns if isinstance(self.dtypes[name], NestedDtype)}
    )
    return column_map
@property
def nested_columns(self) -> list:
    """Return the names of all columns whose dtype is a ``NestedDtype``.

    The names are collected by walking the column labels and dtypes side by side rather than
    by building a boolean mask with ``Series.apply``. For a frame with no columns ``apply``
    returns an empty *object*-dtype result, which is not a usable boolean mask; the direct
    walk simply yields an empty list.

    Returns
    -------
    list
        Nested column names, in frame order.
    """
    return [col for col, dtype in zip(self.columns, self.dtypes) if isinstance(dtype, NestedDtype)]
@property
def base_columns(self) -> list[str]:
    """Return the names of all base (non-nested) columns.

    The names are collected by walking the column labels and dtypes side by side rather than
    by building a boolean mask with ``Series.apply``. For a frame with no columns ``apply``
    returns an empty *object*-dtype result, which is not a usable boolean mask; the direct
    walk simply yields an empty list.

    Returns
    -------
    list of str
        Names of the columns whose dtype is not a ``NestedDtype``, in frame order.
    """
    return [col for col, dtype in zip(self.columns, self.dtypes) if not isinstance(dtype, NestedDtype)]
def _repr_html_(self) -> str | None:
    """Return the HTML representation, rendering each nested cell as a small inline table.

    Frames without nested columns (or without rows) are rendered through
    ``DataFrame.to_html``. Otherwise a pandas ``Styler`` is used and every nested cell is
    formatted by ``repack_row`` below. Truncation follows the pandas options
    ``display.max_rows`` (threshold) and ``display.min_rows`` (rows shown when truncated).

    Returns
    -------
    str or None
        The HTML markup.
    """
    max_rows = pd.get_option("display.max_rows")
    min_rows = pd.get_option("display.min_rows")

    # Without nested columns (or empty), just do representation as normal
    if len(self.nested_columns) == 0 or len(self) == 0:
        # This mimics pandas behavior
        if max_rows is None:
            # No row limit configured: pass max_rows=None through to to_html
            return super().to_html(max_rows=None, show_dimensions=True)
        if self.shape[0] > max_rows:
            return super().to_html(max_rows=min_rows, show_dimensions=True)
        return super().to_html(max_rows=max_rows, show_dimensions=True)

    # Nested Column Formatting
    # Display nested columns as small html dataframes with a single row
    def repack_row(chunk, header=True):
        # If the chunk is None or empty, return None (displayed same as Null)
        if chunk is None or len(chunk) == 0:
            return None
        n_rows = len(chunk)
        if n_rows <= 2:
            # For 1 or 2 rows, show all rows without a footer
            chunk = chunk.round(8)
            max_rows_html = n_rows
        else:
            # For 3+ rows, show first row and a "+N rows" footer
            chunk = chunk.head(1).round(8)
            # Cast to object so the data row and the textual footer row can share columns.
            # The result must be re-bound: `astype` returns a new frame.
            chunk = chunk.astype({col: object for col in chunk.columns})
            len_row = pd.DataFrame(
                {
                    col: [f"<i>+{n_rows - 1} rows</i>"] if i == 0 else ["..."]
                    for i, col in enumerate(chunk.columns)
                }
            )
            chunk = pd.concat([chunk, len_row], ignore_index=True)
            max_rows_html = 2
        # escape=False so the <i>...</i> footer markup is emitted as HTML
        return chunk.to_html(
            max_rows=max_rows_html,
            max_cols=5,
            show_dimensions=False,
            index=False,
            header=header,
            escape=False,
        )

    # Handle sizing, trim html dataframe if output will be truncated
    df_shape = self.shape  # grab original shape information for later
    truncated = max_rows is not None and df_shape[0] > max_rows
    html_df = self.head(min_rows + 1) if truncated else self.copy()

    # replace index to ensure proper behavior for duplicate index values
    index_values = html_df.index
    html_df = html_df.reset_index(drop=True)
    styler = html_df.style.format({col: repack_row for col in self.nested_columns})

    # Map the positional index used for rendering back to the original index labels
    def map_true_index(index):
        return index_values[index]

    styler = styler.format_index(map_true_index, axis=0)

    # Recover some truncation formatting, limited to head truncation
    if max_rows is None:
        # No row limit configured. Note that this branch returns without the dimension footer.
        return styler.to_html(max_rows=0)
    if truncated:
        # when over the max_rows threshold, display with truncation ("..." row at the end)
        html_repr = styler.to_html(max_rows=min_rows)
    else:
        # when under the max_rows threshold, display all rows (behavior of 0 here)
        html_repr = styler.to_html(max_rows=0)

    # Manually append dimensionality to a styler output
    html_repr += f"{df_shape[0]} rows x {df_shape[1]} columns"
    return html_repr
def _parse_hierarchical_components(self, delimited_path: str, delimiter: str = ".") -> list[str]:
"""
Given a string that may be a delimited path, parse it into its components,
respecting backticks that are used to protect component names that may contain the delimiter.
"""
aliases = getattr(self, "_aliases", None)
if aliases is None:
delimited_path, aliases = _identify_aliases(delimited_path)
return [aliases.get(x, x) for x in delimited_path.split(delimiter)]
def _is_known_hierarchical_column(self, components: list[str] | str) -> bool:
"""Determine whether a string is a known hierarchical column name"""
if isinstance(components, str):
components = self._parse_hierarchical_components(components)
if len(components) < 2:
return False
base_name = components[0]
if self._is_nested_column(base_name):
nested_name = ".".join(components[1:])
return nested_name in self.dtypes[base_name].column_dtypes
return False
def _is_nested_column(self, col: str):
return col in self.columns and isinstance(self.dtypes[col], NestedDtype)
def _is_known_column(self, components: list[str] | str) -> bool:
"""Determine whether a list of field components describes a known column name"""
if isinstance(components, str):
components = self._parse_hierarchical_components(components)
if ".".join(components) in self.columns:
return True
return self._is_known_hierarchical_column(components)
def __getitem__(self, item):
"""Adds custom __getitem__ behavior for nested columns"""
if isinstance(item, str):
return self._getitem_str(item)
elif self._is_key_list(item):
return self._getitem_list(item)
return super().__getitem__(item)
def _getitem_str(self, item):
if self._is_nested_column(item):
return NestedSeries(super().__getitem__(item))
# Preempt the nested check if the item is a base column, with or without
# dots and backticks.
if item in self.columns:
return super().__getitem__(item)
components = self._parse_hierarchical_components(item)
# One more check on the entirety of the item name, in case backticks were used
# (even if they weren't necessary).
cleaned_item = ".".join(components)
if cleaned_item in self.columns:
return super().__getitem__(cleaned_item)
# If a nested column name is passed, return a flat series for that column
# flat series is chosen over list series for utility
# e.g. native ability to do something like ndf["nested.a"] + 3
if self._is_known_hierarchical_column(components):
nested = components[0]
field = ".".join(components[1:])
return self[nested].nest.to_flat(columns=[field])[field]
else:
raise KeyError(f"Column '{cleaned_item}' not found in nested columns or base columns")
@staticmethod
def _is_key_list(item):
if not is_list_like(item):
return False
if is_bool_dtype(item):
return False
return all(isinstance(k, str) for k in item)
def _getitem_list(self, item):
unknown_cols = [k for k in item if not self._is_known_column(k)]
if unknown_cols:
raise KeyError(f"{unknown_cols} not in index")
non_nested_keys = [k for k in item if k in self.columns]
result = super().__getitem__(non_nested_keys).copy()
components = [self._parse_hierarchical_components(k) for k in item]
nested_components = [c for c in components if self._is_known_hierarchical_column(c)]
nested_columns = defaultdict(list)
for comps in nested_components:
nested_columns[comps[0]].append(".".join(comps[1:]))
for c in nested_columns:
result[c] = self[c].nest[nested_columns[c]]
return result
def __setitem__(self, key, value):
    """Assign a column, with extra handling for nested columns and "nested.field" paths.

    Supported forms, checked in this order:

    * ``nf["name"] = frame`` where every column of `frame` is nested: the nested columns are
      merged into a single nested column, which is then assigned like any other value.
    * ``nf["new_name"] = frame``: the frame is packed and joined as a new nested column
      (via `join_nested` defaults, i.e. a left join on the index).
    * ``nf["nested.field"] = values``: add or replace a field of an existing nested column.
      This requires building a new nested structure.
    * ``nf["new_nested.field"] = values``: create a new nested column from the values.
    * anything else: regular ``pandas.DataFrame`` assignment, followed by casting supported
      arrow columns to nested.

    Raises
    ------
    ValueError
        If a hierarchical key has more than one level of nesting.
    """
    # Keys that are not strings cannot be hierarchical paths (they cannot be split on "."),
    # so they are handed to pandas directly.
    if not isinstance(key, str):
        super().__setitem__(key, value)
        self._cast_cols_to_nested(struct_list=False)
        return

    # Special handling paths for assignment of dataframes to nested columns.
    # NestedFrame subclasses DataFrame, so one isinstance check covers both.
    if isinstance(value, pd.DataFrame):
        value_dtypes = list(value.dtypes)
        # The non-empty check keeps a frame without columns out of the merge branch,
        # where there would be no first nested column to start from.
        if value_dtypes and all(isinstance(dtype, NestedDtype) for dtype in value_dtypes):
            # if all columns are NestedDtype, combine them into a single nested column
            remaining = iter(value.columns)
            new_nested = value[next(remaining)]
            for col in remaining:
                # there must be a better way than through list columns
                list_cols = value[col].to_lists()
                for column in value[col].columns:
                    new_nested = new_nested.nest.set_list_column(column, list_cols[column])
            value = new_nested
        # Assign a DataFrame as a new column, auto-nesting it
        elif key not in self.columns:
            # Note this uses the default approach for join_nested, which is a left join on index
            new_df = self.join_nested(value, name=key)
            self._update_inplace(new_df)
            return

    components = self._parse_hierarchical_components(key)

    # Replacing or adding columns to a nested structure
    # Allows statements like ndf["nested.t"] = ndf["nested.t"] - 5
    # Or ndf["nested.base_t"] = ndf["nested.t"] - 5
    # TODO: Support assignment of a new column to an existing nested col from a list series
    if self._is_known_hierarchical_column(components) or (
        len(components) > 1 and components[0] in self.nested_columns
    ):
        if len(components) != 2:
            raise ValueError(f"Only one level of nesting is supported; given {key}")
        nested, field = components
        # Support a special case of embedding a base column into a nested column, with values being
        # repeated in each nested list-array.
        if isinstance(value, pd.Series) and self.index.equals(value.index):
            new_nested_series = self[nested].nest.set_filled_column(field, value)
        else:
            new_nested_series = self[nested].nest.set_flat_column(field, value)
        return super().__setitem__(nested, new_nested_series)

    # Adding a new nested structure from a column
    # Allows statements like ndf["new_nested.t"] = ndf["nested.t"] - 5
    if len(components) > 1:
        if len(components) != 2:
            raise ValueError(f"Only one level of nesting is supported; given {key}")
        new_nested, field = components
        if isinstance(value, pd.Series):
            # to_frame(name=...) names the column without renaming the caller's Series in place
            value = value.to_frame(name=field)
        new_df = self.join_nested(value, name=new_nested)
        self._update_inplace(new_df)
        return None

    super().__setitem__(key, value)
    self._cast_cols_to_nested(struct_list=False)
def __delitem__(self, key):
"""Delete a column or a nested field using dot notation (e.g., del nf['nested.x'])"""
self.drop([key], axis=1, inplace=True)
[docs]
def get_subcolumns(self, nested_columns="all") -> list[str]:
    """Returns a list of all subcolumn names from a set of nested columns, in dot notation

    Parameters
    ----------
    nested_columns : 'all' or str or list of str, optional
        The nested columns to get subcolumns from. Default is 'all', which means all nested columns.
        Any other string is taken as the name of a single nested column.

    Returns
    -------
    list of str
        A list of subcolumn names in dot notation, e.g. 'nested.a'

    Examples
    --------
    >>> from nested_pandas.datasets import generate_data
    >>> nf = generate_data(5,10, seed=1)
    >>> nf["nested2"] = nf["nested"] # create a second nested column for demonstration
    >>> nf.get_subcolumns() # doctest: +NORMALIZE_WHITESPACE
    ['nested.t', 'nested.flux', 'nested.flux_error', 'nested.band',
    'nested2.t', 'nested2.flux', 'nested2.flux_error', 'nested2.band']
    >>> nf.get_subcolumns("nested")
    ['nested.t', 'nested.flux', 'nested.flux_error', 'nested.band']
    """
    # Only compare against "all" when the argument really is a string: an array-like
    # compared with == yields an element-wise result whose truth value is ambiguous.
    if isinstance(nested_columns, str):
        nested_columns = self.nested_columns if nested_columns == "all" else [nested_columns]

    # No explicit check of the names here: a wrong column name fails in the lookup itself.
    return [
        f"{nested_column}.{col}"
        for nested_column in nested_columns
        for col in self[nested_column].columns
    ]
@deprecated(
    version="0.6.0",
    reason="`add_nested` will be removed in version 0.7.0, use `join_nested` instead.",
)
def add_nested(
    self,
    obj,
    name: str,
    *,
    how: str = "left",
    on: None | str | list[str] = None,
    dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
) -> Self:  # type: ignore[name-defined] # noqa: F821
    """Packs input object to a nested column and adds it to the NestedFrame

    Deprecated alias of `join_nested`: every argument is passed through to it unchanged.
    This method returns a new NestedFrame with the added nested column.

    Parameters
    ----------
    obj : pd.DataFrame or a sequence of items convertible to nested structures
        The object to be packed into nested pd.Series and added to
        the NestedFrame. If a DataFrame is passed, it must have non-unique
        index values, which are used to pack the DataFrame. If a sequence
        of elements is passed, it is packed into a nested pd.Series.
        Sequence elements may be individual pd.DataFrames, dictionaries
        (keys are nested column names, values are arrays of the same
        length), or any other object convertible to pa.StructArray.
        Additionally, None and pd.NA are allowed as elements to represent
        missing values.
    name : str
        The name of the nested column to be added to the NestedFrame.
    how : {'left', 'right', 'outer', 'inner'}, default: 'left'
        Join strategy; `join_nested` passes it on to ``pandas.DataFrame.join``:

        - left: use the calling frame's index (or the `on` column(s)).
        - right: use the packed object's index.
        - outer: form union of the calling frame's index (or the `on`
          column(s)) with the packed object's index, and sort it
          lexicographically.
        - inner: form intersection of the calling frame's index (or the
          `on` column(s)) with the packed object's index, preserving the
          order of the calling frame's.
    on : str or list of str, default: None
        Column(s) in the calling frame to join on instead of the index.
        The original index is always preserved. The column(s) are used
        only as join keys and are dropped from the nested structure.
    dtype : dtype or None
        NestedDtype to use for the nested column; pd.ArrowDtype or
        pa.DataType can also be used to specify the nested dtype. If None,
        the dtype is inferred from the input object.

    Returns
    -------
    NestedFrame
        A new NestedFrame with the added nested column.

    Examples
    --------
    >>> import nested_pandas as npd
    >>> nf = npd.NestedFrame({"a": [1, 2, 3], "b": [4, 5, 6]},
    ...                      index=[0,1,2])
    >>> nf2 = npd.NestedFrame({"c":[1,2,3,4,5,6,7,8,9]},
    ...                       index=[0,0,0,1,1,1,2,2,2])
    >>> # By default, aligns on the index
    >>> nf.add_nested(nf2, "nested")  # doctest: +NORMALIZE_WHITESPACE
    a b nested
    0 1 4 [{c: 1}; …] (3 rows)
    1 2 5 [{c: 4}; …] (3 rows)
    2 3 6 [{c: 7}; …] (3 rows)
    >>> # We can also align on columns. The index is preserved.
    >>> nf = npd.NestedFrame({"a": [1,2,2,3], "b": [4,4,5,6]}).set_index(["a", "b"])
    >>> nf2 = npd.NestedFrame({"a": [1,2,2,2], "b": [4,4,4,5], "c": [1,2,3,4]})
    >>> nf.join_nested(nf2, "nested", on=["a", "b"]) # doctest: +NORMALIZE_WHITESPACE
    nested
    a b
    1 4 [{c: 1}]
    2 4 [{c: 2}; …] (2 rows)
    5 [{c: 4}]
    3 6 None
    """
    return self.join_nested(obj, name, how=how, on=on, dtype=dtype)
[docs]
def join_nested(
    self,
    obj,
    name: str,
    *,
    how: str = "left",
    on: None | str | list[str] = None,
    dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
) -> Self:  # type: ignore[name-defined] # noqa: F821
    """Pack `obj` into a nested column and join it onto a copy of this frame.

    The calling frame is not modified; a new NestedFrame is returned.

    Parameters
    ----------
    obj : pd.DataFrame or a sequence of items convertible to nested structures
        The object to be packed into nested pd.Series and added to
        the NestedFrame. If a DataFrame is passed, it must have non-unique
        index values, which are used to pack the DataFrame. If a sequence
        of elements is passed, it is packed into a nested pd.Series.
        Sequence elements may be individual pd.DataFrames, dictionaries
        (keys are nested column names, values are arrays of the same
        length), or any other object convertible to pa.StructArray.
        Additionally, None and pd.NA are allowed as elements to represent
        missing values.
    name : str
        The name of the nested column to be joined to the NestedFrame.
    how : {'left', 'right', 'outer', 'inner'}, default: 'left'
        Join strategy, passed on to ``pandas.DataFrame.join``:

        - left: use the calling frame's index (or the `on` column(s)).
        - right: use the packed object's index.
        - outer: form union of the calling frame's index (or the `on`
          column(s)) with the packed object's index, and sort it
          lexicographically.
        - inner: form intersection of the calling frame's index (or the
          `on` column(s)) with the packed object's index, preserving the
          order of the calling frame's.
    on : str or list of str, default: None
        Column(s) in the calling frame to join on instead of the index.
        The original index is always preserved. The column(s) are used
        only as join keys and are dropped from the nested structure.
    dtype : dtype or None
        NestedDtype to use for the nested column; pd.ArrowDtype or
        pa.DataType can also be used to specify the nested dtype. If None,
        the dtype is inferred from the input object.

    Returns
    -------
    NestedFrame
        A new NestedFrame with the joined nested column.

    Examples
    --------
    >>> import nested_pandas as npd
    >>> nf = npd.NestedFrame({"a": [1, 2, 3], "b": [4, 5, 6]},
    ...                      index=[0,1,2])
    >>> nf2 = npd.NestedFrame({"c":[1,2,3,4,5,6,7,8,9]},
    ...                       index=[0,0,0,1,1,1,2,2,2])
    >>> # By default, aligns on the index
    >>> nf.join_nested(nf2, "nested")  # doctest: +NORMALIZE_WHITESPACE
    a b nested
    0 1 4 [{c: 1}; …] (3 rows)
    1 2 5 [{c: 4}; …] (3 rows)
    2 3 6 [{c: 7}; …] (3 rows)
    >>> # We can also align on columns. The index is preserved.
    >>> nf = npd.NestedFrame({"a": [1,2,2,3], "b": [4,4,5,6]}).set_index(["a", "b"])
    >>> nf2 = npd.NestedFrame({"a": [1,2,2,2], "b": [4,4,4,5], "c": [1,2,3,4]})
    >>> nf.join_nested(nf2, "nested", on=["a", "b"]) # doctest: +NORMALIZE_WHITESPACE
    nested
    a b
    1 4 [{c: 1}]
    2 4 [{c: 2}; …] (2 rows)
    5 [{c: 4}]
    3 6 None
    """
    # Turn the input into a single nested series named `name`
    nested_series = pack(obj, name=name, on=on, dtype=dtype)
    joined = self.copy().join(nested_series, how=how, on=on)
    # In some cases (for example with empty dataframes) join hands back a plain
    # DataFrame, so convert back to NestedFrame when needed
    return joined if isinstance(joined, NestedFrame) else NestedFrame(joined)
[docs]
def nest_lists(self, columns: list[str], name: str) -> NestedFrame:
    """Pack the given list-valued columns of this frame into one nested column.

    The work is done by `NestedFrame.from_lists` on a copy of this frame, so the calling
    frame is left unchanged.

    Parameters
    ----------
    columns : list[str]
        The list-value columns that should be packed into a nested column.
        All columns in the list will attempt to be packed into a single
        nested column with the name provided in `name`.
    name : str
        The column name of the new nested column which we will pack the list-value
        columns into. This column will be added to the NestedFrame.

    Returns
    -------
    NestedFrame
        A new NestedFrame with the added nested columns

    Examples
    --------
    >>> import nested_pandas as npd
    >>> nf = npd.NestedFrame({"c":[1,2,3], "d":[2,4,6],
    ...                      "e":[[1,2,3], [4,5,6], [7,8,9]]},
    ...                      index=[0,1,2])
    >>> nf.nest_lists(columns=["e"], name="nested")  # doctest: +NORMALIZE_WHITESPACE
    c d nested
    0 1 2 [{e: 1}; …] (3 rows)
    1 2 4 [{e: 4}; …] (3 rows)
    2 3 6 [{e: 7}; …] (3 rows)
    """
    frame_copy = self.copy()
    return NestedFrame.from_lists(frame_copy, list_columns=columns, name=name)
[docs]
@classmethod
def from_flat(cls, df, base_columns, nested_columns=None, on: str | None = None, name="nested"):
    """Build a NestedFrame with base and nested columns from a flat dataframe.

    Base columns keep the first row seen for each index value; the nested columns are
    packed by index value and joined on with `join_nested`.

    Parameters
    ----------
    df: pd.DataFrame or NestedFrame
        A flat dataframe.
    base_columns: list-like
        The columns that should be used as base (flat) columns in the
        output dataframe.
    nested_columns: list-like, or None
        The columns that should be packed into a nested column. All columns
        in the list will attempt to be packed into a single nested column
        with the name provided in `name`. If None, is defined as all
        columns not in `base_columns`.
    on: str or None
        The name of a column to use as the new index. Typically, the index
        should have a unique value per row for base columns, and should
        repeat for nested columns. For example, a dataframe with two
        columns; a=[1,1,1,2,2,2] and b=[5,10,15,20,25,30] would want an
        index like [0,0,0,1,1,1] if a is chosen as a base column. If not
        provided the current index will be used.
    name:
        The name of the output column the `nested_columns` are packed into.

    Returns
    -------
    NestedFrame
        A NestedFrame with the specified nesting structure.

    Examples
    --------
    >>> import nested_pandas as npd
    >>> nf = npd.NestedFrame({"a":[1,1,1,2,2], "b":[2,2,2,4,4],
    ...                       "c":[1,2,3,4,5], "d":[2,4,6,8,10]},
    ...                       index=[0,0,0,1,1])
    >>> npd.NestedFrame.from_flat(nf, base_columns=["a","b"])  # doctest: +NORMALIZE_WHITESPACE
    a b nested
    0 1 2 [{c: 1, d: 2}; …] (3 rows)
    1 2 4 [{c: 4, d: 8}; …] (2 rows)
    """
    if on is not None:
        # `on` becomes the index, so it can no longer be selected as a base column
        base_columns = [col for col in base_columns if col != on] if on in base_columns else base_columns
        df = df.set_index(on)

    # Keep one base row per index value: the first occurrence
    first_occurrence = ~df.index.duplicated(keep="first")
    out_df = df[base_columns][first_occurrence]
    out_df = out_df if isinstance(out_df, NestedFrame) else NestedFrame(out_df)

    if nested_columns is None:
        nested_columns = [col for col in df.columns if col not in base_columns]
    return out_df.join_nested(df[nested_columns], name=name)
[docs]
@classmethod
def from_lists(cls, df, base_columns=None, list_columns=None, name="nested"):
    """Creates a NestedFrame with base and nested columns from a dataframe
    with list-valued columns.

    Parameters
    ----------
    df: pd.DataFrame or NestedFrame
        A dataframe with list columns.
    base_columns: list-like, or None
        Any columns that have non-list values in the input df. These will
        simply be kept as identical columns in the result
    list_columns: list-like, or None
        The list-value columns that should be packed into a nested column.
        All columns in the list will attempt to be packed into a single
        nested column with the name provided in `name`. If None, is
        defined as all columns not in `base_columns`.
    name:
        The name of the output column the `list_columns` are packed into.

    Returns
    -------
    NestedFrame
        A NestedFrame with the specified nesting structure. The result is always a
        NestedFrame, also when `df` is a plain ``pd.DataFrame``.

    Raises
    ------
    ValueError
        If no column ends up being a list column, or if the first value of a list
        column is not iterable.

    Examples
    --------
    >>> import nested_pandas as npd
    >>> nf = npd.NestedFrame({"c":[1,2,3], "d":[2,4,6],
    ...                      "e":[[1,2,3], [4,5,6], [7,8,9]]},
    ...                      index=[0,1,2])
    >>> npd.NestedFrame.from_lists(nf, base_columns=["c","d"])  # doctest: +NORMALIZE_WHITESPACE
    c d nested
    0 1 2 [{e: 1}; …] (3 rows)
    1 2 4 [{e: 4}; …] (3 rows)
    2 3 6 [{e: 7}; …] (3 rows)
    """
    # Resolve base and list columns
    if base_columns is None:
        if list_columns is None:
            # with no inputs, assume all columns are list-valued
            list_columns = df.columns
        else:
            # if list_columns are defined, assume everything else is base
            base_columns = [col for col in df.columns if col not in list_columns]
    elif list_columns is None:
        # with defined base_columns, assume everything else is list
        list_columns = [col for col in df.columns if col not in base_columns]

    if len(list_columns) == 0:
        raise ValueError("No columns were assigned as list columns.")

    # Pack list columns into a nested column
    if len(df) == 0:
        # if the dataframe is empty, just return an empty nested column
        # since there are no iterable values to pack.
        # Note: `packed` is a frame in this branch and a series in the other one.
        packed = NestedFrame().join_nested(df[list_columns], name=name)
        packed.index.name = df.index.name
    else:
        # Check that each column has iterable elements
        for col in list_columns:
            # Check if the column is iterable based on its first value.
            # This is a simple heuristic but infers more than its dtype
            # which will probably be an object.
            sample_val = df[col].iloc[0]
            # NOTE(review): str and bytes are iterable, so they pass this check as written;
            # confirm whether string-valued columns were meant to be rejected here.
            if not hasattr(sample_val, "__iter__") and not isinstance(sample_val, (str, bytes)):
                raise ValueError(
                    f"Cannot pack column {col} which does not contain an iterable list based "
                    f"on its first value, {sample_val}."
                )
        packed = pack_lists(df[list_columns])
        packed.name = name

    if base_columns is not None:
        # concat the nested column to the base_column df
        result = pd.concat([df[base_columns], packed], axis=1)
    elif isinstance(packed, pd.DataFrame):
        # empty input: the join above already produced a frame
        result = packed
    else:
        # no base columns: the packed series becomes the only column
        result = packed.to_frame()

    # concat follows the type of its inputs, so a plain DataFrame input needs converting
    return result if isinstance(result, NestedFrame) else NestedFrame(result)
[docs]
def drop(
    self,
    labels=None,
    *,
    axis=0,
    index=None,
    columns=None,
    level=None,
    inplace=False,
    errors="raise",
):
    """Drop specified labels from rows or columns.

    Remove rows or columns by specifying label names and corresponding
    axis, or by directly specifying index or column names. When using a
    multi-index, labels on different levels can be removed by
    specifying the level. See the `user guide <https://pandas.pydata.org/docs/user_guide
    /advanced.html#advanced-shown-levels>`_ for more information about
    the now unused levels.

    Column labels that name a field of a nested column ("nested.col1") remove that
    field from the nested column; all other labels are handled by pandas.

    Parameters
    ----------
    labels: single label or list-like
        Index or column labels to drop. A tuple will be used as a single
        label and not treated as a list-like. Nested sub-columns are
        accessed using dot notation (e.g. "nested.col1").
    axis: {0 or 'index', 1 or 'columns'}, default 0
        Whether to drop labels from the index (0 or 'index') or
        columns (1 or 'columns').
    index: single label or list-like
        Alternative to specifying axis (labels, axis=0 is equivalent to
        index=labels).
    columns: single label or list-like
        Alternative to specifying axis (labels, axis=1 is equivalent to
        columns=labels).
    level: int or level name, optional
        For MultiIndex, level from which the labels will be removed.
    inplace: bool, default False
        If False, return a copy. Otherwise, do operation in place and
        return None.
    errors: {'ignore', 'raise'}, default 'raise'
        If 'ignore', suppress error and only existing labels are dropped.

    Returns
    -------
    DataFrame or None
        DataFrame with the specified index or column labels removed,
        or None if inplace=True.

    Examples
    --------
    >>> from nested_pandas.datasets.generation import generate_data
    >>> nf = generate_data(5,5, seed=1)
    >>> # drop the "t" column from "nested"
    >>> nf = nf.drop(["nested.t"], axis=1)
    >>> nf  # doctest: +NORMALIZE_WHITESPACE
    a b nested
    0 0.417022 0.184677 [{flux: 31.551563, flux_error: 1.0, band: 'r'}...
    1 0.720324 0.372520 [{flux: 68.650093, flux_error: 1.0, band: 'g'}...
    2 0.000114 0.691121 [{flux: 83.462567, flux_error: 1.0, band: 'g'}...
    3 0.302333 0.793535 [{flux: 1.828828, flux_error: 1.0, band: 'g'};...
    4 0.146756 1.077633 [{flux: 75.014431, flux_error: 1.0, band: 'g'}...
    """
    column_axis = axis in (1, "columns")

    # Hand the call to pandas untouched when it is not a column drop, when there is nothing
    # to inspect, or when the arguments conflict (pandas then raises its usual error).
    conflicting = labels is not None and (columns is not None or (column_axis and index is not None))
    if conflicting or (columns is None and (not column_axis or labels is None)):
        return super().drop(
            labels=labels,
            axis=axis,
            index=index,
            columns=columns,
            level=level,
            inplace=inplace,
            errors=errors,
        )

    # label convergence: a single label (a str, another scalar or a tuple) becomes a 1-item list
    targets = labels if columns is None else columns
    if not is_list_like(targets) or isinstance(targets, tuple):
        targets = [targets]

    # Split the labels into nested fields (grouped by nested column) and everything else.
    # Paths are parsed the same way as in item access, so backticks and field names
    # containing dots are resolved consistently.
    nested_fields = defaultdict(list)
    base_labels = []
    for label in targets:
        components = self._parse_hierarchical_components(label) if isinstance(label, str) else None
        if components is not None and self._is_known_hierarchical_column(components):
            nested_fields[components[0]].append(".".join(components[1:]))
        else:
            base_labels.append(label)

    # drop targeted sub-columns for each nested column
    target = self
    for col, sub_cols in nested_fields.items():
        trimmed = target[col].nest.drop(sub_cols)
        if inplace:
            target[col] = trimmed
        else:
            target = target.assign(**{f"{col}": trimmed})

    # drop remaining base columns (and rows, when `index` was given together with `columns`)
    if base_labels or index is not None:
        return super(NestedFrame, target).drop(
            index=index,
            columns=base_labels,
            level=level,
            inplace=inplace,
            errors=errors,
        )
    return target if not inplace else None
def split(
    self,
    nested_col: str,
    by: str,
    values=None,
    drop_by_col: bool = False,
    drop_nested: bool = False,
) -> NestedFrame:
    """Split a nested column into several nested columns, one per value of a sub-column.

    For every value, the frame is queried for nested rows where ``by`` equals that value,
    and the matching part of `nested_col` is stored as ``{nested_col}_{value}``.

    Parameters
    ----------
    nested_col : str
        The name of the nested column to split.
    by : str
        The name of the sub-column within nested_col to split on.
    values : list or str or None, optional
        The specific values to split on. If None, all unique values are used.
        If a string is provided, it is iterated as a list of characters.
        Requested values without any match get a column filled with None.
    drop_by_col : bool, default False
        If True, the sub-column specified by `by` is dropped from each new
        nested column.
    drop_nested : bool, default False
        If True, the original nested column is dropped from the result.

    Returns
    -------
    NestedFrame
        A new NestedFrame with one new nested column per split value, named
        ``{nested_col}_{value}``.

    Raises
    ------
    ValueError
        If `nested_col` is not a nested column, or `by` is not one of its sub-columns.

    Examples
    --------
    >>> from nested_pandas.datasets.generation import generate_data
    >>> nf = generate_data(5, 5, seed=1)
    >>> nf.split("nested", by="band")[["a", "b", "nested_r"]] # doctest: +SKIP
    a b nested_r
    0 0.417022 0.184677 [{t: 8.38389, flux: 31.551563, band: 'r'}; …] (2 rows)
    1 0.720324 0.372520 [{t: 19.365232, flux: 90.85955, band: 'r'}; …] (2 rows)
    2 0.000114 0.691121 [{t: 11.173797, flux: 28.044399, band: 'r'}; …] (3 rows)
    3 0.302333 0.793535 [{t: 2.807739, flux: 78.927933, band: 'r'}]
    4 0.146756 1.077633 [{t: 17.527783, flux: 13.002857, band: 'r'}; …] (2 rows)
    """
    if nested_col not in self.nested_columns:
        raise ValueError(
            f"'{nested_col}' is not a nested column. Available nested columns: {self.nested_columns}"
        )
    if by not in self[nested_col].nest.columns:
        raise ValueError(
            f"'{by}' is not a sub-column of '{nested_col}'. "
            f"Available sub-columns: {list(self[nested_col].nest.columns)}"
        )

    flat_key = f"{nested_col}.{by}"
    has_values = values is not None
    split_values = list(values) if has_values else self[flat_key].unique()
    result = self.copy()

    if len(self) == 0:
        # Nothing to query: only create the explicitly requested (empty) columns
        if has_values:
            for val in split_values:
                result[f"{nested_col}_{val}"] = None
    else:
        # String values have to be quoted inside the query expression
        is_str = pd.api.types.is_string_dtype(self[flat_key])
        for val in split_values:
            literal = f"'{val}'" if is_str else val
            queried = self.query(f"{nested_col}.{by}=={literal}")
            if queried is not None and len(queried) > 0:
                filtered = queried[nested_col]
                if drop_by_col:
                    filtered = filtered.nest.drop(by)
                result[f"{nested_col}_{val}"] = filtered
            elif has_values:
                # A requested value without matches still gets its (empty) column
                result[f"{nested_col}_{val}"] = None

    if drop_nested:
        result = result.drop(labels=[nested_col], axis=1)
    return result
[docs]
def min(self, exclude_nest: bool = False, numeric_only: bool = False, **kwargs):
    """
    Return the column-wise minimum of base and nested columns as a Series.

    Nested sub-columns are reported with the name of their nested column as
    a prefix (``"<nested>.<sub-column>"``). Only column-wise reduction
    (axis=0) is the intended use; row-wise minima (axis=1) are not supported.

    Missing values (NaNs) are skipped by default. For non-numeric columns
    (e.g., strings) the lexicographically smallest value is returned when
    `numeric_only=False` (default).

    Parameters
    ----------
    exclude_nest : bool, default False
        If True, ignore the nested structure and compute the minimum over
        the base columns only.
    numeric_only : bool, default False
        Include only float, int, boolean columns.
    **kwargs
        Forwarded to :meth:`pandas.DataFrame.min`; see its documentation
        for the accepted keyword arguments.

    Returns
    -------
    pandas.Series

    Examples
    --------
    >>> from nested_pandas.datasets.generation import generate_data
    >>> nf = generate_data(5,5, seed=1)
    >>> nf_min = nf.min()
    >>> nf_min
    a                    0.000114
    b                    0.184677
    nested.t             0.547752
    nested.flux          1.828828
    nested.flux_error         1.0
    nested.band                 g
    dtype: object

    See Also
    --------
    :meth:`pandas.DataFrame.min`
    """
    nest_names = self.nested_columns
    if not nest_names:
        # Nothing nested: plain pandas behaviour.
        return super().min(numeric_only=numeric_only, **kwargs)

    # Reduce the base (non-nested) columns through the pandas implementation.
    base_names = [name for name in self.columns if name not in nest_names]
    base_part = super().__getitem__(base_names).min(numeric_only=numeric_only, **kwargs)
    if exclude_nest:
        return base_part

    # An empty base result is left out of the concatenation.
    pieces = [] if base_part.empty else [base_part]

    # Flatten every nested column, prefix its sub-columns and reduce it.
    for nest_name in nest_names:
        flat = self[nest_name].explode()
        flat.columns = [f"{nest_name}.{sub}" for sub in flat.columns]
        pieces.append(flat.min(numeric_only=numeric_only, **kwargs))

    return pd.concat(pieces)
[docs]
def max(self, exclude_nest: bool = False, numeric_only: bool = False, **kwargs):
    """
    Return the column-wise maximum of base and nested columns as a Series.

    Nested sub-columns are reported with the name of their nested column as
    a prefix (``"<nested>.<sub-column>"``). Only column-wise reduction
    (axis=0) is the intended use; row-wise maxima (axis=1) are not supported.

    Missing values (NaNs) are skipped by default. For non-numeric columns
    (e.g., strings) the lexicographically largest value is returned when
    `numeric_only=False` (default).

    Parameters
    ----------
    exclude_nest : bool, default False
        If True, ignore the nested structure and compute the maximum over
        the base columns only.
    numeric_only : bool, default False
        Include only float, int, boolean columns.
    **kwargs
        Forwarded to :meth:`pandas.DataFrame.max`; see its documentation
        for the accepted keyword arguments.

    Returns
    -------
    pandas.Series

    Examples
    --------
    >>> from nested_pandas.datasets.generation import generate_data
    >>> nf = generate_data(5,5, seed=1)
    >>> nf_max = nf.max()
    >>> nf_max
    a                     0.720324
    b                     1.077633
    nested.t             19.365232
    nested.flux          98.886109
    nested.flux_error          1.0
    nested.band                  r
    dtype: object

    See Also
    --------
    :meth:`pandas.DataFrame.max`
    """
    nest_names = self.nested_columns
    if not nest_names:
        # Nothing nested: plain pandas behaviour.
        return super().max(numeric_only=numeric_only, **kwargs)

    # Reduce the base (non-nested) columns through the pandas implementation.
    base_names = [name for name in self.columns if name not in nest_names]
    base_part = super().__getitem__(base_names).max(numeric_only=numeric_only, **kwargs)
    if exclude_nest:
        return base_part

    # An empty base result is left out of the concatenation.
    pieces = [] if base_part.empty else [base_part]

    # Flatten every nested column, prefix its sub-columns and reduce it.
    for nest_name in nest_names:
        flat = self[nest_name].explode()
        flat.columns = [f"{nest_name}.{sub}" for sub in flat.columns]
        pieces.append(flat.max(numeric_only=numeric_only, **kwargs))

    return pd.concat(pieces)
[docs]
def describe(self, exclude_nest: bool = False, percentiles=None, include=None, exclude=None):
    """
    Generate descriptive statistics, including nested columns with prefix to indicate the source.

    Descriptive statistics include those that summarize the central tendency,
    dispersion and shape of a dataset's distribution, excluding NaN values,
    similar to the behavior of `pandas.DataFrame.describe()`.

    Nested columns use `pyarrow` data types for efficiency, which are not always
    directly compatible with pandas' type-based filtering.

    - pyarrow strings are not viewed as object type.
    - numerical types from pyarrow (i.e., int, double) are still matched by pandas'
      `np.number`, so filtering with `include=[np.number]` will include numeric nested columns.

    Parameters
    ----------
    exclude_nest : bool, default False
        If set to True, will exclude the nested structure and
        only computes the statistics over the base columns
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. All should fall between 0 and 1.
        Defaults to [.25, .5, .75].
    include : 'all', list-like of dtypes or None (default), optional
        A white list of data types to include in the output.
    exclude : list-like of dtypes or None (default), optional
        A black list of data types to exclude from the output.

    Returns
    -------
    NestedFrame
        A NestedFrame with the summary statistics.

    Raises
    ------
    ValueError
        If no statistics can be generated from the columns.
        A combined error message will be given.

    Examples
    --------
    >>> from nested_pandas.datasets.generation import generate_data
    >>> nf = generate_data(5,5, seed=1)
    >>> nf_desc = nf.describe()
    >>> nf_desc
                  a         b   nested.t  nested.flux  nested.flux_error
    count  5.000000  5.000000       25.0         25.0               25.0
    mean   0.317310  0.623897  10.095623    45.252724                1.0
    std    0.274904  0.351880   6.434858    30.152261                0.0
    min    0.000114  0.184677   0.547752     1.828828                1.0
    25%    0.146756  0.372520    3.96203    21.162812                1.0
    50%    0.302333  0.691121  10.663306    44.789353                1.0
    75%    0.417022  0.793535  16.014891    69.975836                1.0
    max    0.720324  1.077633  19.365232    98.886109                1.0

    See Also
    --------
    :meth:`pandas.DataFrame.describe`
    """
    nest_names = self.nested_columns
    if not nest_names:
        # No nested structure: defer entirely to pandas.
        return NestedFrame(super().describe(percentiles=percentiles, include=include, exclude=exclude))

    summaries = []  # one describe() frame per successfully described layer
    failures = []  # human-readable reasons for layers that could not be described
    # Labels of every layer that was attempted; only used in the error message.
    attempted = ["_base"]

    # The base layer is handled on its own rather than through a sentinel
    # entry in a shared loop, so a nested column that happens to be named
    # "_base" is still described as a nested column.
    try:
        base_names = [name for name in self.columns if name not in nest_names]
        base_summary = (
            super()
            .__getitem__(base_names)
            .describe(
                percentiles=percentiles,
                include=include,
                exclude=exclude,
            )
        )
    except ValueError as err:
        # Raised by pandas when no column matches the dtype filter or
        # there are no base columns at all.
        failures.append(f"Base columns: {err}")
    else:
        summaries.append(base_summary)

    if not exclude_nest:
        attempted.extend(nest_names)
        for nest_name in nest_names:
            flat = self[nest_name].explode()
            flat.columns = [f"{nest_name}.{sub}" for sub in flat.columns]
            try:
                nested_summary = flat.describe(
                    percentiles=percentiles,
                    include=include,
                    exclude=exclude,
                )
            except ValueError as err:
                # No sub-column of this nested column matched the dtype filter.
                failures.append(f"Nested column '{nest_name}': {err}")
            else:
                summaries.append(nested_summary)

    if not summaries:
        raise ValueError(f"All columns in {attempted} failed.\n" + "\n".join(failures))

    if include is None and exclude is None:
        # Mimic the pandas default: prefer numeric summaries and drop the
        # others, unless nothing numeric is available at all.
        numeric_summaries = [summary.select_dtypes(include=[np.number]) for summary in summaries]
        numeric_summaries = [summary for summary in numeric_summaries if not summary.empty]
        if numeric_summaries:
            summaries = numeric_summaries

    return NestedFrame(pd.concat(summaries, axis=1))
[docs]
def explode(self, column: IndexLabel, ignore_index: bool = False):
    """
    Transform each element of a list-like or nested column to a row, replicating index values.

    Base (list-like) columns are exploded with :meth:`pandas.DataFrame.explode`;
    nested columns are flattened so that each of their sub-columns becomes
    a regular column of the result. Base-only requests are delegated to
    pandas directly.

    Parameters
    ----------
    column : str or list of str
        Column(s) to explode. Must be a single column name or a non-empty
        list of unique column names; any other type raises ``ValueError``.
        When several columns are given, their list-like / nested data on
        the same row of the frame must have matching lengths.
    ignore_index : bool, default False
        If True, the resulting index will be labeled 0, 1, ..., n - 1.

    Returns
    -------
    NestedFrame
        Exploded lists and nested rows as rows of the requested columns;
        index will be duplicated for these rows.

    Raises
    ------
    ValueError
        It raises if:

        1) `column` is neither a str nor a list,
        2) specified columns to explode is an empty list or has duplicates,
        3) a specified column is not present in the frame,
        4) specified columns to explode do not have matching counts of
           elements rowwise in the frame.

    See Also
    --------
    :meth:`pandas.DataFrame.explode`

    Examples
    --------
    >>> from nested_pandas.datasets.generation import generate_data
    >>> nf = generate_data(3,3, seed=1)
    >>> nf_explode = nf.explode(column="nested")
    >>> nf_explode
              a         b          t       flux  flux_error  band
    0  0.417022  0.604665   3.725204  67.046751         1.0     g
    0  0.417022  0.604665  10.776335  14.038694         1.0     g
    0  0.417022  0.604665   4.089045  96.826158         1.0     g
    1  0.720324  0.293512   6.911215   41.73048         1.0     r
    1  0.720324  0.293512    8.38389  19.810149         1.0     r
    1  0.720324  0.293512  17.562349  31.342418         1.0     g
    2  0.000114  0.184677   7.935349  55.868983         1.0     r
    2  0.000114  0.184677   13.70439  80.074457         1.0     r
    2  0.000114  0.184677   0.547752  69.232262         1.0     g
    """
    # Normalize `column` to a validated list of column names.
    if isinstance(column, str):
        columns = [column]
    elif isinstance(column, list):
        columns = column
        if len(columns) == 0:
            raise ValueError("`column` must not be empty")
        if len(set(columns)) != len(columns):
            raise ValueError("`column` must have unique elements")
    else:
        raise ValueError("`column` must be str or list")
    # Reject names that are not columns of this frame.
    if len(extra_cols := set(columns) - set(self.columns)) > 0:
        if len(extra_cols) == 1:
            raise ValueError(
                f"column {extra_cols.pop()} not found, available columns: {list(self.columns)}"
            )
        raise ValueError(
            f"columns {sorted(extra_cols)} not found, available columns: {list(self.columns)}"
        )
    # Split the request into nested columns and regular (base) columns.
    nested_columns = [col for col in columns if col in self.nested_columns]
    base_columns = [col for col in columns if col not in nested_columns]
    # Shortcut for the base-column-only case
    if len(nested_columns) == 0:
        return NestedFrame(super().explode(columns, ignore_index=ignore_index))
    # Handle duplicated index use-case: use "ordinal" index, but keep the original one as a column to
    # restore it later.
    default_index_name = "__index_"
    index_col_name = self.index.name or default_index_name
    w_ordinal_idx = self.reset_index(drop=False, names=index_col_name)
    # Call pandas.DataFrame.explode for non-nested columns
    all_but_requested_nested_columns = [col for col in w_ordinal_idx.columns if col not in nested_columns]
    base_exploded = w_ordinal_idx[all_but_requested_nested_columns]
    if len(all_but_requested_nested_columns) > 0 and len(base_columns) > 0:
        base_exploded = super(NestedFrame, base_exploded).explode(base_columns, ignore_index=False)
        base_exploded = NestedFrame(base_exploded)
    # Check if it was actually exploded, or no list-columns were there.
    # This could fail in the case when all lists had one element only, we ignore that edge-case here.
    is_base_exploded = not w_ordinal_idx.index.equals(base_exploded.index)
    # Unnest each requested nested column and store as a "flat" dataframe.
    flat_frames: list[Self] = []  # type: ignore[name-defined] # noqa: F821
    for nested_col in nested_columns:
        # Check if counts (lengths) in nested columns mismatch
        # (every nested column after the first is compared against the first one).
        if len(flat_frames) > 0 and np.any(
            w_ordinal_idx[nested_col].nest.len() != w_ordinal_idx[nested_columns[0]].nest.len()
        ):
            raise ValueError(
                f"One or few rows of {nested_col} have different element counts from {nested_columns[0]}"
            )
        flat = w_ordinal_idx[nested_col].explode()
        # Check if counts (lengths) of this nested column mismatch with one of the list columns.
        if is_base_exploded and not base_exploded.index.equals(flat.index):
            raise ValueError(
                f"One or few rows of {nested_col} have different element counts "
                f"from one or few of these columns: {base_columns}"
            )
        flat_frames.append(flat)
    if is_base_exploded:
        # Indexes were verified to be equal above, so the pieces are combined column-wise.
        result = pd.concat([base_exploded] + flat_frames, axis=1)
    else:
        # Join works here, because we used the ordinal index before exploding
        result = base_exploded.join(pd.concat(flat_frames, axis=1))
    if ignore_index:
        # The saved original index is not needed: drop it and relabel rows 0..n-1.
        return result.drop(index_col_name, axis=1).reset_index(drop=True)
    # Restore original index
    result = result.set_index(index_col_name, drop=True)
    # The placeholder name only stood in for an unnamed index.
    if result.index.name == default_index_name:
        result.index.name = None
    return result
[docs]
def fillna(
    self,
    value: Hashable | Mapping | pd.Series | pd.DataFrame | None = None,
    *,
    axis: Axis | None = None,
    inplace: bool = False,
    limit: int | None = None,
) -> NestedFrame | None:
    """
    Fill NA/NaN values using the specified method for base and nested columns.

    Parameters
    ----------
    value : scalar, dict, Series, or DataFrame
        Value to use to fill holes (e.g. 0), alternately a
        dict/Series/DataFrame of values specifying which value to use for
        each column. Values not in the dict/Series/DataFrame will not be filled.
        This value cannot be a list.
        When a mapping is given, nested sub-columns are addressed with
        keys of the form ``"<nested column>.<sub-column>"``; keys that are
        not strings can only refer to base columns.
    axis : {axes_single_arg}, default None
        Axis along which to fill missing values.
    inplace : bool, default False
        If True, fill in-place. Note: this will modify any
        other views on this object (e.g., a no-copy slice for a column in a
        NestedFrame).
    limit : int, default None
        The maximum number of entries along the entire axis where NaNs will be
        filled. Must be greater than 0 if not None. Currently, limit on nested
        columns is not supported, meaning that all Nans will be filled (if there
        is a value specified) regardless of the input.

    Returns
    -------
    NestedFrame or None
        NestedFrame with missing values filled or None if ``inplace=True``.

    See Also
    --------
    :meth:`pandas.DataFrame.fillna`

    Examples
    --------
    >>> import nested_pandas as npd
    >>> nf = npd.NestedFrame(
    ...     data={"a": [np.nan, 20, np.nan], "b": [np.nan, np.nan, 30], "c": [10, np.nan, np.nan]},
    ...     index=[0, 1, 2]
    ... )
    >>> nested = pd.DataFrame(
    ...     data={"d": [np.nan, np.nan, np.nan], "e": [np.nan, 1, np.nan]},
    ...     index=[0, 1, 2]
    ... )
    >>> nf = nf.join_nested(nested, "nested")
    >>> nf.fillna(0)
          a     b     c              nested
    0   0.0   0.0  10.0  [{d: 0.0, e: 0.0}]
    1  20.0   0.0   0.0  [{d: 0.0, e: 1.0}]
    2   0.0  30.0   0.0  [{d: 0.0, e: 0.0}]
    """
    nest_names = self.nested_columns
    if not nest_names:
        return super().fillna(value=value, axis=axis, inplace=inplace, limit=limit)

    # Fill the base columns first; the nested columns are re-attached below.
    base_cols = [col for col in self.columns if col not in nest_names]
    filled_df = super().__getitem__(base_cols).fillna(value=value, axis=axis, inplace=False, limit=limit)

    for nest_col in nest_names:
        nested_df = self[nest_col].explode()
        nested_value: Any
        if isinstance(value, Mapping):
            # Keep only the entries addressed to this nested column and
            # strip the "<nest_col>." prefix. Non-string keys (e.g. integer
            # column labels) cannot name a nested sub-column and are skipped
            # instead of failing on `.startswith`.
            prefix = f"{nest_col}."
            nested_value = {
                key[len(prefix) :]: fill
                for key, fill in value.items()
                if isinstance(key, str) and key.startswith(prefix)
            }
        else:
            nested_value = value
        # `limit` is deliberately not forwarded: it is unsupported for nested columns.
        nested_df = nested_df.fillna(value=nested_value, axis=axis, inplace=False, limit=None)
        filled_df = filled_df.join_nested(nested_df, nest_col)

    if inplace:
        self._update_inplace(filled_df)
        return None
    return filled_df
[docs]
def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
    """Evaluate a string describing operations on NestedFrame columns.

    Operates on columns only, not specific rows or elements. This allows
    `eval` to run arbitrary code, which can make you vulnerable to code
    injection if you pass user input to this function.

    Works the same way as `pd.DataFrame.eval`, except that this method
    will also automatically unpack nested columns into NestedSeries,
    and the resulting expression will have the dimensions of the unpacked
    series.

    Parameters
    ----------
    expr : str
        The expression string to evaluate.
    inplace : bool, default False
        If the expression contains an assignment, whether to perform the
        operation inplace and mutate the existing NestedFrame. Otherwise,
        a new NestedFrame is returned.
    **kwargs
        See the documentation for :meth:`pandas.DataFrame.eval` for
        complete details on the keyword arguments accepted by :meth:`eval`.
        The ``parser`` keyword is always overridden with ``"nested-pandas"``
        and a nest resolver is appended to any ``resolvers`` given.

    Returns
    -------
    ndarray, scalar, pandas object, nested-pandas object, or None
        The result of the evaluation or None if ``inplace=True``.

    See Also
    --------
    :meth:`pandas.DataFrame.eval`
    """
    _, aliases = _identify_aliases(expr)
    # While set, `_aliases` marks that an evaluation is in progress.
    self._aliases: dict[str, str] | None = aliases
    # Keep a handle on our own resolver: user-supplied resolvers come first
    # in the tuple, so position 0 is not guaranteed to be the nest resolver.
    nest_resolver = _NestResolver(self)
    kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + (nest_resolver,)
    kwargs["inplace"] = inplace
    kwargs["parser"] = "nested-pandas"
    try:
        answer = super().eval(expr, **kwargs)
        # If the result is a _SeriesFromNest, set the metadata manually.
        # This is a bit of a hack, as it's a backstop for super().eval()
        # not propagating the metadata correctly, for some reason.
        # Furthermore, it relies on the assumption that the first nest
        # touched by the resolver is the only one that matters. Because we
        # disallow multi-layer queries, this is potentially safe, though eval
        # statements that target multiple nests may have strange behavior.
        if isinstance(answer, _SeriesFromNest) and not hasattr(answer, "nest_name"):
            nest_key = list(nest_resolver.keys())[0]
            answer.nest_name = nest_resolver[nest_key]._nest_name
            answer.flat_nest = nest_resolver[nest_key]._flat_nest
    finally:
        # Always leave "evaluation in progress" state, even when the
        # expression fails, so a failed eval cannot leak stale aliases.
        self._aliases = None
    return answer
def extract_nest_names(
    self,
    expr: str,
    local_dict=None,
    global_dict=None,
    resolvers=(),
    level: int = 0,
    target=None,
    **kwargs,
) -> set[str]:
    """
    Given a string expression, parse it and visit the resulting expression tree,
    surfacing the nesting types. The purpose is to identify expressions that attempt
    to mix base and nested columns, or columns from two different nests.

    Parameters
    ----------
    expr : str
        The expression to parse (it is parsed, not evaluated).
    local_dict, global_dict : dict or None, optional
        Variable scopes handed to the expression environment.
    resolvers : iterable of dict-like, optional
        Extra resolvers placed ahead of the nest, column and index
        resolvers of this frame. Any iterable is accepted (list or tuple),
        matching what :meth:`eval` accepts.
    level : int, default 0
        Number of prior stack frames to traverse when building the scope;
        one is added for this method's own frame.
    target : object, optional
        Assignment target handed to the expression environment.
    **kwargs
        Accepted and ignored, so callers can forward the keyword arguments
        they also pass to :meth:`eval`.

    Returns
    -------
    set of str
        The keys produced by ``_subexprs_by_nest`` for the parsed
        expression. :meth:`query` rejects expressions for which this set
        has more than one entry.
    """
    index_resolvers = self._get_index_resolvers()
    column_resolvers = self._get_cleaned_column_resolvers()
    # Coerce to a tuple first: pandas documents `resolvers` as a list of
    # dict-likes, and list + tuple concatenation would raise TypeError.
    resolvers = tuple(resolvers) + (_NestResolver(self), column_resolvers, index_resolvers)
    # Parser needs to be the "nested-pandas" parser.
    # We also need the same variable context that eval() will have, so that
    # backtick-quoted names are substituted as expected.
    env = ensure_scope(
        level + 1,
        global_dict=global_dict,
        local_dict=local_dict,
        resolvers=resolvers,
        target=target,
    )
    parsed_expr = Expr(expr, parser="nested-pandas", env=env)
    expr_tree = parsed_expr.terms
    separable = _subexprs_by_nest([], expr_tree)
    return set(separable.keys())
[docs]
def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame | None:
    """Query the columns of a NestedFrame with a boolean expression. Specified
    queries can target nested columns in addition to the typical column set

    Parameters
    ----------
    expr : str
        The query string to evaluate.

        Access nested columns using `nested_df.nested_col` (where
        `nested_df` refers to a particular nested dataframe and
        `nested_col` is a column of that nested dataframe).

        You can refer to variables
        in the environment by prefixing them with an '@' character like
        ``@a + b``.

        You can refer to column names that are not valid Python variable names
        by surrounding them in backticks. Thus, column names containing spaces
        or punctuations (besides underscores) or starting with digits must be
        surrounded by backticks. (For example, a column named "Area (cm^2)" would
        be referenced as ```Area (cm^2)```). Column names which are Python keywords
        (like "list", "for", "import", etc) cannot be used.

        For example, if one of your columns is called ``a a`` and you want
        to sum it with ``b``, your query should be ```a a` + b``.
    inplace : bool
        Whether to modify the DataFrame rather than creating a new one.
    **kwargs
        See the documentation for :meth:`pandas.DataFrame.query`
        for complete details on the keyword arguments accepted by
        :meth:`query`.

    Returns
    -------
    NestedFrame or None
        NestedFrame resulting from the provided query expression, or None
        if ``inplace=True``.

    Examples
    --------
    >>> from nested_pandas.datasets.generation import generate_data
    >>> nf = generate_data(5,5, seed=1)
    >>> nf = nf.query("nested.t > 10")
    >>> nf
              a         b                                             nested
    0  0.417022  0.184677  [{t: 13.40935, flux: 98.886109, flux_error: 1....
    1  0.720324  0.372520  [{t: 13.70439, flux: 68.650093, flux_error: 1....
    2  0.000114  0.691121  [{t: 11.173797, flux: 28.044399, flux_error: 1...
    3  0.302333  0.793535  [{t: 17.562349, flux: 1.828828, flux_error: 1....
    4  0.146756  1.077633  [{t: 17.527783, flux: 13.002857, flux_error: 1...

    Most of the Series and NestedSeries attributes and methods are available
    through the query interface. For example, to query based on the length
    of the nested frames, you can do:

    >>> nf = nf.query("nested.len() > 2")
    >>> nf
              a         b                                             nested
    0  0.417022  0.184677  [{t: 13.40935, flux: 98.886109, flux_error: 1....
    3  0.302333  0.793535  [{t: 17.562349, flux: 1.828828, flux_error: 1....
    4  0.146756  1.077633  [{t: 17.527783, flux: 13.002857, flux_error: 1...

    See Also
    --------
    :meth:`pandas.DataFrame.query`

    Notes
    -----
    Queries that target a particular nested structure return a dataframe
    with rows of that particular nested structure filtered. For example,
    querying the NestedFrame "df" with nested structure "my_nested" as
    below will return all rows of df, but with mynested filtered by the
    condition: `nf.query("mynested.a > 2")`
    """
    if not isinstance(expr, str):
        raise ValueError(f"expr must be a string to be evaluated, {type(expr)} given")

    # Account for this method's own stack frame when resolving variables.
    kwargs["level"] = kwargs.pop("level", 0) + 1
    kwargs["target"] = None

    # The expression has to live entirely inside one nest, or entirely in
    # the base columns; mixed expressions are rejected before evaluation.
    targeted_layers = self.extract_nest_names(expr, **kwargs)
    if len(targeted_layers) > 1:
        raise ValueError("Queries cannot target multiple structs/layers, write a separate query for each")

    mask = self.eval(expr, **kwargs)

    if not isinstance(mask, _SeriesFromNest):
        # Only base columns were involved: ordinary row selection.
        filtered = self.loc[mask]
    else:
        # A nested attribute was referenced, so evaluation unpacked the nest.
        # Filter the flat rows and pack them back into the nested column.
        nest_name, flat_rows = mask.nest_name, mask.flat_nest
        # "Ordinal" index like [0, 0, 0, 1, 1, 2, 2, 2]
        ordinal_index = self[nest_name].array.get_list_index()
        flat_rows = flat_rows.set_index(ordinal_index)
        flat_mask = mask.set_axis(ordinal_index)
        kept_rows = flat_rows[flat_mask]
        filtered = self._set_filtered_flat_df(nest_name, kept_rows)

    if not inplace:
        return filtered
    self._update_inplace(filtered)
    return None
def _set_filtered_flat_df(self, nest_name, flat_df):
    """Replace a nested column with the packed contents of a filtered flat frame.

    `flat_df` is expected to carry a filtered "ordinal" index, e.g.
    ``flat_df.index == [0, 2, 2, 2]``, whereas ``self.index`` may be
    arbitrary (e.g. ``["a", "b", "a"]``) and
    ``self[nest_name].array.list_index`` is
    ``[0, 0, 1, 1, 1, 2, 2, 2, 2]``.

    The frame is therefore switched to a positional index for the
    assignment and the original index is put back on the returned copy.
    """
    original_index = self.index
    repacked = self.reset_index(drop=True)
    repacked[nest_name] = pack_sorted_df_into_struct(flat_df, name=nest_name)
    return repacked.set_index(original_index)
def _resolve_dropna_target(self, on_nested, subset):
    """Resolve the target layer for a given set of dropna kwargs.

    Parameters
    ----------
    on_nested : str or bool
        Name of the nested column the call should be applied to, or a
        falsy value when no nested column was requested explicitly.
    subset : column label, sequence of labels, or None
        The `subset` argument of `dropna`. String labels containing a "."
        are read as ``"<nested column>.<sub-column>"``; every other label
        (including non-string labels) refers to the base layer.

    Returns
    -------
    target : str
        ``"base"`` or the name of the targeted nested column.
    subset : list-like or None
        The input `subset`, with a single label wrapped in a list.

    Raises
    ------
    ValueError
        If a referenced layer is not a nested column, if `subset` spans
        more than one layer, or if `on_nested` and `subset` disagree.
    """
    nested_cols = self.nested_columns

    # first check the subset kwarg input
    subset_target = None
    if subset is not None:
        # A single label (string or any other scalar) is treated as a
        # one-element subset; `is not None` is used rather than truthiness
        # so array-like subsets do not hit an ambiguous bool() evaluation.
        if isinstance(subset, str) or not is_list_like(subset):
            subset = [subset]

        layers = []
        for col in subset:
            # Without a ".", always assume base layer. Non-string labels
            # cannot encode a nested reference, so they are base as well.
            if not isinstance(col, str) or "." not in col:
                layers.append("base")
                continue
            layer, _ = col.split(".")
            if layer not in nested_cols:
                raise ValueError(f"layer '{layer}' not found in the base columns")
            layers.append(layer)

        if layers:
            # Check for 1 target
            unique_layers = np.unique(layers)
            if len(unique_layers) > 1:  # prohibit multi-target operations
                raise ValueError(
                    f"Targeted multiple nested structures ({unique_layers}), write one command per target dataframe"  # noqa
                )
            subset_target = str(unique_layers[0])

    # Next check the on_nested kwarg input
    if on_nested and on_nested not in nested_cols:
        raise ValueError("Provided nested layer not found in nested dataframes")

    # Resolve target layer
    target = "base"
    if on_nested and subset_target:
        if on_nested != subset_target:
            raise ValueError(
                f"Provided on_nested={on_nested}, but subset columns are from {subset_target}. Make sure these are aligned or just use subset."  # noqa
            )
        target = subset_target
    elif on_nested:
        target = str(on_nested)
    elif subset_target:
        target = subset_target
    return target, subset
[docs]
def dropna(
    self,
    *,
    axis: Axis = 0,
    how: AnyAll | lib.NoDefault = no_default,
    thresh: int | lib.NoDefault = no_default,
    on_nested: bool = False,
    subset: IndexLabel | None = None,
    inplace: bool = False,
    ignore_index: bool = False,
) -> NestedFrame | None:
    """
    Remove missing values for one layer of the NestedFrame.

    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Determine if rows or columns which contain missing values are
        removed.

        * 0, or 'index' : Drop rows which contain missing values.
        * 1, or 'columns' : Drop columns which contain missing value.

        Only a single axis is allowed.
    how : {'any', 'all'}, default 'any'
        Determine if row or column is removed from DataFrame, when we have
        at least one NA or all NA.

        * 'any' : If any NA values are present, drop that row or column.
        * 'all' : If all values are NA, drop that row or column.
    thresh : int, optional
        Require that many non-NA values. Cannot be combined with how.
    on_nested : str or bool, optional
        If not False, applies the call to the nested dataframe in the
        column with label equal to the provided string. If specified,
        the nested dataframe should align with any columns given in
        `subset`.
    subset : column label or sequence of labels, optional
        Labels along other axis to consider, e.g. if you are dropping rows
        these would be a list of columns to include.

        Access nested columns using `nested_df.nested_col` (where
        `nested_df` refers to a particular nested dataframe and
        `nested_col` is a column of that nested dataframe).
    inplace : bool, default False
        Whether to modify the DataFrame rather than creating a new one.
    ignore_index : bool, default ``False``
        If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
        Not supported when a nested layer is targeted.

        .. versionadded:: 2.0.0

    Returns
    -------
    DataFrame or None
        DataFrame with NA entries dropped from it or None if ``inplace=True``.

    Raises
    ------
    ValueError
        If ``ignore_index=True`` is combined with a nested target.

    Examples
    --------
    A common usecase for `dropna` is to remove empty nested rows:

    >>> from nested_pandas.datasets.generation import generate_data
    >>> nf = generate_data(5,5, seed=1)
    >>> # this query empties several of the nested dataframes
    >>> nf = nf.query("nested.t > 19")
    >>> nf
              a         b                                             nested
    0  0.417022  0.184677                                               None
    1  0.720324  0.372520  [{t: 19.365232, flux: 90.85955, flux_error: 1....
    2  0.000114  0.691121  [{t: 19.157791, flux: 14.672857, flux_error: 1...
    3  0.302333  0.793535                                               None
    4  0.146756  1.077633                                               None

    >>> # dropna removes rows with those emptied dataframes
    >>> nf.dropna(subset="nested")
              a         b                                             nested
    1  0.720324  0.372520  [{t: 19.365232, flux: 90.85955, flux_error: 1....
    2  0.000114  0.691121  [{t: 19.157791, flux: 14.672857, flux_error: 1...

    `dropna` can also be used on nested columns:

    >>> nf = generate_data(5,5, seed=1)
    >>> # Either on the whole dataframe
    >>> nf.dropna(on_nested="nested")
              a         b                                             nested
    0  0.417022  0.184677  [{t: 8.38389, flux: 31.551563, flux_error: 1.0...
    1  0.720324  0.372520  [{t: 13.70439, flux: 68.650093, flux_error: 1....
    2  0.000114  0.691121  [{t: 4.089045, flux: 83.462567, flux_error: 1....
    3  0.302333  0.793535  [{t: 17.562349, flux: 1.828828, flux_error: 1....
    4  0.146756  1.077633  [{t: 0.547752, flux: 75.014431, flux_error: 1....
    >>> # or on a specific nested column
    >>> nf.dropna(subset="nested.t")
              a         b                                             nested
    0  0.417022  0.184677  [{t: 8.38389, flux: 31.551563, flux_error: 1.0...
    1  0.720324  0.372520  [{t: 13.70439, flux: 68.650093, flux_error: 1....
    2  0.000114  0.691121  [{t: 4.089045, flux: 83.462567, flux_error: 1....
    3  0.302333  0.793535  [{t: 17.562349, flux: 1.828828, flux_error: 1....
    4  0.146756  1.077633  [{t: 0.547752, flux: 75.014431, flux_error: 1....

    Notes
    -----
    Operations that target a particular nested structure return a dataframe
    with rows of that particular nested structure affected.

    Values for `on_nested` and `subset` should be consistent in pointing
    to a single layer, multi-layer operations are not supported.
    """
    # Work out which layer (base or one nested column) is addressed.
    layer, subset = self._resolve_dropna_target(on_nested, subset)

    if layer == "base":
        return super().dropna(
            axis=axis,
            how=how,
            thresh=thresh,
            subset=subset,
            inplace=inplace,
            ignore_index=ignore_index,
        )

    if ignore_index:
        raise ValueError("ignore_index is not supported for nested columns")

    # Inside the flat frame the sub-columns carry no "<nested>." prefix.
    if subset is not None:
        subset = [label.split(".")[-1] for label in subset]

    # Flatten the nested column onto the ordinal index and drop there. The
    # flat frame is a temporary, so a single non-inplace call covers both
    # the inplace and the copying variant of this method.
    flat = self[layer].explode()
    flat = flat.set_index(self[layer].array.get_list_index())
    flat = flat.dropna(
        axis=axis,
        how=how,
        thresh=thresh,
        subset=subset,
        inplace=False,
    )

    repacked = self._set_filtered_flat_df(nest_name=layer, flat_df=flat)
    if not inplace:
        return repacked
    self._update_inplace(repacked)
    return None
[docs]
def sort_values(
    self,
    by,
    *,
    axis=0,
    ascending=True,
    inplace=False,
    kind="quicksort",
    na_position="last",
    ignore_index=False,
    key=None,
):
    """
    Sort by the values along either axis.

    Parameters
    ----------
    by : str or list of str
        Name or list of names to sort by.

        Access nested columns using `nested_df.nested_col` (where
        `nested_df` refers to a particular nested dataframe and
        `nested_col` is a column of that nested dataframe).
        All names must refer to the same layer (the base columns or a
        single nested column).
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Axis to be sorted.
    ascending : bool or list of bool, default True
        Sort ascending vs. descending. Specify list for multiple sort
        orders. If this is a list of bools, must match the length of the
        by.
    inplace : bool, default False
        If True, perform operation in-place.
    kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'
        Choice of sorting algorithm. See also ndarray.np.sort for more
        information. mergesort is the only stable algorithm. For DataFrames,
        this option is only applied when sorting on a single column or label.
    na_position : {'first', 'last'}, default 'last'
        Puts NaNs at the beginning if first; last puts NaNs at the end.
    ignore_index : bool, default False
        If True, the resulting axis will be labeled 0, 1, …, n - 1.
        Always False when applied to nested layers.
    key : callable, optional
        Apply the key function to the values before sorting.

    Returns
    -------
    DataFrame or None
        DataFrame with sorted values if inplace=False, None otherwise.

    Raises
    ------
    ValueError
        If the names in `by` refer to more than one layer.

    Examples
    --------
    >>> from nested_pandas.datasets.generation import generate_data
    >>> nf = generate_data(5,5, seed=1)
    >>> # Sort nested values
    >>> nf.sort_values(by="nested.band")
              a         b                                             nested
    0  0.417022  0.184677  [{t: 13.40935, flux: 98.886109, flux_error: 1....
    1  0.720324  0.372520  [{t: 13.70439, flux: 68.650093, flux_error: 1....
    2  0.000114  0.691121  [{t: 4.089045, flux: 83.462567, flux_error: 1....
    3  0.302333  0.793535  [{t: 17.562349, flux: 1.828828, flux_error: 1....
    4  0.146756  1.077633  [{t: 0.547752, flux: 75.014431, flux_error: 1....
    """
    if isinstance(by, str):
        by = [by]

    # Resolve the layer every "by" entry points at: the name of a nested
    # column for hierarchical references, "base" otherwise.
    layers = {
        col.split(".")[0] if self._is_known_hierarchical_column(col) else "base" for col in by
    }
    # Ensure one target layer, preventing multi-layer operations
    if len(layers) > 1:
        raise ValueError("Queries cannot target multiple structs/layers, write a separate query for each")
    # An empty `by` names no layer at all; let pandas decide what to do
    # with it instead of failing on an empty lookup.
    target = str(layers.pop()) if layers else "base"

    # Apply pandas sort_values
    if target == "base":
        return super().sort_values(
            by=by,
            axis=axis,
            ascending=ascending,
            inplace=inplace,
            kind=kind,
            na_position=na_position,
            ignore_index=ignore_index,
            key=key,
        )

    # target is a nested column: sort its flat representation
    target_flat = self[target].explode()
    target_flat = target_flat.set_index(self[target].array.get_list_index())
    if target_flat.index.name is None:  # set name if not present
        target_flat.index.name = "index"

    # Index must always be the first sort key for nested columns, so that
    # rows stay grouped by the top-level row they belong to.
    nested_by = [target_flat.index.name] + [col.split(".")[-1] for col in by]

    # Augment the ascending kwarg to include the index. Any list-like of
    # bools (list, tuple, ...) is extended, not only `list`, so its length
    # keeps matching `nested_by`.
    if isinstance(ascending, bool):
        ascending = [True] + [ascending] * len(by)
    elif is_list_like(ascending):
        ascending = [True, *ascending]

    target_flat = target_flat.sort_values(
        by=nested_by,
        axis=axis,
        ascending=ascending,
        kind=kind,
        na_position=na_position,
        ignore_index=False,
        key=key,
        inplace=False,
    )

    # Could be optimized, as number of rows doesn't change
    new_df = self._set_filtered_flat_df(nest_name=target, flat_df=target_flat)
    if inplace:
        self._update_inplace(new_df)
        return None
    return new_df
@deprecated(
    version="0.6.0",
    reason="`reduce` will be removed in version 0.7.0, use `map_rows` instead.",
)
def reduce(self, func, *args, infer_nesting=True, append_columns=False, **kwargs) -> NestedFrame:  # type: ignore[override]
    """
    Takes a function and applies it to each top-level row of the NestedFrame.

    The user may specify which columns the function is applied to, with
    columns from the 'base' layer being passed to the function as
    scalars and columns from the nested layers being passed as numpy arrays.

    Parameters
    ----------
    func : callable
        Function to apply to each row. It is called once per row as
        ``func(*column_values, **kwargs)``, where the column values are
        given in the order the columns are listed in `args`. See the Notes
        for recommendations on writing func outputs.
    args : positional arguments
        String column names to pull from the NestedFrame to pass along
        to the function. At least one column is required. If the function
        has additional arguments, pass them as keyword arguments
        (e.g. `arg_name=value`).
    infer_nesting : bool, default True
        If True, the function will pack output columns into nested
        structures based on column names adhering to a nested naming
        scheme. E.g. "nested.b" and "nested.c" will be packed into a column
        called "nested" with columns "b" and "c". If False, all outputs
        will be returned as base columns.
    append_columns : bool, default False
        If True, the output columns are appended to those in the original
        NestedFrame.
    kwargs : keyword arguments, optional
        Keyword arguments to pass to the function.

    Returns
    -------
    `NestedFrame`
        `NestedFrame` with the results of the function applied to the columns of the frame.

    Raises
    ------
    TypeError
        If any positional argument after `func` is not a string.
    ValueError
        If a positional argument does not name a known column, or if no
        columns are given at all.

    Examples
    --------
    >>> from nested_pandas.datasets.generation import generate_data
    >>> import numpy as np
    >>> nf = generate_data(5,5, seed=1)
    >>>
    >>> # define a custom user function
    >>> # reduce will return a NestedFrame with two columns
    >>> def example_func(base_col, nested_col):
    ...     return {
    ...         "mean": np.mean(nested_col),
    ...         "mean_minus_base": np.mean(nested_col) - base_col,
    ...     }
    >>>
    >>> # apply the function
    >>> nf.reduce(example_func, "a", "nested.t")
            mean  mean_minus_base
    0  11.533440        11.116418
    1  10.307751         9.587426
    2   8.294042         8.293928
    3   9.655291         9.352958
    4  10.687591        10.540836

    You may want the result of a `reduce` call to have nested structure,
    we can achieve this by using the `infer_nesting` kwarg:

    >>> # define a custom user function that returns nested structure
    >>> def example_func(base_col1, base_col2, nested_col):
    ...     '''reduce will return a NestedFrame with nested structure'''
    ...     return {"offsets.t_a": nested_col - base_col1,
    ...             "offsets.t_b": nested_col - base_col2}

    By giving both output columns the prefix "offsets.", we signal
    to reduce to infer that these should be packed into a nested column
    called "offsets".

    >>> # apply the function with `infer_nesting` (True by default)
    >>> nf.reduce(example_func, "a", "b", "nested.t")
                                              offsets
    0    [{t_a: 7.966868, t_b: 8.199213}; …] (5 rows)
    1   [{t_a: 12.984066, t_b: 13.33187}; …] (5 rows)
    2    [{t_a: 4.088931, t_b: 3.397924}; …] (5 rows)
    3  [{t_a: 17.260016, t_b: 16.768814}; …] (5 rows)
    4   [{t_a: 0.400996, t_b: -0.529882}; …] (5 rows)

    Notes
    -----
    By default, `reduce` will produce a `NestedFrame` with enumerated
    column names for each returned value of the function. For more useful
    naming, it's recommended to have `func` return a dictionary where each
    key is an output column of the dataframe returned by `reduce` (as
    shown above).
    """
    # Every positional argument must name a known column. Anything else is
    # rejected outright: extra function arguments have to travel through kwargs.
    requested_columns = []
    for arg in args:
        if not isinstance(arg, str):
            raise TypeError(
                f"Received an argument '{arg}' that is not a string. "
                "All arguments to `reduce` must be strings corresponding to"
                " column names to pass along to the function. If your function"
                " has additional arguments, pass them as kwargs (arg_name=value)."
            )
        components = self._parse_hierarchical_components(arg)
        if not self._is_known_column(components):
            raise ValueError(
                f"Received a string argument '{arg}' that was not found in the columns list. "
                "All arguments to `reduce` must be strings corresponding to"
                " column names to pass along to the function. If your function"
                " has additional arguments, pass them as kwargs (arg_name=value)."
            )
        # A single component is a base column; otherwise the first component is the nested layer
        layer = "base" if len(components) < 2 else components[0]
        requested_columns.append((layer, components[-1]))

    if not requested_columns:
        raise ValueError("No columns in `*args` specified to apply function to")

    # One per-row iterator for each requested column, in the order requested
    iterators = [
        self[col] if layer == "base" else self[layer].array.iter_field_lists(col)
        for layer, col in requested_columns
    ]

    results = [func(*cols, **kwargs) for cols in zip(*iterators, strict=True)]
    results_nf = NestedFrame(results, index=self.index)

    if infer_nesting:
        # Only string labels can follow the "<layer>.<field>" naming scheme;
        # other labels (e.g. enumerated integer outputs) are left untouched.
        nested_cols = list(
            np.unique(
                [
                    column.split(".", 1)[0]
                    for column in results_nf.columns
                    if isinstance(column, str) and "." in column
                ]
            )
        )

        # Pack each group of "<layer>.*" columns into a single nested column
        for layer in nested_cols:
            prefix = f"{layer}."
            layer_cols = [
                col for col in results_nf.columns if isinstance(col, str) and col.startswith(prefix)
            ]
            rename_df = results_nf[layer_cols].rename(columns=lambda x: x.split(".", 1)[1])
            nested_col = pack_lists(rename_df, name=layer)
            results_nf = results_nf[
                [
                    col
                    for col in results_nf.columns
                    if not (isinstance(col, str) and col.startswith(prefix))
                ]
            ]
            results_nf[layer] = nested_col

    if append_columns:
        # Append the results to the original NestedFrame
        return pd.concat([self, results_nf], axis=1)

    # Otherwise, return the results as a new NestedFrame
    return results_nf
def _apply_njit_map_rows(self, requested_columns, func):
"""
Apply njit map_rows to njit custom function with requested_columns.
Currently only supports 1 or 2 arguments custom function.
"""
try:
import numba # noqa
except ImportError as err:
raise ImportError(
"njit=True requires numba, please install with pip install numba"
"or conda install conda-forge::numba"
) from err
from . import njit_funcs
if len(requested_columns) == 1:
layer, col_name = requested_columns[0]
if layer == "base":
base_col = np.asarray(self[col_name])
results = njit_funcs._map_rows_njit1_base(func, base_col)
else:
nested_array = self[layer]
offsets = np.asarray(nested_array.array.list_offsets)
nested_col = np.asarray(nested_array[col_name])
results = njit_funcs._map_rows_njit1_nested(func, offsets, nested_col)
else:
# 2 requested columns for 2-arg custom function
layer1, col1_name = requested_columns[0]
layer2, col2_name = requested_columns[1]
if layer1 == "base" and layer2 == "base":
base_col1 = np.asarray(self[col1_name])
base_col2 = np.asarray(self[col2_name])
results = njit_funcs._map_rows_njit2_base_base(func, base_col1, base_col2)
elif layer1 == "base":
base_col1 = np.asarray(self[col1_name])
nested_array2 = self[layer2]
offsets = np.asarray(nested_array2.array.list_offsets)
col2 = np.asarray(nested_array2[col2_name])
results = njit_funcs._map_rows_njit2_base_nest(func, base_col1, offsets, col2)
elif layer2 == "base":
nested_array1 = self[layer1]
offsets = np.asarray(nested_array1.array.list_offsets)
col1 = np.asarray(nested_array1[col1_name])
base_col2 = np.asarray(self[col2_name])
results = njit_funcs._map_rows_njit2_nest_base(func, offsets, col1, base_col2)
else:
nested_array1 = self[layer1]
nested_array2 = self[layer2]
offsets1 = np.asarray(nested_array1.array.list_offsets)
offsets2 = np.asarray(nested_array2.array.list_offsets)
col1 = np.asarray(nested_array1[col1_name])
col2 = np.asarray(nested_array2[col2_name])
results = njit_funcs._map_rows_njit2_nest_nest(func, offsets1, offsets2, col1, col2)
return results.tolist()
[docs]
def map_rows(
    self,
    func: Callable[..., Any],
    columns: None | str | list[str] = None,
    *,
    row_container: Literal["dict"] | Literal["args"] = "dict",
    output_names: None | str | list[str] = None,
    infer_nesting: bool = True,
    append_columns: bool = False,
    njit: bool = False,
    **kwargs,
) -> NestedFrame:  # type: ignore[override]
    """
    Takes a function and applies it to each top-level row of the NestedFrame.

    Nested columns are packaged alongside base columns and available for function use, where base columns
    are passed as scalars and nested columns are passed as numpy arrays. The way in which the row data is
    packaged is configurable (by default, a dictionary) and controlled by the `row_container` argument.

    Parameters
    ----------
    func : callable
        Function to apply to each row. It receives the row data packaged
        as described by `row_container`, followed by `kwargs`. See the
        Notes for recommendations on writing func outputs.
    columns : None | str | list of str
        Specifies which columns to pass to the function in the row_container format.
        If None, all columns are passed. If list of str, those columns are passed.
        If str, a single column is passed or if the string is a nested column, then all nested sub-columns
        are passed (e.g. columns="nested" passes all columns of the nested dataframe "nested"). To pass
        individual nested sub-columns, use the hierarchical column name (e.g. columns=["nested.t",...]).
    row_container : 'dict' or 'args', default 'dict'
        Specifies how the row data will be packaged when passed as an input to the function.
        If 'dict', the function will be called as `func({"col1": value, ...}, **kwargs)`, so func should
        expect a single dictionary input with keys corresponding to column names.
        If 'args', the function will be called as `func(value, ..., **kwargs)`, so func should expect
        positional arguments corresponding to the columns specified in `columns`.
    output_names : None | str | list of str
        Specifies the names of the output columns in the resulting NestedFrame. If None, the function
        will return whatever names the user function returns. If specified will override any names
        returned by the user function provided the number of names matches the number of outputs. When not
        specified and the user function returns values without names (e.g. a list or tuple), the output
        columns will be enumerated (e.g. "0", "1", ...).
    infer_nesting : bool, default True
        If True, the function will pack output columns into nested
        structures based on column names adhering to a nested naming
        scheme. E.g. "nested.b" and "nested.c" will be packed into a column
        called "nested" with columns "b" and "c". If False, all outputs
        will be returned as base columns. Note that this will trigger off of names specified in
        `output_names` in addition to names returned by the user function.
    append_columns : bool, default False
        If True, the output columns are appended to those in the original NestedFrame.
        The output columns can contain nested sub-columns, which should be specified using their
        hierarchical column name (e.g. "nested.x"). If their base nested column exists in the
        original NestedFrame, the new output sub-columns will be added into the frame of the
        existing nested column. See an example below.
    njit : bool, default False
        If True, the function will try to use numba's njit to speed up the execution.
        This will only work if the custom function is compatible with njit and the requested columns
        are at most two.
        Note that using njit will disable support for `row_container="dict"`.
    kwargs : keyword arguments, optional
        Keyword arguments to pass to the function.

    Returns
    -------
    `NestedFrame`
        `NestedFrame` with the results of the function applied to the columns of the frame.

    Raises
    ------
    TypeError
        If an entry of `columns` is not a string.
    ValueError
        If `row_container` is not 'dict' or 'args', if a requested column
        is not found, if `njit=True` is combined with `row_container='dict'`
        or the njit execution fails, or if the number of `output_names`
        does not match the number of outputs of the function.

    Examples
    --------
    >>> from nested_pandas.datasets.generation import generate_data
    >>> import numpy as np
    >>> nf = generate_data(5,5, seed=1)

    >>> # define a custom user function
    >>> # map_rows will return a NestedFrame with two columns
    >>> def example_func(row):
    ...     return np.mean(row["nested.t"]), np.mean(row["nested.t"]) - row["a"]

    >>> # apply the function
    >>> nf.map_rows(example_func, output_names=["mean", "mean_minus_base"])
            mean  mean_minus_base
    0  11.533440        11.116418
    1  10.307751         9.587426
    2   8.294042         8.293928
    3   9.655291         9.352958
    4  10.687591        10.540836

    We can pass along only the columns we need for the function using the `columns` argument, which
    removes the performance overhead of packaging all columns for each row:

    >>> nf.map_rows(example_func, columns=["a", "nested.t"], output_names=["mean", "mean_minus_base"])
            mean  mean_minus_base
    0  11.533440        11.116418
    1  10.307751         9.587426
    2   8.294042         8.293928
    3   9.655291         9.352958
    4  10.687591        10.540836

    Alternatively, we can pass along the row data as positional arguments
    instead of a dictionary by setting `row_container="args"` and adjusting
    our function signature accordingly:

    >>> def example_func(a, time):
    ...     return np.mean(time), np.mean(time) - a
    >>> nf.map_rows(example_func,
    ...             columns=["a", "nested.t"],
    ...             output_names=["mean", "mean_minus_base"],
    ...             row_container="args")
            mean  mean_minus_base
    0  11.533440        11.116418
    1  10.307751         9.587426
    2   8.294042         8.293928
    3   9.655291         9.352958
    4  10.687591        10.540836

    Additional arguments that don't depend on row data can be passed as kwargs:

    >>> def example_func(row, scale):
    ...     return np.mean(row["nested.t"]) * scale
    >>> nf.map_rows(example_func, columns=["nested.t"], output_names="mean", scale=1)
            mean
    0  11.533440
    1  10.307751
    2   8.294042
    3   9.655291
    4  10.687591

    Functions that target a single nested structure can just pass along
    the nested column name and all sub-columns will be available:

    >>> def first_val(row):
    ...     return {"first_"+key.split(".")[1]:row[key][0] for key in row.keys()}
    >>> nf.map_rows(first_val, columns="nested")
         first_t  first_flux  first_flux_error  first_band
    0   8.383890   31.551563               1.0           r
    1  13.704390   68.650093               1.0           g
    2   4.089045   83.462567               1.0           g
    3  17.562349    1.828828               1.0           g
    4   0.547752   75.014431               1.0           g

    You may want the result of a `map_rows` call to have nested structure,
    we can achieve this by using the `infer_nesting` kwarg:

    >>> # define a custom user function that returns nested structure
    >>> def example_func(row):
    ...     '''map_rows will return a NestedFrame with nested structure'''
    ...     return {"offsets.t_a": row["nested.t"] - row["a"],
    ...             "offsets.t_b": row["nested.t"] - row["b"]}

    By giving both output columns the prefix "offsets.", we signal
    to map_rows to infer that these should be packed into a nested column
    called "offsets".

    >>> # apply the function with `infer_nesting` (True by default)
    >>> nf.map_rows(example_func, columns=["a", "b", "nested.t"], infer_nesting=True)
                                              offsets
    0    [{t_a: 7.966868, t_b: 8.199213}; …] (5 rows)
    1   [{t_a: 12.984066, t_b: 13.33187}; …] (5 rows)
    2    [{t_a: 4.088931, t_b: 3.397924}; …] (5 rows)
    3  [{t_a: 17.260016, t_b: 16.768814}; …] (5 rows)
    4   [{t_a: 0.400996, t_b: -0.529882}; …] (5 rows)

    You may also want to append the output columns to the original NestedFrame.
    We can achieve this by using the `append_columns` kwarg:

    >>> # define a custom user function that creates a nested sub-column
    >>> def example_func(row):
    ...     '''map_rows will return a sub-column for the existing 'nested' column'''
    ...     return row["nested.t"] - row["a"]

    >>> # apply the function with `append_columns` (False by default)
    >>> nf.map_rows(example_func,
    ...             columns=["a", "nested.t"],
    ...             output_names=["nested.t_a"],
    ...             append_columns=True)
              a         b                                             nested
    0  0.417022  0.184677  [{t: 8.38389, flux: 31.551563, flux_error: 1.0...
    1  0.720324  0.372520  [{t: 13.70439, flux: 68.650093, flux_error: 1....
    2  0.000114  0.691121  [{t: 4.089045, flux: 83.462567, flux_error: 1....
    3  0.302333  0.793535  [{t: 17.562349, flux: 1.828828, flux_error: 1....
    4  0.146756  1.077633  [{t: 0.547752, flux: 75.014431, flux_error: 1....

    Notes
    -----
    If concerned about performance, specify `columns` to only include the columns
    needed for the function, as this will avoid the overhead of packaging
    all columns for each row.

    By default, `map_rows` will produce a `NestedFrame` with enumerated
    column names for each returned value of the function. It's recommended
    to either specify `output_names` or have `func` return a dictionary
    where each key is an output column of the dataframe returned by
    `map_rows` (as shown above).

    >>> def example_func(row):
    ...     return np.mean(row["nested.t"]), np.mean(row["nested.t"]) - row["a"]

    >>> # first output column will be named "0", second "1"
    >>> nf.map_rows(example_func)
               0          1
    0  11.533440  11.116418
    1  10.307751   9.587426
    2   8.294042   8.293928
    3   9.655291   9.352958
    4  10.687591  10.540836
    """
    # Reject unknown containers explicitly; otherwise no branch below would
    # run and an empty result frame would be returned silently.
    if row_container not in ("dict", "args"):
        raise ValueError(f"Unknown row_container '{row_container}', expected one of 'dict' or 'args'.")

    # Determine args
    if columns is None:
        # If None, pass all columns, with nested columns expanded to sub-columns
        columns = self.base_columns + self.get_subcolumns(nested_columns="all")
    elif isinstance(columns, str):
        # If it's a nested column, grab all sub-columns
        columns = self.get_subcolumns(columns) if columns in self.nested_columns else [columns]

    # Check arg validity and split each name into (layer, column)
    requested_columns = []
    for arg in columns:
        if not isinstance(arg, str):
            raise TypeError(
                f"Received an argument '{arg}' that is not a string. "
                "All arguments to `map_rows` must be strings corresponding to"
                " column names to pass along to the function."
            )
        components = self._parse_hierarchical_components(arg)
        if not self._is_known_column(components):
            raise ValueError(
                f"Received a string argument '{arg}' that was not found in the columns list. "
                "All arguments to `map_rows` must be strings corresponding to"
                " column names to pass along to the function."
            )
        layer = "base" if len(components) < 2 else components[0]
        requested_columns.append((layer, components[-1]))

    # Construct row containers and apply
    if row_container == "dict":
        if njit:
            raise ValueError(
                "njit execution is not supported for `row_container='dict'`, "
                "use `row_container='args'` instead."
            )
        # Map each output key to a per-row iterator; nested sub-columns keep
        # their hierarchical "layer.column" name as the key.
        row_sources = {}
        for layer, col in requested_columns:
            if layer == "base":
                row_sources[col] = self[col]
            else:
                row_sources[".".join([layer, col])] = self[layer].array.iter_field_lists(col)
        keys = list(row_sources)
        results = [
            func(dict(zip(keys, row, strict=True)), **kwargs)
            for row in zip(*row_sources.values(), strict=True)
        ]
    elif njit:
        # row_container == "args" with numba execution
        try:
            results = self._apply_njit_map_rows(requested_columns, func)
        except Exception as err:
            raise ValueError(
                "njit execution for map_rows is only supported for "
                "numba.jit decorated functions with at most 2 arguments"
            ) from err
    else:
        # row_container == "args" with default python execution
        iterators = [
            self[col] if layer == "base" else self[layer].array.iter_field_lists(col)
            for layer, col in requested_columns
        ]
        results = [func(*cols, **kwargs) for cols in zip(*iterators, strict=True)]

    # If the func returns a single array per row wrap results in a `NestedSeries`.
    # Otherwise, Pandas will try to expand array elements into separate columns.
    if results and isinstance(results[0], np.ndarray):
        results_nf = NestedFrame(NestedSeries(results, index=self.index))
    else:
        results_nf = NestedFrame(results, index=self.index)

    # Override output names if specified
    if output_names is not None:
        if isinstance(output_names, str):
            output_names = [output_names]
        if len(output_names) != len(results_nf.columns):
            raise ValueError(
                f"Number of output names ({len(output_names)}) does not match "
                f"the number of outputs from the function ({len(results_nf.columns)})"
            )
        results_nf.columns = output_names

    if infer_nesting:
        # Only string labels can follow the "<layer>.<field>" naming scheme;
        # other labels (e.g. enumerated integer outputs) are left untouched.
        nested_cols = list(
            np.unique(
                [
                    column.split(".", 1)[0]
                    for column in results_nf.columns
                    if isinstance(column, str) and "." in column
                ]
            )
        )

        # Pack each group of "<layer>.*" columns into a single nested column
        for layer in nested_cols:
            prefix = f"{layer}."
            layer_cols = [
                col for col in results_nf.columns if isinstance(col, str) and col.startswith(prefix)
            ]
            rename_df = results_nf[layer_cols].rename(columns=lambda x: x.split(".", 1)[1])
            nested_col = pack_lists(rename_df, name=layer)
            results_nf = results_nf[
                [
                    col
                    for col in results_nf.columns
                    if not (isinstance(col, str) and col.startswith(prefix))
                ]
            ]
            results_nf[layer] = nested_col

    if append_columns:
        # Append sub-columns to nested columns that already exist in this frame.
        # Work on a local name rather than rebinding `self`.
        combined = self
        self_nested_cols = [col for col in results_nf.nested_columns if col in self.nested_columns]
        for col in self_nested_cols:
            for sub_col in results_nf.get_subcolumns(col):
                combined = combined.assign(**{f"{sub_col}": results_nf[sub_col]})
        # Append other base and nested columns
        base_results_nf = results_nf.drop(columns=self_nested_cols)
        return pd.concat([combined, base_results_nf], axis=1)

    # Otherwise, return the results as a new NestedFrame
    return results_nf
[docs]
def to_pandas(self, list_struct=False, large_list=False) -> pd.DataFrame:
    """Convert to an ordinary pandas DataFrame, with no NestedDtype series.

    Every nested column is replaced by the result of calling
    ``to_arrow_ext_array`` on its array; all other columns are carried
    over as they are.

    Parameters
    ----------
    list_struct : bool
        Forwarded to ``to_arrow_ext_array`` for every nested column. With
        the default (False) each row of a nested column becomes a struct
        of lists, as the example below shows. True requests the
        list-of-structs layout instead.
    large_list : bool
        If False (default), use regular ``list_`` (int32 offsets). Set to
        True to use ``large_list`` (int64 offsets), which is required when
        the total number of nested elements across all rows exceeds
        ``2**31 - 1``.

    Returns
    -------
    pd.DataFrame
        Ordinary pandas DataFrame.

    Examples
    --------
    >>> from nested_pandas.datasets.generation import generate_data
    >>> nf = generate_data(5,5, seed=1)
    >>> nf.to_pandas()
              a         b                                             nested
    0  0.417022  0.184677  {'t': array([ 8.38389029, 13.4093502 , 16.0148...
    1  0.720324  0.372520  {'t': array([13.70439001,  8.34609605, 19.3652...
    2  0.000114  0.691121  {'t': array([ 4.08904499, 11.17379657,  6.2684...
    3  0.302333  0.793535  {'t': array([17.56234873,  2.80773877, 13.8464...
    4  0.146756  1.077633  {'t': array([ 0.54775186,  3.96202978, 17.5277...
    """
    # Re-wrapping in pd.DataFrame sheds the NestedFrame subclass
    plain = pd.DataFrame(self)
    arrow_options = {"list_struct": list_struct, "large_list": large_list}
    for nested_name in self.nested_columns:
        nested_array = plain[nested_name].array
        plain[nested_name] = nested_array.to_arrow_ext_array(**arrow_options)
    return plain
[docs]
def to_parquet(self, path, large_list=False, **kwargs) -> None:
    """Write the data of a NestedFrame to a single parquet file, where
    each nested dataset is packed into its own column.

    Note that here we always opt to use the pyarrow engine for writing
    parquet files.

    Parameters
    ----------
    path : str
        The path to the parquet file
    large_list : bool
        If False (default), use regular ``list_`` (int32 offsets). Set to
        True to use ``large_list`` (int64 offsets), which is required when
        the total number of nested elements across all rows exceeds
        ``2**31 - 1``.
    kwargs : keyword arguments, optional
        Keyword arguments to pass to
        `pyarrow.parquet.write_table
        <https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html>`_

    Returns
    -------
    None

    Examples
    --------
    >>> from nested_pandas.datasets.generation import generate_data
    >>> nf = generate_data(5,5, seed=1)
    >>> nf.to_parquet("nestedframe.parquet") # doctest: +SKIP
    """
    plain_df = self.to_pandas(list_struct=False, large_list=large_list)

    # Write through pyarrow; this is potentially not zero-copy.
    # Because the pandas metadata is dropped below, index round-tripping is
    # not robust: preserve_index=None gives the best behavior, but the index
    # will generally need to be set manually on load.
    arrow_table = pa.Table.from_pandas(plain_df, preserve_index=None)

    # Rebuild the schema from its fields so the pandas metadata is dropped
    # and nested dtypes are not preserved in the written file.
    rebuilt_schema = pa.schema(list(arrow_table.schema))
    arrow_table = arrow_table.cast(rebuilt_schema)

    return pq.write_table(arrow_table, path, **kwargs)