Source code for nested_pandas.nestedframe.core

# typing.Self and "|" union syntax don't exist in Python 3.9
from __future__ import annotations

from collections import defaultdict
from collections.abc import Callable
from typing import Literal

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from deprecated import deprecated
from pandas._libs import lib
from pandas._typing import Any, AnyAll, Axis, Hashable, IndexLabel, Mapping
from pandas.api.extensions import no_default
from pandas.core.computation.eval import Expr, ensure_scope
from pandas.core.dtypes.common import is_bool_dtype
from pandas.core.dtypes.inference import is_list_like

from nested_pandas.nestedframe.expr import (
    _identify_aliases,
    _NestResolver,
    _SeriesFromNest,
    _subexprs_by_nest,
)
from nested_pandas.series.dtype import NestedDtype
from nested_pandas.series.ext_array import NestedExtensionArray
from nested_pandas.series.nestedseries import NestedSeries
from nested_pandas.series.packer import pack, pack_lists, pack_sorted_df_into_struct

# Global pandas display defaults applied when this module is imported.
# NestedFrame._repr_html_ reads both options to decide how many rows to render.
pd.set_option("display.max_rows", 30, "display.min_rows", 5)



[docs]
class NestedFrame(pd.DataFrame):
    """A Pandas Dataframe extension with support for nested structure.

    See https://pandas.pydata.org/docs/development/extending.html#subclassing-pandas-data-structures
    """

    # https://pandas.pydata.org/docs/development/extending.html#arithmetic-with-3rd-party-types
    # The __pandas_priority__ of DataFrame is 4000, so give NestedFrame a
    # higher priority, so that binary operations involving this class and
    # Series produce instances of this class, preserving the type and origin.
    __pandas_priority__ = 4500

    # The "_aliases" attribute is usually None or not even present, but when it is present,
    # it indicates that an evaluation is in progress, and that columns and fields with names
    # that are not identifier-like have been aliased to cleaned names, and this attribute
    # contains those aliases, keyed by the cleaned name.
    _metadata = ["_aliases"]


[docs]
    def __init__(self, *args, **kwargs) -> None:
        """Build the frame like a ``pd.DataFrame``, then cast supported arrow columns to nested.

        All positional and keyword arguments are forwarded unchanged to the
        parent constructor.
        """
        super().__init__(*args, **kwargs)
        # struct_list=False: struct-typed arrow columns are skipped by the cast.
        self._cast_cols_to_nested(struct_list=False)


    def _cast_cols_to_nested(self, *, struct_list: bool) -> None:
        """Convert eligible arrow-typed columns into nested columns, in place.

        Parameters
        ----------
        struct_list : bool
            When `False`, columns whose arrow type is a struct are skipped, so
            only list-struct columns are cast. When `True`, struct-list
            columns are considered as well, provided the nested extension
            array reports their arrow type as supported.
        """
        for col_name, col_dtype in self.dtypes.items():
            # Only arrow-backed columns are candidates for the cast
            if isinstance(col_dtype, pd.ArrowDtype):
                arrow_type = col_dtype.pyarrow_dtype
                skip_struct = pa.types.is_struct(arrow_type) and not struct_list
                if not skip_struct and NestedExtensionArray.is_input_pa_type_supported(arrow_type):
                    self[col_name] = NestedExtensionArray(pa.array(self[col_name]))

    @property
    def _constructor(self) -> Self:  # type: ignore[name-defined] # noqa: F821
        """Return the ``NestedFrame`` class itself.

        This is one of the pandas subclassing hooks described on the page linked
        in the class docstring.
        """
        return NestedFrame

    @property
    def _constructor_expanddim(self) -> Self:  # type: ignore[name-defined] # noqa: F821
        """Return the ``NestedFrame`` class itself.

        NOTE(review): presumably the pandas subclassing hook for results with one
        more dimension than the caller -- confirm against the pandas
        subclassing guide linked in the class docstring.
        """
        return NestedFrame

    @property
    def all_columns(self) -> dict:
        """Map "base" and every nested column name to the columns each one holds.

        The "base" entry holds the top-level columns of this frame; every
        nested column contributes an entry holding its own sub-columns.
        """
        dtypes = self.dtypes
        columns_by_frame = {"base": self.columns}
        columns_by_frame.update(
            (name, self[name].columns) for name in self.columns if isinstance(dtypes[name], NestedDtype)
        )
        return columns_by_frame

    @property
    def nested_columns(self) -> list:
        """List the names of the columns whose dtype is a ``NestedDtype``."""

        def _is_nested(dtype) -> bool:
            return isinstance(dtype, NestedDtype)

        selector = self.dtypes.apply(_is_nested)
        selected = self.columns[selector]
        return selected.tolist()

    @property
    def base_columns(self) -> list[str]:
        """List the names of the columns whose dtype is not a ``NestedDtype``."""

        def _is_base(dtype) -> bool:
            return not isinstance(dtype, NestedDtype)

        base_mask = self.dtypes.apply(_is_base)
        selected = self.columns[base_mask]
        return selected.tolist()

    def _repr_html_(self) -> str | None:
        """Override html representation

        Frames without nested columns, and frames without rows, are rendered
        with ``DataFrame.to_html``. Otherwise every nested cell is rendered as
        a small embedded HTML table through a ``Styler``, and a
        "<rows> rows x <columns> columns" footer is appended when a row limit
        is configured.

        The ``display.max_rows`` and ``display.min_rows`` pandas options
        control truncation: a frame longer than ``max_rows`` is cut down to
        ``min_rows`` rows.

        Returns
        -------
        str or None
            The HTML markup for the frame.
        """
        max_rows = pd.get_option("display.max_rows")
        min_rows = pd.get_option("display.min_rows")

        # Without nested columns (or empty), just do representation as normal
        if len(self.nested_columns) == 0 or len(self) == 0:
            # This mimics pandas behavior
            if max_rows is None:
                # No row limit is configured, so nothing is truncated
                return super().to_html(max_rows=None, show_dimensions=True)
            if self.shape[0] > max_rows:
                return super().to_html(max_rows=min_rows, show_dimensions=True)
            return super().to_html(max_rows=max_rows, show_dimensions=True)

        # Nested Column Formatting

        # Display nested columns as small html dataframes with a single row
        def repack_row(chunk, header=True):
            # If the chunk is None or empty, return None (displayed same as Null)
            if chunk is None or len(chunk) == 0:
                return None
            n_rows = len(chunk)

            if n_rows <= 2:
                # For 1 or 2 rows, show all rows without a footer
                chunk = chunk.round(8)
                max_rows_html = n_rows
            else:
                # For 3+ rows, show first row and a "+N rows" footer
                chunk = chunk.head(1).round(8)
                # Cast to object so the textual info row can be appended below.
                # The result must be assigned: astype returns a new frame.
                chunk = chunk.astype(object)
                len_row = pd.DataFrame(
                    {
                        col: [f"<i>+{n_rows - 1} rows</i>"] if i == 0 else ["..."]
                        for i, col in enumerate(chunk.columns)
                    }
                )
                chunk = pd.concat([chunk, len_row], ignore_index=True)
                max_rows_html = 2

            # escape=False keeps the <i> markup of the footer row intact
            return chunk.to_html(
                max_rows=max_rows_html,
                max_cols=5,
                show_dimensions=False,
                index=False,
                header=header,
                escape=False,
            )

        # Handle sizing, trim html dataframe if output will be truncated
        n_rows_total, n_cols_total = self.shape  # grab original shape information for later
        truncated = max_rows is not None and n_rows_total > max_rows
        html_df = self.head(min_rows + 1) if truncated else self.copy()

        # replace index to ensure proper behavior for duplicate index values
        index_values = html_df.index
        html_df = html_df.reset_index(drop=True)
        # Named `styler` rather than `repr` to avoid shadowing the builtin
        styler = html_df.style.format({col: repack_row for col in self.nested_columns})

        # Create a mapping function to retrieve original index
        def map_true_index(index):
            return index_values[index]

        styler = styler.format_index(map_true_index, axis=0)

        # Recover some truncation formatting, limited to head truncation
        if max_rows is None:
            # No row limit is configured: render with max_rows=0, the same value the
            # untruncated branch below uses to display all rows. No dimension footer
            # is appended in this branch.
            return styler.to_html(max_rows=0)
        if truncated:
            # when over the max_rows threshold, display with truncation ("..." row at the end)
            html_repr = styler.to_html(max_rows=min_rows)
        else:
            # when under the max_rows threshold, display all rows (behavior of 0 here)
            html_repr = styler.to_html(max_rows=0)

        # Manually append dimensionality to a styler output
        html_repr += f"{n_rows_total} rows x {n_cols_total} columns"

        return html_repr

    def _parse_hierarchical_components(self, delimited_path: str, delimiter: str = ".") -> list[str]:
        """
        Split a possibly delimited path into its component names.

        Backticks protect component names that themselves contain the delimiter.
        When no alias table is attached to this frame, the path is scanned for
        such names first; each component is then mapped back through the alias
        table, falling back to the component itself.
        """
        alias_map = getattr(self, "_aliases", None)
        path = delimited_path
        if alias_map is None:
            path, alias_map = _identify_aliases(path)
        parts = path.split(delimiter)
        return [alias_map.get(part, part) for part in parts]

    def _is_known_hierarchical_column(self, components: list[str] | str) -> bool:
        """Tell whether the input names an existing field of a nested column.

        A string input is parsed into components first.
        """
        parts = self._parse_hierarchical_components(components) if isinstance(components, str) else components
        # A hierarchical name needs at least a nested column and a field
        if len(parts) < 2:
            return False
        base_name = parts[0]
        if not self._is_nested_column(base_name):
            return False
        field_name = ".".join(parts[1:])
        return field_name in self.dtypes[base_name].column_dtypes

    def _is_nested_column(self, col: str):
        """Tell whether `col` is a column of this frame with a nested dtype."""
        if col not in self.columns:
            return False
        return isinstance(self.dtypes[col], NestedDtype)

    def _is_known_column(self, components: list[str] | str) -> bool:
        """Tell whether the input names a top-level column or a field of a nested column.

        A string input is parsed into components first.
        """
        parts = self._parse_hierarchical_components(components) if isinstance(components, str) else components
        flat_name = ".".join(parts)
        return flat_name in self.columns or self._is_known_hierarchical_column(parts)

    def __getitem__(self, item):
        """Item access with support for nested columns.

        String keys and lists of string keys get nested-aware handling;
        every other key is handled by pandas.
        """
        if isinstance(item, str):
            handler = self._getitem_str
        elif self._is_key_list(item):
            handler = self._getitem_list
        else:
            handler = super().__getitem__
        return handler(item)

    def _getitem_str(self, item):
        """Resolve a single string key.

        The key is tried, in order, as a nested column, as a top-level column
        taken literally, as a top-level column after backtick cleaning, and
        finally as a "nested.field" path, which yields the flat series of that
        field. A ``KeyError`` is raised when none of these match.
        """
        if self._is_nested_column(item):
            return NestedSeries(super().__getitem__(item))
        # A literal match on a base column wins over any dot/backtick parsing
        if item in self.columns:
            return super().__getitem__(item)

        parts = self._parse_hierarchical_components(item)
        # Backticks may have been used even when not required, so retry with the
        # cleaned-up full name.
        cleaned_item = ".".join(parts)
        if cleaned_item in self.columns:
            return super().__getitem__(cleaned_item)

        if not self._is_known_hierarchical_column(parts):
            raise KeyError(f"Column '{cleaned_item}' not found in nested columns or base columns")

        # Return the flat (not list) series of the sub-column, which natively
        # supports operations such as ndf["nested.a"] + 3
        nested_name = parts[0]
        field = ".".join(parts[1:])
        flat = self[nested_name].nest.to_flat(columns=[field])
        return flat[field]

    @staticmethod
    def _is_key_list(item):
        if not is_list_like(item):
            return False
        if is_bool_dtype(item):
            return False
        return all(isinstance(k, str) for k in item)

    def _getitem_list(self, item):
        """Select several columns at once, allowing "nested.field" keys.

        Top-level columns are copied as they are. The requested fields are
        grouped by nested column, and each nested column in the result holds
        only the fields that were asked for. Unknown keys raise ``KeyError``.
        """
        missing = [key for key in item if not self._is_known_column(key)]
        if missing:
            raise KeyError(f"{missing} not in index")

        top_level_keys = [key for key in item if key in self.columns]
        result = super().__getitem__(top_level_keys).copy()

        # Group the requested sub-columns by the nested column they belong to
        fields_by_nest = defaultdict(list)
        for key in item:
            parts = self._parse_hierarchical_components(key)
            if self._is_known_hierarchical_column(parts):
                fields_by_nest[parts[0]].append(".".join(parts[1:]))

        for nest_name, fields in fields_by_nest.items():
            result[nest_name] = self[nest_name].nest[fields]
        return result

    def __setitem__(self, key, value):
        """Custom __setitem__ for NestedFrame: auto-nest DataFrame assignment to new columns.

        Supported assignments, checked in this order:

        - a DataFrame whose columns are all nested: the columns are merged into
          a single nested column that is then assigned under `key`;
        - any other DataFrame under a new column name: it is packed and joined
          as a new nested column (left join on the index);
        - "nested.field" where "nested" is an existing nested column: the field
          is replaced or added;
        - "new_nested.field" where "new_nested" is not a nested column: a new
          nested column holding that single field is created;
        - anything else: plain pandas assignment, after which supported arrow
          columns are cast to nested.

        Raises
        ------
        ValueError
            If a hierarchical key has more than two components; only one level
            of nesting is supported.
        """
        # Special handling paths for assignment of dataframes to nested columns.
        # NestedFrame subclasses pd.DataFrame, so one isinstance check covers both.
        if isinstance(key, str) and isinstance(value, pd.DataFrame):
            # if all columns are NestedDtype, combine them into a single nested column.
            # A frame without columns must not take this path: there would be
            # nothing to combine.
            all_nested = value.shape[1] > 0 and all(isinstance(dtype, NestedDtype) for dtype in value.dtypes)
            if all_nested:
                first_col, *other_cols = value.columns
                new_nested = value[first_col]
                for col in other_cols:
                    nested_col = value[col]
                    # there must be a better way than through list columns
                    list_cols = nested_col.to_lists()
                    for column in nested_col.columns:
                        new_nested = new_nested.nest.set_list_column(column, list_cols[column])
                value = new_nested
            # Assign a DataFrame as a new column, auto-nesting it
            elif key not in self.columns:
                # Note this uses the default approach for join_nested, which is a left join on index
                new_df = self.join_nested(value, name=key)
                self._update_inplace(new_df)
                return

        # Non-string keys (integer labels, tuples, lists of labels, masks, ...) cannot
        # use dot notation and cannot be parsed as a delimited path; defer to pandas.
        if not isinstance(key, str):
            super().__setitem__(key, value)
            self._cast_cols_to_nested(struct_list=False)
            return

        components = self._parse_hierarchical_components(key)
        # Replacing or adding columns to a nested structure
        # Allows statements like ndf["nested.t"] = ndf["nested.t"] - 5
        # Or ndf["nested.base_t"] = ndf["nested.t"] - 5
        # Performance note: This requires building a new nested structure
        # TODO: Support assignment of a new column to an existing nested col from a list series
        targets_existing_nest = self._is_known_hierarchical_column(components) or (
            len(components) > 1 and components[0] in self.nested_columns
        )
        if targets_existing_nest:
            if len(components) != 2:
                raise ValueError(f"Only one level of nesting is supported; given {key}")
            nested, field = components
            # Support a special case of embedding a base column into a nested column, with values being
            # repeated in each nested list-array.
            if isinstance(value, pd.Series) and self.index.equals(value.index):
                new_nested_series = self[nested].nest.set_filled_column(field, value)
            else:
                new_nested_series = self[nested].nest.set_flat_column(field, value)
            return super().__setitem__(nested, new_nested_series)

        # Adding a new nested structure from a column
        # Allows statements like ndf["new_nested.t"] = ndf["nested.t"] - 5
        if len(components) > 1:
            if len(components) != 2:
                # Same explicit error as for existing nested columns, instead of an
                # opaque tuple-unpacking failure.
                raise ValueError(f"Only one level of nesting is supported; given {key}")
            new_nested, field = components
            if isinstance(value, pd.Series):
                # Name the resulting column without renaming the caller's series
                value = value.to_frame(name=field)
            new_df = self.join_nested(value, name=new_nested)
            self._update_inplace(new_df)
            return None

        super().__setitem__(key, value)
        self._cast_cols_to_nested(struct_list=False)

    def __delitem__(self, key):
        """Delete a column or a nested field using dot notation (e.g., del nf['nested.x'])

        The deletion is delegated to `drop` along the column axis and is done
        in place.
        """
        self.drop([key], axis=1, inplace=True)


[docs]
    def get_subcolumns(self, nested_columns="all") -> list[str]:
        """Returns a list of all subcolumn names from a set of nested columns, including dot notation

        Parameters
        ----------
        nested_columns : 'all' or str or list of str, optional
            The nested columns to get subcolumns from. Default is 'all', which means all nested columns.

        Returns
        -------
        list of str
            A list of subcolumn names in dot notation, e.g. 'nested.a'

        Examples
        --------
        >>> from nested_pandas.datasets import generate_data

        >>> nf = generate_data(5,10, seed=1)
        >>> nf["nested2"] = nf["nested"]  # create a second nested column for demonstration
        >>> nf.get_subcolumns()  # doctest: +NORMALIZE_WHITESPACE
        ['nested.t', 'nested.flux', 'nested.flux_error', 'nested.band',
        'nested2.t', 'nested2.flux', 'nested2.flux_error', 'nested2.band']

        >>> nf.get_subcolumns("nested")
        ['nested.t', 'nested.flux', 'nested.flux_error', 'nested.band']
        """
        # Only compare against "all" when a string was given: comparing an
        # array-like with a string is elementwise and has no single truth value.
        if isinstance(nested_columns, str):
            # By default, get all subcolumns from all nested columns
            nested_columns = self.nested_columns if nested_columns == "all" else [nested_columns]

        subcols = []
        for nested_column in nested_columns:
            subcols.extend(f"{nested_column}.{col}" for col in self[nested_column].columns)

        # I don't believe we need an error if we don't find any, as upstream errors will always trigger
        # on wrong column names
        return subcols


    @deprecated(
        version="0.6.0",
        reason="`add_nested` will be removed in version 0.7.0, use `join_nested` instead.",
    )
    def add_nested(
        self,
        obj,
        name: str,
        *,
        how: str = "left",
        on: None | str | list[str] = None,
        dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
    ) -> Self:  # type: ignore[name-defined] # noqa: F821
        """Packs input object to a nested column and adds it to the NestedFrame

        This method returns a new NestedFrame with the added nested column.

        .. deprecated:: 0.6.0
            `add_nested` will be removed in version 0.7.0, use `join_nested`
            instead. This method forwards all of its arguments to
            `join_nested`.

        Parameters
        ----------
        obj : pd.DataFrame or a sequence of items convertible to nested structures
            The object to be packed into nested pd.Series and added to
            the NestedFrame. If a DataFrame is passed, it must have non-unique
            index values, which are used to pack the DataFrame. If a sequence
            of elements is passed, it is packed into a nested pd.Series.
            Sequence elements may be individual pd.DataFrames, dictionaries
            (keys are nested column names, values are arrays of the same
            length), or any other object convertible to pa.StructArray.
            Additionally, None and pd.NA are allowed as elements to represent
            missing values.
        name : str
            The name of the nested column to be added to the NestedFrame.
        how : {'left', 'right', 'outer', 'inner'}, default: 'left'
            How to handle the operation of the two objects:

            - left: use calling frame's index.
            - right: use the calling frame's index and order but drop values
              not in the other frame's index.
            - outer: form union of calling frame's index with other frame's
              index, and sort it lexicographically.
            - inner: form intersection of calling frame's index with other
              frame's index, preserving the order of the calling index.
        on : str or list of str, default: None
            Column(s) in the calling frame to join on instead of the index.
            The original index is always preserved. The column(s) are used
            only as join keys and are dropped from the nested structure.
        dtype : dtype or None
            NestedDtype to use for the nested column; pd.ArrowDtype or
            pa.DataType can also be used to specify the nested dtype. If None,
            the dtype is inferred from the input object.

        Returns
        -------
        NestedFrame
            A new NestedFrame with the added nested column.

        Examples
        --------

        >>> import nested_pandas as npd

        >>> nf = npd.NestedFrame({"a": [1, 2, 3], "b": [4, 5, 6]},
        ...            index=[0,1,2])
        >>> nf2 = npd.NestedFrame({"c":[1,2,3,4,5,6,7,8,9]},
        ...             index=[0,0,0,1,1,1,2,2,2])
        >>> # By default, aligns on the index
        >>> nf.add_nested(nf2, "nested")
           a  b                nested
        0  1  4  [{c: 1}; …] (3 rows)
        1  2  5  [{c: 4}; …] (3 rows)
        2  3  6  [{c: 7}; …] (3 rows)

        >>> # We can also align on columns. The index is preserved.
        >>> nf = npd.NestedFrame({"a": [1,2,2,3], "b": [4,4,5,6]}).set_index(["a", "b"])
        >>> nf2 = npd.NestedFrame({"a": [1,2,2,2], "b": [4,4,4,5], "c": [1,2,3,4]})
        >>> nf.join_nested(nf2, "nested", on=["a", "b"]) # doctest: +NORMALIZE_WHITESPACE
                            nested
        a b
        1 4              [{c: 1}]
        2 4  [{c: 2}; …] (2 rows)
          5              [{c: 4}]
        3 6                  None
        """
        # Deprecated alias: every argument is passed through to join_nested unchanged.
        return self.join_nested(obj, name, how=how, on=on, dtype=dtype)


[docs]
    def join_nested(
        self,
        obj,
        name: str,
        *,
        how: str = "left",
        on: None | str | list[str] = None,
        dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
    ) -> Self:  # type: ignore[name-defined] # noqa: F821
        """Pack the input into a nested column and join it to a copy of this frame.

        The calling frame is left untouched; a new NestedFrame is returned.

        Parameters
        ----------
        obj : pd.DataFrame or a sequence of items convertible to nested structures
            The data to pack into a nested pd.Series and join to the
            NestedFrame. A DataFrame must have non-unique index values, which
            are used to pack it. A sequence is packed element by element into
            a nested pd.Series; its elements may be individual pd.DataFrames,
            dictionaries (keys are nested column names, values are arrays of
            the same length), or any other object convertible to
            pa.StructArray. None and pd.NA are also allowed as elements and
            represent missing values.
        name : str
            The name of the nested column to be joined to the NestedFrame.
        how : {'left', 'right', 'outer', 'inner'}, default: 'left'
            How to handle the operation of the two objects:

            - left: use calling frame's index.
            - right: use the calling frame's index and order but drop values
              not in the other frame's index.
            - outer: form union of calling frame's index with other frame's
              index, and sort it lexicographically.
            - inner: form intersection of calling frame's index with other
              frame's index, preserving the order of the calling index.
        on : str or list of str, default: None
            Column(s) in the calling frame to join on instead of the index.
            The original index is always preserved. The column(s) are used
            only as join keys and are dropped from the nested structure.
        dtype : dtype or None
            NestedDtype to use for the nested column; pd.ArrowDtype or
            pa.DataType can also be used to specify the nested dtype. If None,
            the dtype is inferred from the input object.

        Returns
        -------
        NestedFrame
            A new NestedFrame with the joined nested column.

        Examples
        --------

        >>> import nested_pandas as npd

        >>> nf = npd.NestedFrame({"a": [1, 2, 3], "b": [4, 5, 6]},
        ...            index=[0,1,2])
        >>> nf2 = npd.NestedFrame({"c":[1,2,3,4,5,6,7,8,9]},
        ...             index=[0,0,0,1,1,1,2,2,2])
        >>> # By default, aligns on the index
        >>> nf.join_nested(nf2, "nested")
           a  b                nested
        0  1  4  [{c: 1}; …] (3 rows)
        1  2  5  [{c: 4}; …] (3 rows)
        2  3  6  [{c: 7}; …] (3 rows)

        >>> # We can also align on columns. The index is preserved.
        >>> nf = npd.NestedFrame({"a": [1,2,2,3], "b": [4,4,5,6]}).set_index(["a", "b"])
        >>> nf2 = npd.NestedFrame({"a": [1,2,2,2], "b": [4,4,4,5], "c": [1,2,3,4]})
        >>> nf.join_nested(nf2, "nested", on=["a", "b"]) # doctest: +NORMALIZE_WHITESPACE
                            nested
        a b
        1 4              [{c: 1}]
        2 4  [{c: 2}; …] (2 rows)
          5              [{c: 4}]
        3 6                  None
        """
        # Pack the input into a single nested series named `name`
        nested_series = pack(obj, name=name, on=on, dtype=dtype)
        joined = self.copy().join(nested_series, how=how, on=on)

        # join may hand back a plain DataFrame (for example with empty
        # dataframes), so restore the NestedFrame type when needed
        return joined if isinstance(joined, NestedFrame) else NestedFrame(joined)



[docs]
    def nest_lists(self, columns: list[str], name: str) -> NestedFrame:
        """Creates a new NestedFrame where the specified list-value columns are packed into a
        nested column.

        Parameters
        ----------
        columns : list[str]
            The list-value columns that should be packed into a nested column.
            All columns in the list will attempt to be packed into a single
            nested column with the name provided in `name`.
        name : str
            The column name of the new nested column which we will pack the list-value
            columns into. This column will be added to the NestedFrame.

        Returns
        -------
        NestedFrame
            A new NestedFrame with the added nested column

        Examples
        --------

        >>> import nested_pandas as npd
        >>> nf = npd.NestedFrame({"c":[1,2,3], "d":[2,4,6],
        ...                   "e":[[1,2,3], [4,5,6], [7,8,9]]},
        ...                   index=[0,1,2])

        >>> nf.nest_lists(columns=["e"], name="nested")
           c  d                nested
        0  1  2  [{e: 1}; …] (3 rows)
        1  2  4  [{e: 4}; …] (3 rows)
        2  3  6  [{e: 7}; …] (3 rows)
        """

        # from_lists receives a copy of this frame; with only list_columns given,
        # it treats every other column as a base column.
        return NestedFrame.from_lists(self.copy(), list_columns=columns, name=name)



[docs]
    @classmethod
    def from_flat(cls, df, base_columns, nested_columns=None, on: str | None = None, name="nested"):
        """Build a NestedFrame from a flat dataframe by splitting its columns
        into base columns and one packed nested column.

        Parameters
        ----------
        df: pd.DataFrame or NestedFrame
            A flat dataframe.
        base_columns: list-like
            The columns kept as base (flat) columns of the output. Only the
            first row found for each index value is kept.
        nested_columns: list-like, or None
            The columns packed together into a single nested column called
            `name`. If None, every column not in `base_columns` is used.
        on: str or None
            The name of a column to use as the new index. Typically, the index
            should have a unique value per row for base columns, and should
            repeat for nested columns. For example, a dataframe with two
            columns; a=[1,1,1,2,2,2] and b=[5,10,15,20,25,30] would want an
            index like [0,0,0,1,1,1] if a is chosen as a base column. If not
            provided the current index will be used.
        name:
            The name of the output column the `nested_columns` are packed into.

        Returns
        -------
        NestedFrame
            A NestedFrame with the specified nesting structure.

        Examples
        --------

        >>> import nested_pandas as npd
        >>> nf = npd.NestedFrame({"a":[1,1,1,2,2], "b":[2,2,2,4,4],
        ...                   "c":[1,2,3,4,5], "d":[2,4,6,8,10]},
        ...                   index=[0,0,0,1,1])

        >>> npd.NestedFrame.from_flat(nf, base_columns=["a","b"])
           a  b                      nested
        0  1  2  [{c: 1, d: 2}; …] (3 rows)
        1  2  4  [{c: 4, d: 8}; …] (2 rows)
        """

        if on is not None:
            # `on` becomes the index below, so it can no longer be picked as a base column
            if on in base_columns:
                base_columns = [column for column in base_columns if column != on]
            df = df.set_index(on)

        # Keep only the first row seen for every index value
        base_part = df[base_columns]
        base_part = base_part[~df.index.duplicated(keep="first")]

        # The base part must be a NestedFrame before a nested column can be joined
        if not isinstance(base_part, NestedFrame):
            base_part = NestedFrame(base_part)

        # Everything that is not a base column gets nested by default
        if nested_columns is None:
            nested_columns = [column for column in df.columns if column not in base_columns]
        return base_part.join_nested(df[nested_columns], name=name)



[docs]
    @classmethod
    def from_lists(cls, df, base_columns=None, list_columns=None, name="nested"):
        """Creates a NestedFrame with base and nested columns from a dataframe
        with list-valued columns.

        Parameters
        ----------
        df: pd.DataFrame or NestedFrame
            A dataframe with list columns.
        base_columns: list-like, or None
            Any columns that have non-list values in the input df. These will
            simply be kept as identical columns in the result. If None and
            `list_columns` is given, defined as all columns not in
            `list_columns`.
        list_columns: list-like, or None
            The list-value columns that should be packed into a nested column.
            All columns in the list will attempt to be packed into a single
            nested column with the name provided in `name`. If None, is
            defined as all columns not in `base_columns` (all columns when
            `base_columns` is None as well).
        name:
            The name of the output column the `list_columns` are packed into.

        Returns
        -------
        NestedFrame
            A NestedFrame with the specified nesting structure.

        Raises
        ------
        ValueError
            If no list columns remain after resolving the inputs, or if the
            first value of a list column is not iterable.

        Examples
        --------

        >>> import nested_pandas as npd
        >>> nf = npd.NestedFrame({"c":[1,2,3], "d":[2,4,6],
        ...                   "e":[[1,2,3], [4,5,6], [7,8,9]]},
        ...                   index=[0,1,2])

        >>> npd.NestedFrame.from_lists(nf, base_columns=["c","d"])
           c  d                nested
        0  1  2  [{e: 1}; …] (3 rows)
        1  2  4  [{e: 4}; …] (3 rows)
        2  3  6  [{e: 7}; …] (3 rows)

        """

        # Resolve base and list columns
        if list_columns is None:
            if base_columns is None:
                # with no inputs, assume all columns are list-valued
                list_columns = df.columns
            else:
                # with defined base_columns, assume everything else is list
                list_columns = [col for col in df.columns if col not in base_columns]
        elif base_columns is None:
            # if list_columns are defined, assume everything else is base
            base_columns = [col for col in df.columns if col not in list_columns]

        if len(list_columns) == 0:
            raise ValueError("No columns were assigned as list columns.")

        # Pack list columns into a nested column
        if len(df) == 0:
            # if the dataframe is empty, just return an empty nested column
            # since there are no iterable values to pack
            packed = NestedFrame().join_nested(df[list_columns], name=name)
            packed.index.name = df.index.name
        else:
            # Check that each column has iterable elements
            for col in list_columns:
                # Check if the column is iterable based on its first value.
                # This is a simple heuristic but infers more than its dtype
                # which will probably be an object.
                # Strings and bytes define __iter__, so they pass this check.
                sample_val = df[col].iloc[0]
                if not hasattr(sample_val, "__iter__"):
                    raise ValueError(
                        f"Cannot pack column {col} which does not contain an iterable list based "
                        f"on its first value, {sample_val}."
                    )
            packed = pack_lists(df[list_columns])
            packed.name = name

        if base_columns is not None:
            # concat the nested column to the base_column df
            result = pd.concat([df[base_columns], packed], axis=1)
        elif isinstance(packed, pd.DataFrame):
            # the empty-input branch above already produced a frame, which has no to_frame()
            result = packed
        else:
            # or just return the packed series as a frame if no base cols
            result = packed.to_frame()

        # A plain pd.DataFrame input would otherwise leak a plain DataFrame out of concat
        return result if isinstance(result, NestedFrame) else NestedFrame(result)



[docs]
    def drop(
        self,
        labels=None,
        *,
        axis=0,
        index=None,
        columns=None,
        level=None,
        inplace=False,
        errors="raise",
    ):
        """Drop specified labels from rows or columns.

        Remove rows or columns by specifying label names and corresponding
        axis, or by directly specifying index or column names. When using a
        multi-index, labels on different levels can be removed by
        specifying the level. See the `user guide <https://pandas.pydata.org/docs/user_guide
        /advanced.html#advanced-shown-levels>`_ for more information about
        the now unused levels.

        Parameters
        ----------
        labels: single label or list-like
            Index or column labels to drop. A tuple will be used as a single
            label and not treated as a list-like. Nested sub-columns are
            accessed using dot notation (e.g. "nested.col1").
        axis: {0 or ‘index’, 1 or ‘columns’}, default 0
            Whether to drop labels from the index (0 or ‘index’) or
            columns (1 or ‘columns’).
        index: single label or list-like
            Alternative to specifying axis (labels, axis=0 is equivalent to
            index=labels).
        columns: single label or list-like
            Alternative to specifying axis (labels, axis=1 is equivalent to
            columns=labels).
        level: int or level name, optional
            For MultiIndex, level from which the labels will be removed.
        inplace: bool, default False
            If False, return a copy. Otherwise, do operation in place and
            return None.
        errors: {‘ignore’, ‘raise’}, default ‘raise’
            If ‘ignore’, suppress error and only existing labels are dropped.

        Returns
        -------
        NestedFrame or None
            The frame with the specified index or column labels removed,
            or None if inplace=True.

        Examples
        --------

        >>> from nested_pandas.datasets.generation import generate_data
        >>> nf = generate_data(5,5, seed=1)

        >>> # drop the "t" column from "nested"
        >>> nf = nf.drop(["nested.t"], axis=1)
        >>> nf
                  a         b                                             nested
        0  0.417022  0.184677  [{flux: 31.551563, flux_error: 1.0, band: 'r'}...
        1  0.720324  0.372520  [{flux: 68.650093, flux_error: 1.0, band: 'g'}...
        2  0.000114  0.691121  [{flux: 83.462567, flux_error: 1.0, band: 'g'}...
        3  0.302333  0.793535  [{flux: 1.828828, flux_error: 1.0, band: 'g'};...
        4  0.146756  1.077633  [{flux: 75.014431, flux_error: 1.0, band: 'g'}...

        """

        # Column drops require special handling because nested sub-columns
        # ("nested.col1") are not real pandas columns. Both spellings of the
        # column axis (1 and "columns") are routed through this branch.
        if axis in (1, "columns") or columns is not None:
            # label convergence
            if isinstance(labels, str):
                labels = [labels]
            elif columns is not None:
                labels = [columns] if isinstance(columns, str) else columns
                columns = None
                axis = 1

            # Partition the labels in a single pass: nested sub-columns are
            # grouped by their nested column, everything else goes to pandas.
            sub_cols_by_nest: dict[str, list[str]] = {}
            base_labels = []
            for label in labels:
                if self._is_known_hierarchical_column(label):
                    # Split on the first dot only, so that a sub-column name
                    # which itself contains dots is kept intact.
                    nest_name, sub_col = label.split(".", 1)
                    sub_cols_by_nest.setdefault(nest_name, []).append(sub_col)
                else:
                    base_labels.append(label)

            # drop targeted sub-columns for each nested column
            for nest_name, sub_cols in sub_cols_by_nest.items():
                if inplace:
                    self[nest_name] = self[nest_name].nest.drop(sub_cols)
                else:
                    # Rebind self to the new frame so that the base-column
                    # drop below (and the return value) build on this result.
                    self = self.assign(**{nest_name: self[nest_name].nest.drop(sub_cols)})

            # drop remaining base columns
            if len(base_labels) > 0:
                return super().drop(
                    labels=base_labels,
                    axis=axis,
                    index=index,
                    columns=columns,
                    level=level,
                    inplace=inplace,
                    errors=errors,
                )
            return None if inplace else self

        # Otherwise just drop like pandas
        return super().drop(
            labels=labels,
            axis=axis,
            index=index,
            columns=columns,
            level=level,
            inplace=inplace,
            errors=errors,
        )


    def split(
        self,
        nested_col: str,
        by: str,
        values=None,
        drop_by_col: bool = False,
        drop_nested: bool = False,
    ) -> NestedFrame:
        """Break one nested column into several, one per value of a sub-column.

        For every value of the sub-column `by`, the nested column is queried
        for that value and the filtered nest is stored in a new nested column
        named ``{nested_col}_{value}``.

        Parameters
        ----------
        nested_col : str
            Name of the nested column to split.
        by : str
            Name of the sub-column of `nested_col` whose values drive the split.
        values : list or str or None, optional
            Values to split on. When None, every unique value found in `by`
            is used. A string is iterated character by character.
        drop_by_col : bool, default False
            When True, the `by` sub-column is removed from each new nested
            column.
        drop_nested : bool, default False
            When True, the original nested column is removed from the result.

        Returns
        -------
        NestedFrame
            A copy of this frame with the additional ``{nested_col}_{value}``
            columns. When `values` is given explicitly, a value without any
            matching rows still gets a column, filled with None.

        Raises
        ------
        ValueError
            If `nested_col` is not a nested column of this frame, or if `by`
            is not one of its sub-columns.

        Examples
        --------
        >>> from nested_pandas.datasets.generation import generate_data
        >>> nf = generate_data(5, 5, seed=1)
        >>> nf.split("nested", by="band")[["a", "b", "nested_r"]] # doctest: +SKIP

                  a         b                                                  nested_r
        0  0.417022  0.184677    [{t: 8.38389, flux: 31.551563, band: 'r'}; …] (2 rows)
        1  0.720324  0.372520   [{t: 19.365232, flux: 90.85955, band: 'r'}; …] (2 rows)
        2  0.000114  0.691121  [{t: 11.173797, flux: 28.044399, band: 'r'}; …] (3 rows)
        3  0.302333  0.793535               [{t: 2.807739, flux: 78.927933, band: 'r'}]
        4  0.146756  1.077633  [{t: 17.527783, flux: 13.002857, band: 'r'}; …] (2 rows)
        """

        # Validate the two names before doing any work
        if nested_col not in self.nested_columns:
            raise ValueError(
                f"'{nested_col}' is not a nested column. Available nested columns: {self.nested_columns}"
            )

        sub_columns = self[nested_col].nest.columns
        if by not in sub_columns:
            raise ValueError(
                f"'{by}' is not a sub-column of '{nested_col}'. "
                f"Available sub-columns: {list(sub_columns)}"
            )

        by_path = f"{nested_col}.{by}"
        if values is None:
            explicit = False
            targets = self[by_path].unique()
        else:
            explicit = True
            targets = list(values)

        out = self.copy()

        if len(self) == 0:
            # Nothing to query in an empty frame; only explicitly requested
            # values produce (empty) placeholder columns.
            if explicit:
                for val in targets:
                    out[f"{nested_col}_{val}"] = None
        else:
            # String values must be quoted inside the query expression
            needs_quotes = pd.api.types.is_string_dtype(self[by_path])
            for val in targets:
                new_name = f"{nested_col}_{val}"
                literal = f"'{val}'" if needs_quotes else val
                matched = self.query(f"{by_path}=={literal}")
                if matched is None or len(matched) == 0:
                    if explicit:
                        out[new_name] = None
                    continue
                piece = matched[nested_col]
                if drop_by_col:
                    piece = piece.nest.drop(by)
                out[new_name] = piece

        if drop_nested:
            out = out.drop(labels=[nested_col], axis=1)
        return out


[docs]
    def min(self, exclude_nest: bool = False, numeric_only: bool = False, **kwargs):
        """
        Compute the minimum of every column, nested sub-columns included.

        The result is one series. Entries that originate from a nested column
        are labelled ``<nested column>.<sub-column>`` so the source stays visible.

        Only the column-wise minimum (axis=0) is supported; row-wise minima
        (axis=1) are not, since reducing along columns is the primary intended
        behavior for NestedFrame.

        Missing values (NaNs) are skipped by default.

        With `numeric_only=False` (the default), non-numeric columns such as
        strings yield their lexicographically smallest value.

        Parameters
        ----------
        exclude_nest : bool, default False
            When True, nested columns are ignored and only the base columns
            are reduced.
        numeric_only : bool, default False
            Include only float, int, boolean columns.
        **kwargs
            Forwarded to :meth:`pandas.DataFrame.min`; see its documentation
            for the accepted keyword arguments.

        Returns
        -------
        pandas.Series

        Examples
        --------
        >>> from nested_pandas.datasets.generation import generate_data
        >>> nf = generate_data(5,5, seed=1)

        >>> nf_min = nf.min()
        >>> nf_min
        a                    0.000114
        b                    0.184677
        nested.t             0.547752
        nested.flux          1.828828
        nested.flux_error         1.0
        nested.band                 g
        dtype: object

        See Also
        --------
        :meth:`pandas.DataFrame.min`

        """

        nest_names = self.nested_columns
        if not nest_names:
            # No nests: behave exactly like pandas
            return super().min(numeric_only=numeric_only, **kwargs)

        # Reduce the plain (non-nested) columns with the pandas implementation
        flat_names = [name for name in self.columns if name not in nest_names]
        base_min = super().__getitem__(flat_names).min(numeric_only=numeric_only, **kwargs)
        if exclude_nest:
            return base_min

        # An empty base result is left out of the concatenation
        pieces = [] if base_min.empty else [base_min]

        # Flatten each nested column, prefix its sub-columns, then reduce it
        for nest_name in nest_names:
            flat = self[nest_name].explode()
            flat.columns = [f"{nest_name}.{sub}" for sub in flat.columns]
            pieces.append(flat.min(numeric_only=numeric_only, **kwargs))

        return pd.concat(pieces)



[docs]
    def max(self, exclude_nest: bool = False, numeric_only: bool = False, **kwargs):
        """
        Compute the maximum of every column, nested sub-columns included.

        The result is one series. Entries that originate from a nested column
        are labelled ``<nested column>.<sub-column>`` so the source stays visible.

        Only the column-wise maximum (axis=0) is supported; row-wise maxima
        (axis=1) are not, since reducing along columns is the primary intended
        behavior for NestedFrame.

        Missing values (NaNs) are skipped by default.

        With `numeric_only=False` (the default), non-numeric columns such as
        strings yield their lexicographically largest value.

        Parameters
        ----------
        exclude_nest : bool, default False
            When True, nested columns are ignored and only the base columns
            are reduced.
        numeric_only : bool, default False
            Include only float, int, boolean columns.
        **kwargs
            Forwarded to :meth:`pandas.DataFrame.max`; see its documentation
            for the accepted keyword arguments.

        Returns
        -------
        pandas.Series

        Examples
        --------
        >>> from nested_pandas.datasets.generation import generate_data
        >>> nf = generate_data(5,5, seed=1)

        >>> nf_max = nf.max()
        >>> nf_max
        a                     0.720324
        b                     1.077633
        nested.t             19.365232
        nested.flux          98.886109
        nested.flux_error          1.0
        nested.band                  r
        dtype: object

        See Also
        --------
        :meth:`pandas.DataFrame.max`

        """

        nest_names = self.nested_columns
        if not nest_names:
            # No nests: behave exactly like pandas
            return super().max(numeric_only=numeric_only, **kwargs)

        # Reduce the plain (non-nested) columns with the pandas implementation
        flat_names = [name for name in self.columns if name not in nest_names]
        base_max = super().__getitem__(flat_names).max(numeric_only=numeric_only, **kwargs)
        if exclude_nest:
            return base_max

        # An empty base result is left out of the concatenation
        pieces = [] if base_max.empty else [base_max]

        # Flatten each nested column, prefix its sub-columns, then reduce it
        for nest_name in nest_names:
            flat = self[nest_name].explode()
            flat.columns = [f"{nest_name}.{sub}" for sub in flat.columns]
            pieces.append(flat.max(numeric_only=numeric_only, **kwargs))

        return pd.concat(pieces)



[docs]
    def describe(self, exclude_nest: bool = False, percentiles=None, include=None, exclude=None):
        """

        Generate descriptive statistics, including nested columns with prefix to indicate the source.

        Descriptive statistics include those that summarize the central tendency,
        dispersion and shape of a dataset's distribution, excluding NaN values,
        similar to the behavior of `pandas.DataFrame.describe()`.

        Nested columns use `pyarrow` data types for efficiency, which are not always
        directly compatible with pandas' type-based filtering.

        - pyarrow strings are not viewed as object type.
        - numerical types from pyarrow (i.e., int, double) are still matched by pandas'
          `np.number`, so filtering with `include=[np.number]` will include numeric nested columns.

        Parameters
        ----------
        exclude_nest : bool, default False
            If set to True, will exclude the nested structure and
            only computes the statistics over the base columns
        percentiles : list-like of numbers, optional
            The percentiles to include in the output. All should fall between 0 and 1.
            Defaults to [.25, .5, .75].
        include : 'all', list-like of dtypes or None (default), optional
            A white list of data types to include in the output.
        exclude : list-like of dtypes or None (default), optional
            A black list of data types to exclude from the output.

        Returns
        -------
        NestedFrame
            A NestedFrame with the summary statistics.

        Raises
        ------
        ValueError
            If no statistics can be generated from the columns.
            A combined error message will be given.

        Examples
        --------
        >>> from nested_pandas.datasets.generation import generate_data
        >>> nf = generate_data(5,5, seed=1)

        >>> nf_desc = nf.describe()
        >>> nf_desc
                      a         b   nested.t  nested.flux  nested.flux_error
        count  5.000000  5.000000       25.0         25.0               25.0
        mean   0.317310  0.623897  10.095623    45.252724                1.0
        std    0.274904  0.351880   6.434858    30.152261                0.0
        min    0.000114  0.184677   0.547752     1.828828                1.0
        25%    0.146756  0.372520    3.96203    21.162812                1.0
        50%    0.302333  0.691121  10.663306    44.789353                1.0
        75%    0.417022  0.793535  16.014891    69.975836                1.0
        max    0.720324  1.077633  19.365232    98.886109                1.0

        See Also
        --------
        :meth:`pandas.DataFrame.describe`

        """

        # Without nested columns this is exactly the pandas behavior
        if not self.nested_columns:
            return NestedFrame(super().describe(percentiles=percentiles, include=include, exclude=exclude))

        describe_kwargs = {"percentiles": percentiles, "include": include, "exclude": exclude}
        nests_to_check = [] if exclude_nest else list(self.nested_columns)

        result = []  # one describe() frame per successfully described group
        errors = []  # messages for the groups that could not be described

        # Base columns are handled separately from the nests rather than via a
        # sentinel name in a shared list, so a nested column that happens to be
        # called "_base" is still described as a nested column.
        try:
            base_col = [col for col in self.columns if col not in self.nested_columns]
            result.append(super().__getitem__(base_col).describe(**describe_kwargs))
        except ValueError as err:
            # value error caused by no matching type or empty base columns
            errors.append(f"Base columns: {err}")

        for nest_col in nests_to_check:
            nested_df = self[nest_col].explode()
            nested_df.columns = [f"{nest_col}.{col}" for col in nested_df.columns]
            try:
                result.append(nested_df.describe(**describe_kwargs))
            except ValueError as err:
                # value error caused by no matching type for nested columns
                errors.append(f"Nested column '{nest_col}': {err}")

        if not result:
            checked = ["_base", *nests_to_check]
            raise ValueError(f"All columns in {checked} failed.\n" + "\n".join(errors))

        if include is None and exclude is None:
            # try only get the numeric columns and drop the others;
            # if nothing numeric exists at all, keep the unfiltered results
            numeric_only_results = [r.select_dtypes(include=[np.number]) for r in result]
            non_empty_numeric = [r for r in numeric_only_results if not r.empty]
            if non_empty_numeric:
                result = non_empty_numeric

        return NestedFrame(pd.concat(result, axis=1))



[docs]
    def explode(self, column: IndexLabel, ignore_index: bool = False):
        """

        Transform each element of a list-like base column, or each row of a
        nested column, to a row of the frame, replicating index values.

        Parameters
        ----------
        column : IndexLabel
            Column(s) to explode, given as a single str or a list.
            For multiple columns, specify a non-empty list of unique column
            names; all specified columns must have matching lengths of their
            list-like / nested data on each row of the frame.
        ignore_index : bool, default False
            If True, the resulting index will be labeled 0, 1, ..., n - 1.

        Returns
        -------
        NestedFrame
            Frame with the lists and nested rows of the requested columns
            exploded to rows; index will be duplicated for these rows.

        Raises
        ------
        ValueError
            It raises if:
            1) columns of the frame are not unique,
            2) specified columns to explode is an empty list,
            3) specified columns to explode do not have matching counts of
              elements rowwise in the frame,
            4) `column` is neither a str nor a list, or a list with
              duplicated elements,
            5) a specified column does not exist in the frame.

        See Also
        --------
        :meth:`pandas.DataFrame.explode`

        Examples
        --------
        >>> from nested_pandas.datasets.generation import generate_data
        >>> nf = generate_data(3,3, seed=1)

        >>> nf_explode = nf.explode(column="nested")
        >>> nf_explode
                  a         b          t       flux  flux_error band
        0  0.417022  0.604665   3.725204  67.046751         1.0    g
        0  0.417022  0.604665  10.776335  14.038694         1.0    g
        0  0.417022  0.604665   4.089045  96.826158         1.0    g
        1  0.720324  0.293512   6.911215   41.73048         1.0    r
        1  0.720324  0.293512    8.38389  19.810149         1.0    r
        1  0.720324  0.293512  17.562349  31.342418         1.0    g
        2  0.000114  0.184677   7.935349  55.868983         1.0    r
        2  0.000114  0.184677   13.70439  80.074457         1.0    r
        2  0.000114  0.184677   0.547752  69.232262         1.0    g

        """

        # Normalize `column` to a list of names and validate it
        if isinstance(column, str):
            columns = [column]
        elif isinstance(column, list):
            columns = column
            if len(columns) == 0:
                raise ValueError("`column` must not be empty")
            if len(set(columns)) != len(columns):
                raise ValueError("`column` must have unique elements")
        else:
            raise ValueError("`column` must be str or list")
        # Every requested column must exist; the message is singular or plural
        # depending on how many are missing.
        if len(extra_cols := set(columns) - set(self.columns)) > 0:
            if len(extra_cols) == 1:
                raise ValueError(
                    f"column {extra_cols.pop()} not found, available columns: {list(self.columns)}"
                )
            raise ValueError(
                f"columns {sorted(extra_cols)} not found, available columns: {list(self.columns)}"
            )

        # Split the request into nested columns and plain (list-like) base columns
        nested_columns = [col for col in columns if col in self.nested_columns]
        base_columns = [col for col in columns if col not in nested_columns]

        # Shortcut for the base-column-only case
        if len(nested_columns) == 0:
            return NestedFrame(super().explode(columns, ignore_index=ignore_index))

        # Handle duplicated index use-case: use "ordinal" index, but keep the original one as a column to
        # restore it later.
        default_index_name = "__index_"
        index_col_name = self.index.name or default_index_name
        w_ordinal_idx = self.reset_index(drop=False, names=index_col_name)

        # Call pandas.DataFrame.explode for non-nested columns
        all_but_requested_nested_columns = [col for col in w_ordinal_idx.columns if col not in nested_columns]
        base_exploded = w_ordinal_idx[all_but_requested_nested_columns]
        if len(all_but_requested_nested_columns) > 0 and len(base_columns) > 0:
            # Call the pandas implementation explicitly on the sub-frame, then re-wrap as NestedFrame
            base_exploded = super(NestedFrame, base_exploded).explode(base_columns, ignore_index=False)
            base_exploded = NestedFrame(base_exploded)

        # Check if it was actually exploded, or no list-columns were there.
        # This could fail in the case when all lists had one element only, we ignore that edge-case here.
        is_base_exploded = not w_ordinal_idx.index.equals(base_exploded.index)

        # Unnest each requested nested column and store as a "flat" dataframe.
        flat_frames: list[Self] = []  # type: ignore[name-defined] # noqa: F821
        for nested_col in nested_columns:
            # Check if counts (lengths) in nested columns mismatch
            # (every nested column after the first is compared against the first one)
            if len(flat_frames) > 0 and np.any(
                w_ordinal_idx[nested_col].nest.len() != w_ordinal_idx[nested_columns[0]].nest.len()
            ):
                raise ValueError(
                    f"One or few rows of {nested_col} have different element counts from {nested_columns[0]}"
                )
            flat = w_ordinal_idx[nested_col].explode()
            # Check if counts (lengths) of this nested column mismatch with one of the list columns.
            if is_base_exploded and not base_exploded.index.equals(flat.index):
                raise ValueError(
                    f"One or few rows of {nested_col} have different element counts "
                    f"from one or few of these columns: {base_columns}"
                )
            flat_frames.append(flat)

        if is_base_exploded:
            # Indexes were verified to be equal above, so the frames are placed side by side
            result = pd.concat([base_exploded] + flat_frames, axis=1)
        else:
            # Join works here, because we used the ordinal index before exploding
            result = base_exploded.join(pd.concat(flat_frames, axis=1))

        if ignore_index:
            # Discard the saved original index and number the rows 0..n-1
            return result.drop(index_col_name, axis=1).reset_index(drop=True)
        # Restore original index
        result = result.set_index(index_col_name, drop=True)
        # The placeholder name stands for an originally unnamed index
        if result.index.name == default_index_name:
            result.index.name = None
        return result



[docs]
    def fillna(
        self,
        value: Hashable | Mapping | pd.Series | pd.DataFrame | None = None,
        *,
        axis: Axis | None = None,
        inplace: bool = False,
        limit: int | None = None,
    ) -> NestedFrame | None:
        """
        Fill NA/NaN values using the specified method for base and nested columns.

        Parameters
        ----------
        value : scalar, dict, Series, or DataFrame
            Value to use to fill holes (e.g. 0), alternately a
            dict/Series/DataFrame of values specifying which value to use for
            each column.  Values not in the dict/Series/DataFrame will not be filled.
            This value cannot be a list. In a dict, sub-columns of a nested
            column are addressed with dot notation (e.g. "nested.col1").
        axis : {axes_single_arg}, default None
            Axis along which to fill missing values.
        inplace : bool, default False
            If True, fill in-place. Note: this will modify any
            other views on this object (e.g., a no-copy slice for a column in a
            NestedFrame).
        limit : int, default None
            The maximum number of entries along the entire axis where NaNs will be
            filled. Must be greater than 0 if not None. Currently, limit on nested
            columns is not supported, meaning that all Nans will be filled (if there
            is a value specified) regardless of the input.

        Returns
        -------
        NestedFrame or None
            NestedFrame with missing values filled or None if ``inplace=True``.

        See Also
        --------
        :meth:`pandas.DataFrame.fillna`

        Examples
        --------
        >>> import nested_pandas as npd
        >>> nf = npd.NestedFrame(
        ...     data={"a": [np.nan, 20, np.nan], "b": [np.nan, np.nan, 30], "c": [10, np.nan, np.nan]},
        ...     index=[0, 1, 2]
        ... )
        >>> nested = pd.DataFrame(
        ...     data={"d": [np.nan, np.nan, np.nan], "e": [np.nan, 1, np.nan]},
        ...     index=[0, 1, 2]
        ... )
        >>> nf = nf.join_nested(nested, "nested")

        >>> nf.fillna(0)
              a     b     c              nested
        0   0.0   0.0  10.0  [{d: 0.0, e: 0.0}]
        1  20.0   0.0   0.0  [{d: 0.0, e: 1.0}]
        2   0.0  30.0   0.0  [{d: 0.0, e: 0.0}]

        """

        if not self.nested_columns:
            return super().fillna(value=value, axis=axis, inplace=inplace, limit=limit)

        # Fill the base columns with plain pandas; the result is always a new
        # frame, the in-place update (if requested) happens once at the end.
        base_cols = [col for col in self.columns if col not in self.nested_columns]
        filled_df = super().__getitem__(base_cols).fillna(value=value, axis=axis, inplace=False, limit=limit)

        for nest_col in self.nested_columns:
            nested_df = self[nest_col].explode()
            nested_value: Any
            if isinstance(value, Mapping):
                # Keep only the entries addressed to this nest and strip the
                # "<nest>." prefix. Non-string keys (e.g. integer labels of
                # base columns) can never address a sub-column, so skip them
                # instead of failing on a missing `startswith`.
                prefix = f"{nest_col}."
                nested_value = {
                    key[len(prefix) :]: fill
                    for key, fill in value.items()
                    if isinstance(key, str) and key.startswith(prefix)
                }
            else:
                nested_value = value
            # limit is not supported on nested columns, so it is not forwarded
            nested_df = nested_df.fillna(value=nested_value, axis=axis, inplace=False, limit=None)
            filled_df = filled_df.join_nested(nested_df, nest_col)

        if inplace:
            self._update_inplace(filled_df)
            return None
        return filled_df



[docs]
    def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
        """Evaluate a string describing operations on NestedFrame columns.

        Operates on columns only, not specific rows or elements.  This allows
        `eval` to run arbitrary code, which can make you vulnerable to code
        injection if you pass user input to this function.

        Works the same way as `pd.DataFrame.eval`, except that this method
        will also automatically unpack nested columns into NestedSeries,
        and the resulting expression will have the dimensions of the unpacked
        series.

        Parameters
        ----------
        expr : str
            The expression string to evaluate.
        inplace : bool, default False
            If the expression contains an assignment, whether to perform the
            operation inplace and mutate the existing NestedFrame. Otherwise,
            a new NestedFrame is returned.
        **kwargs
            See the documentation for :meth:`pandas.DataFrame.eval` for
            complete details on the keyword arguments accepted by :meth:`eval`.
            Any ``resolvers`` passed here are kept; a resolver for the nested
            columns is appended after them. ``parser`` is always overridden
            with "nested-pandas".

        Returns
        -------
        ndarray, scalar, pandas object, nested-pandas object, or None
            The result of the evaluation or None if ``inplace=True``.

        See Also
        --------
        :meth:`pandas.DataFrame.eval`

        """
        _, aliases = _identify_aliases(expr)
        # While set, `_aliases` marks that an evaluation is in progress
        self._aliases: dict[str, str] | None = aliases

        # Keep a handle on the nest resolver so the metadata backstop below
        # reads from it even when the caller supplied resolvers of their own.
        nest_resolver = _NestResolver(self)
        kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + (nest_resolver,)
        kwargs["inplace"] = inplace
        kwargs["parser"] = "nested-pandas"

        try:
            answer = super().eval(expr, **kwargs)

            # If the result is a _SeriesFromNest, set the metadata manually
            # This is a bit of a hack, as it's a backstop for super().eval()
            # not propagating the metadata correctly, `for some reason`.
            # Furthermore, it relies on the assumption that the first nest
            # seen by the resolver is the only one that matters. Because we
            # disallow multi-layer queries, this is potentially safe, though
            # eval statements that target multiple nests may have strange behavior.
            if isinstance(answer, _SeriesFromNest) and not hasattr(answer, "nest_name"):
                nest_key = list(nest_resolver.keys())[0]
                answer.nest_name = nest_resolver[nest_key]._nest_name
                answer.flat_nest = nest_resolver[nest_key]._flat_nest
        finally:
            # Always clear the marker, also when the evaluation raised, so a
            # failed eval does not leave the frame in the "in progress" state.
            self._aliases = None
        return answer


    def extract_nest_names(
        self,
        expr: str,
        local_dict=None,
        global_dict=None,
        resolvers=(),
        level: int = 0,
        target=None,
        **kwargs,
    ) -> set[str]:
        """
        Given a string expression, parse it and visit the resulting expression tree,
        surfacing the nesting types.  The purpose is to identify expressions that attempt
        to mix base and nested columns, or columns from two different nests.

        Parameters
        ----------
        expr : str
            The expression string to parse.
        local_dict, global_dict : dict or None, optional
            Variable scopes handed to the expression environment.
        resolvers : iterable of mappings, optional
            Additional resolvers; the nest, column and index resolvers of this
            frame are appended after them.
        level : int, default 0
            Stack level offset handed to the expression environment.
        target : object, optional
            Handed to the expression environment unchanged.
        **kwargs
            Accepted but not used by this method.

        Returns
        -------
        set of str
            The keys under which the sub-expressions of `expr` were grouped.
        """
        index_resolvers = self._get_index_resolvers()
        column_resolvers = self._get_cleaned_column_resolvers()
        # Normalize to a tuple (as eval() does) so that a list of resolvers
        # can be extended as well.
        resolvers = tuple(resolvers) + (_NestResolver(self), column_resolvers, index_resolvers)
        # Parser needs to be the "nested-pandas" parser.
        # We also need the same variable context that eval() will have, so that
        # backtick-quoted names are substituted as expected.
        env = ensure_scope(
            level + 1,
            global_dict=global_dict,
            local_dict=local_dict,
            resolvers=resolvers,
            target=target,
        )
        parsed_expr = Expr(expr, parser="nested-pandas", env=env)
        expr_tree = parsed_expr.terms
        separable = _subexprs_by_nest([], expr_tree)
        return set(separable.keys())


[docs]
    def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame | None:
        """Filter a NestedFrame with a boolean expression. The expression may
        target the base columns or the columns of a single nested structure.

        Parameters
        ----------
        expr : str
            The query string to evaluate.

            Access nested columns using `nested_df.nested_col` (where
            `nested_df` refers to a particular nested dataframe and
            `nested_col` is a column of that nested dataframe).

            You can refer to variables
            in the environment by prefixing them with an '@' character like
            ``@a + b``.

            You can refer to column names that are not valid Python variable names
            by surrounding them in backticks. Thus, column names containing spaces
            or punctuations (besides underscores) or starting with digits must be
            surrounded by backticks. (For example, a column named "Area (cm^2)" would
            be referenced as ```Area (cm^2)```). Column names which are Python keywords
            (like "list", "for", "import", etc) cannot be used.

            For example, if one of your columns is called ``a a`` and you want
            to sum it with ``b``, your query should be ```a a` + b``.

        inplace : bool
            Whether to modify the DataFrame rather than creating a new one.
        **kwargs
            See the documentation for :meth:`pandas.DataFrame.query`
            for complete details on the keyword arguments accepted by
            :meth:`query`. Note that ``level`` is incremented by one and
            ``target`` is always reset to None before evaluation.

        Returns
        -------
        NestedFrame or None
            NestedFrame resulting from the provided query expression, or None
            when ``inplace=True``.

        Raises
        ------
        ValueError
            If `expr` is not a string, or if the expression refers to more than
            one layer (base and a nest, or two different nests).

        Examples
        --------

        >>> from nested_pandas.datasets.generation import generate_data
        >>> nf = generate_data(5,5, seed=1)

        >>> nf = nf.query("nested.t > 10")
        >>> nf
                  a         b                                             nested
        0  0.417022  0.184677  [{t: 13.40935, flux: 98.886109, flux_error: 1....
        1  0.720324  0.372520  [{t: 13.70439, flux: 68.650093, flux_error: 1....
        2  0.000114  0.691121  [{t: 11.173797, flux: 28.044399, flux_error: 1...
        3  0.302333  0.793535  [{t: 17.562349, flux: 1.828828, flux_error: 1....
        4  0.146756  1.077633  [{t: 17.527783, flux: 13.002857, flux_error: 1...

        Most of the Series and NestedSeries attributes and methods are available
        through the query interface. For example, to query based on the length
        of the nested frames, you can do:

        >>> nf = nf.query("nested.len() > 2")
        >>> nf
                  a         b                                             nested
        0  0.417022  0.184677  [{t: 13.40935, flux: 98.886109, flux_error: 1....
        3  0.302333  0.793535  [{t: 17.562349, flux: 1.828828, flux_error: 1....
        4  0.146756  1.077633  [{t: 17.527783, flux: 13.002857, flux_error: 1...

        See Also
        --------
        :meth:`pandas.DataFrame.query`

        Notes
        -----
        Queries that target a particular nested structure return a dataframe
        with rows of that particular nested structure filtered. For example,
        querying the NestedFrame "nf" with nested structure "mynested" as
        below will return all rows of nf, but with mynested filtered by the
        condition: `nf.query("mynested.a > 2")`

        """
        if not isinstance(expr, str):
            raise ValueError(f"expr must be a string to be evaluated, {type(expr)} given")

        # The preflight and the evaluation below are both called directly from
        # this frame, so one extra stack level covers each of them.
        kwargs.update(level=kwargs.pop("level", 0) + 1, target=None)

        # An expression has to live entirely inside one nest, or entirely in the
        # base columns; mixing layers is unsupported, so reject it up front.
        if len(self.extract_nest_names(expr, **kwargs)) > 1:
            raise ValueError("Queries cannot target multiple structs/layers, write a separate query for each")

        mask = self.eval(expr, **kwargs)

        if not isinstance(mask, _SeriesFromNest):
            # Only base attributes were involved: ordinary row selection.
            filtered = self.loc[mask]
        else:
            # A nested attribute was referenced, so evaluation unpacked the nest.
            # Filter the flat rows of that nest and pack them back.
            nest_name = mask.nest_name
            # "Ordinal" index such as [0, 0, 0, 1, 1, 2, 2, 2]
            ordinal_index = self[nest_name].array.get_list_index()
            flat_rows = mask.flat_nest.set_index(ordinal_index)
            flat_mask = mask.set_axis(ordinal_index)
            filtered = self._set_filtered_flat_df(nest_name, flat_rows[flat_mask])

        if not inplace:
            return filtered
        self._update_inplace(filtered)
        return None


    def _set_filtered_flat_df(self, nest_name, flat_df):
        """Replace a nested column with the packed form of a filtered flat frame.

        `flat_df` is expected to carry a filtered "ordinal" index,
        e.g. flat_df.index == [0, 2, 2, 2], whereas self.index may be
        arbitrary (e.g. ["a", "b", "a"]) and
        self[nest_name].array.list_index is [0, 0, 1, 1, 1, 2, 2, 2, 2].

        Returns a new frame that keeps this frame's index; `self` is not modified.
        """
        packed = pack_sorted_df_into_struct(flat_df, name=nest_name)
        # Assign against a 0..n-1 index, then put the original index back.
        ordinal_df = self.reset_index(drop=True)
        ordinal_df[nest_name] = packed
        return ordinal_df.set_index(self.index)

    def _resolve_dropna_target(self, on_nested, subset):
        """Resolve the target layer for a given set of dropna kwargs.

        Parameters
        ----------
        on_nested : str or bool
            Name of the nested column to operate on, or a falsy value for none.
        subset : column label, sequence of labels, or None
            Labels to consider. String labels of the form ``"layer.field"``
            refer to a nested layer; every other label (including non-string
            labels) is treated as a base column.

        Returns
        -------
        tuple
            ``(target, subset)`` where `target` is ``"base"`` or the name of a
            nested column, and `subset` is None or the labels as a list.

        Raises
        ------
        ValueError
            If a referenced layer is not a nested column, if the labels point
            to more than one layer, or if `on_nested` and `subset` disagree.
        """

        nested_cols = self.nested_columns

        # Normalise to a list up front: a scalar label (string or not) becomes a
        # one-element list, and any other list-like is materialised so that
        # truthiness and iteration below are well defined.
        if subset is not None:
            subset = list(subset) if is_list_like(subset) else [subset]

        # first check the subset kwarg input
        subset_target = []
        if subset:
            for label in subset:
                # Non-string labels, and strings without a ".", are base columns
                if not isinstance(label, str) or "." not in label:
                    subset_target.append("base")
                    continue
                # Only the first "." separates the layer from the field name
                layer = label.split(".", 1)[0]
                if layer not in nested_cols:
                    raise ValueError(f"layer '{layer}' not found in the base columns")
                subset_target.append(layer)

            # Check for 1 target
            subset_target = np.unique(subset_target)
            if len(subset_target) > 1:  # prohibit multi-target operations
                raise ValueError(
                    f"Targeted multiple nested structures ({subset_target}), write one command per target dataframe"  # noqa
                )
            subset_target = str(subset_target[0])

        # Next check the on_nested kwarg input
        if on_nested and on_nested not in nested_cols:
            raise ValueError("Provided nested layer not found in nested dataframes")

        # Resolve target layer
        target = "base"
        if on_nested and subset_target:
            if on_nested != subset_target:
                raise ValueError(
                    f"Provided on_nested={on_nested}, but subset columns are from {subset_target}. Make sure these are aligned or just use subset."  # noqa
                )
            target = subset_target
        elif on_nested:
            target = str(on_nested)
        elif subset_target:
            target = str(subset_target)
        return target, subset


[docs]
    def dropna(
        self,
        *,
        axis: Axis = 0,
        how: AnyAll | lib.NoDefault = no_default,
        thresh: int | lib.NoDefault = no_default,
        on_nested: bool = False,
        subset: IndexLabel | None = None,
        inplace: bool = False,
        ignore_index: bool = False,
    ) -> NestedFrame | None:
        """
        Remove missing values for one layer of the NestedFrame.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Determine if rows or columns which contain missing values are
            removed.

            * 0, or 'index' : Drop rows which contain missing values.
            * 1, or 'columns' : Drop columns which contain missing value.

            Only a single axis is allowed.

        how : {'any', 'all'}, default 'any'
            Determine if row or column is removed from DataFrame, when we have
            at least one NA or all NA.

            * 'any' : If any NA values are present, drop that row or column.
            * 'all' : If all values are NA, drop that row or column.
        thresh : int, optional
            Require that many non-NA values. Cannot be combined with how.
        on_nested : str or bool, optional
            If not False, applies the call to the nested dataframe in the
            column with label equal to the provided string. If specified,
            the nested dataframe should align with any columns given in
            `subset`.
        subset : column label or sequence of labels, optional
            Labels along other axis to consider, e.g. if you are dropping rows
            these would be a list of columns to include.

            Access nested columns using `nested_df.nested_col` (where
            `nested_df` refers to a particular nested dataframe and
            `nested_col` is a column of that nested dataframe).
        inplace : bool, default False
            Whether to modify the DataFrame rather than creating a new one.
        ignore_index : bool, default ``False``
            If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
            Not supported when the operation targets a nested column.

            .. versionadded:: 2.0.0

        Returns
        -------
        DataFrame or None
            DataFrame with NA entries dropped from it or None if ``inplace=True``.

        Raises
        ------
        ValueError
            If ``ignore_index=True`` is combined with a nested target.

        Examples
        --------

        A common use case for `dropna` is to remove empty nested rows:

        >>> from nested_pandas.datasets.generation import generate_data
        >>> nf = generate_data(5,5, seed=1)

        >>> # this query empties several of the nested dataframes
        >>> nf = nf.query("nested.t > 19")
        >>> nf
                  a         b                                             nested
        0  0.417022  0.184677                                               None
        1  0.720324  0.372520  [{t: 19.365232, flux: 90.85955, flux_error: 1....
        2  0.000114  0.691121  [{t: 19.157791, flux: 14.672857, flux_error: 1...
        3  0.302333  0.793535                                               None
        4  0.146756  1.077633                                               None


        >>> # dropna removes rows with those emptied dataframes
        >>> nf.dropna(subset="nested")
                  a         b                                             nested
        1  0.720324  0.372520  [{t: 19.365232, flux: 90.85955, flux_error: 1....
        2  0.000114  0.691121  [{t: 19.157791, flux: 14.672857, flux_error: 1...


        `dropna` can also be used on nested columns:

        >>> nf = generate_data(5,5, seed=1)
        >>> # Either on the whole dataframe
        >>> nf.dropna(on_nested="nested")
                  a         b                                             nested
        0  0.417022  0.184677  [{t: 8.38389, flux: 31.551563, flux_error: 1.0...
        1  0.720324  0.372520  [{t: 13.70439, flux: 68.650093, flux_error: 1....
        2  0.000114  0.691121  [{t: 4.089045, flux: 83.462567, flux_error: 1....
        3  0.302333  0.793535  [{t: 17.562349, flux: 1.828828, flux_error: 1....
        4  0.146756  1.077633  [{t: 0.547752, flux: 75.014431, flux_error: 1....
        >>> # or on a specific nested column
        >>> nf.dropna(subset="nested.t")
                  a         b                                             nested
        0  0.417022  0.184677  [{t: 8.38389, flux: 31.551563, flux_error: 1.0...
        1  0.720324  0.372520  [{t: 13.70439, flux: 68.650093, flux_error: 1....
        2  0.000114  0.691121  [{t: 4.089045, flux: 83.462567, flux_error: 1....
        3  0.302333  0.793535  [{t: 17.562349, flux: 1.828828, flux_error: 1....
        4  0.146756  1.077633  [{t: 0.547752, flux: 75.014431, flux_error: 1....

        Notes
        -----
        Operations that target a particular nested structure return a dataframe
        with rows of that particular nested structure affected.

        Values for `on_nested` and `subset` should be consistent in pointing
        to a single layer, multi-layer operations are not supported.
        """

        # Work out which layer the call is aimed at
        target, subset = self._resolve_dropna_target(on_nested, subset)

        # Base layer: plain pandas behaviour
        if target == "base":
            return super().dropna(
                axis=axis,
                how=how,
                thresh=thresh,
                subset=subset,
                inplace=inplace,
                ignore_index=ignore_index,
            )

        if ignore_index:
            raise ValueError("ignore_index is not supported for nested columns")

        # Inside the flat frame the fields are addressed without the layer prefix
        flat_subset = None if subset is None else [label.split(".")[-1] for label in subset]

        # Flatten the nest onto its ordinal index and drop there. The flat frame
        # is a local temporary, so a single non-inplace call covers both modes.
        nested_series = self[target]
        flat = nested_series.explode().set_index(nested_series.array.get_list_index())
        flat = flat.dropna(
            axis=axis,
            how=how,
            thresh=thresh,
            subset=flat_subset,
            inplace=False,
        )

        result = self._set_filtered_flat_df(nest_name=target, flat_df=flat)
        if not inplace:
            return result
        self._update_inplace(result)
        return None



[docs]
    def sort_values(
        self,
        by,
        *,
        axis=0,
        ascending=True,
        inplace=False,
        kind="quicksort",
        na_position="last",
        ignore_index=False,
        key=None,
    ):
        """
        Sort by the values along either axis.

        Parameters
        ----------
        by : str or list of str
            Name or list of names to sort by.

            Access nested columns using `nested_df.nested_col` (where
            `nested_df` refers to a particular nested dataframe and
            `nested_col` is a column of that nested dataframe).
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Axis to be sorted.
        ascending : bool or sequence of bool, default True
            Sort ascending vs. descending. Specify a sequence for multiple sort
            orders. If this is a sequence of bools, must match the length of the
            by.
        inplace : bool, default False
            If True, perform operation in-place.
        kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'
            Choice of sorting algorithm. See also numpy.sort for more
            information. mergesort is the only stable algorithm. For DataFrames,
            this option is only applied when sorting on a single column or label.
        na_position : {'first', 'last'}, default 'last'
            Puts NaNs at the beginning if first; last puts NaNs at the end.
        ignore_index : bool, default False
            If True, the resulting axis will be labeled 0, 1, …, n - 1.
            Always False when applied to nested layers.
        key : callable, optional
            Apply the key function to the values before sorting.

        Returns
        -------
        DataFrame or None
            DataFrame with sorted values if inplace=False, None otherwise.

        Raises
        ------
        ValueError
            If the `by` columns refer to more than one layer.

        Examples
        --------
        >>> from nested_pandas.datasets.generation import generate_data
        >>> nf = generate_data(5,5, seed=1)

        >>> # Sort nested values
        >>> nf.sort_values(by="nested.band")
                  a         b                                             nested
        0  0.417022  0.184677  [{t: 13.40935, flux: 98.886109, flux_error: 1....
        1  0.720324  0.372520  [{t: 13.70439, flux: 68.650093, flux_error: 1....
        2  0.000114  0.691121  [{t: 4.089045, flux: 83.462567, flux_error: 1....
        3  0.302333  0.793535  [{t: 17.562349, flux: 1.828828, flux_error: 1....
        4  0.146756  1.077633  [{t: 0.547752, flux: 75.014431, flux_error: 1....
        """

        # Resolve target layer
        if isinstance(by, str):
            by = [by]
        # Check "by" columns for hierarchical references
        layers = np.unique(
            [col.split(".")[0] if self._is_known_hierarchical_column(col) else "base" for col in by]
        )

        # Ensure one target layer, preventing multi-layer operations
        if len(layers) > 1:
            raise ValueError("Queries cannot target multiple structs/layers, write a separate query for each")
        # An empty `by` names no layer at all; let pandas handle it on the base frame
        target = str(layers[0]) if len(layers) else "base"

        # Apply pandas sort_values
        if target == "base":
            return super().sort_values(
                by=by,
                axis=axis,
                ascending=ascending,
                inplace=inplace,
                kind=kind,
                na_position=na_position,
                ignore_index=ignore_index,
                key=key,
            )

        # target is a nested column: sort its flat representation
        nested_series = self[target]
        target_flat = nested_series.explode().set_index(nested_series.array.get_list_index())

        if target_flat.index.name is None:  # set name if not present
            target_flat.index.name = "index"
        # Index must always be the first sort key for nested columns
        nested_by = [target_flat.index.name] + [col.split(".")[-1] for col in by]

        # Augment the ascending kwarg to include the index, which is always
        # sorted ascending. Any sequence (list, tuple, array) gets the index
        # entry prepended; a scalar is expanded to one entry per `by` column so
        # that it can never be applied to the index key.
        if is_list_like(ascending):
            ascending = [True, *ascending]
        else:
            ascending = [True] + [ascending] * len(by)

        # NOTE(review): `key` is forwarded unchanged, so pandas presumably applies
        # it to the index sort key as well -- confirm it cannot reorder rows
        # across nests.
        target_flat = target_flat.sort_values(
            by=nested_by,
            axis=axis,
            ascending=ascending,
            kind=kind,
            na_position=na_position,
            ignore_index=False,
            key=key,
            inplace=False,
        )

        #  Could be optimized, as number of rows doesn't change
        new_df = self._set_filtered_flat_df(nest_name=target, flat_df=target_flat)

        if inplace:
            self._update_inplace(new_df)
            return None
        return new_df


    @deprecated(
        version="0.6.0",
        reason="`reduce` will be removed in version 0.7.0, use `map_rows` instead.",
    )
    def reduce(self, func, *args, infer_nesting=True, append_columns=False, **kwargs) -> NestedFrame:  # type: ignore[override]
        """
        Takes a function and applies it to each top-level row of the NestedFrame.

        The user may specify which columns the function is applied to, with
        columns from the 'base' layer being passed to the function as
        scalars and columns from the nested layers being passed as numpy arrays.

        Parameters
        ----------
        func : callable
            Function to apply to each nested dataframe. The first arguments to `func` should be which
            columns to apply the function to. See the Notes for recommendations
            on writing func outputs.
        args : positional arguments
            A list of string column names to pull from the NestedFrame to pass along
            to the function. If the function has additional arguments, pass them as
            keyword arguments (e.g. `arg_name=value`).
        infer_nesting : bool, default True
            If True, the function will pack output columns into nested
            structures based on column names adhering to a nested naming
            scheme. E.g. "nested.b" and "nested.c" will be packed into a column
            called "nested" with columns "b" and "c". If False, all outputs
            will be returned as base columns.
        append_columns : bool, default False
            if True, the output columns should be appended to those in the original NestedFrame.
        kwargs : keyword arguments, optional
            Keyword arguments to pass to the function.

        Returns
        -------
        `NestedFrame`
            `NestedFrame` with the results of the function applied to the columns of the frame.

        Raises
        ------
        TypeError
            If any positional argument is not a string.
        ValueError
            If a positional argument does not name a known column, or if no
            columns are given at all.

        Examples
        --------

        >>> from nested_pandas.datasets.generation import generate_data
        >>> import numpy as np
        >>> nf = generate_data(5,5, seed=1)
        >>>
        >>> # define a custom user function
        >>> # reduce will return a NestedFrame with two columns
        >>> def example_func(base_col, nested_col):
        ...     return {
        ...         "mean": np.mean(nested_col),
        ...         "mean_minus_base": np.mean(nested_col) - base_col,
        ...     }
        >>>
        >>> # apply the function
        >>> nf.reduce(example_func, "a", "nested.t")
                mean  mean_minus_base
        0  11.533440        11.116418
        1  10.307751         9.587426
        2   8.294042         8.293928
        3   9.655291         9.352958
        4  10.687591        10.540836

        You may want the result of a `reduce` call to have nested structure,
        we can achieve this by using the `infer_nesting` kwarg:

        >>> # define a custom user function that returns nested structure
        >>> def example_func(base_col1, base_col2, nested_col):
        ...    '''reduce will return a NestedFrame with nested structure'''
        ...    return {"offsets.t_a": nested_col - base_col1,
        ...            "offsets.t_b": nested_col - base_col2}

        By giving both output columns the prefix "offsets.", we signal
        to reduce to infer that these should be packed into a nested column
        called "offsets".

        >>> # apply the function with `infer_nesting` (True by default)
        >>> nf.reduce(example_func, "a", "b", "nested.t")
                                                  offsets
        0    [{t_a: 7.966868, t_b: 8.199213}; …] (5 rows)
        1   [{t_a: 12.984066, t_b: 13.33187}; …] (5 rows)
        2    [{t_a: 4.088931, t_b: 3.397924}; …] (5 rows)
        3  [{t_a: 17.260016, t_b: 16.768814}; …] (5 rows)
        4   [{t_a: 0.400996, t_b: -0.529882}; …] (5 rows)

        Notes
        -----
        By default, `reduce` will produce a `NestedFrame` with enumerated
        column names for each returned value of the function. For more useful
        naming, it's recommended to have `func` return a dictionary where each
        key is an output column of the dataframe returned by `reduce` (as
        shown above).
        """
        # Every positional argument must name a column of this frame; anything
        # else the user function needs has to be supplied through **kwargs.
        requested_columns = []
        for arg in args:
            if not isinstance(arg, str):
                raise TypeError(
                    f"Received an argument '{arg}' that is not a string. "
                    "All arguments to `reduce` must be strings corresponding to"
                    " column names to pass along to the function. If your function"
                    " has additional arguments, pass them as kwargs (arg_name=value)."
                )
            components = self._parse_hierarchical_components(arg)
            if not self._is_known_column(components):
                raise ValueError(
                    f"Received a string argument '{arg}' that was not found in the columns list. "
                    "All arguments to `reduce` must be strings corresponding to"
                    " column names to pass along to the function. If your function"
                    " has additional arguments, pass them as kwargs (arg_name=value)."
                )
            layer = "base" if len(components) < 2 else components[0]
            requested_columns.append((layer, components[-1]))

        if not requested_columns:
            raise ValueError("No columns in `*args` specified to apply function to")

        # One iterator per requested column: base columns yield one value per
        # row, nested fields yield one list of values per row.
        iterators = [
            self[col] if layer == "base" else self[layer].array.iter_field_lists(col)
            for layer, col in requested_columns
        ]

        results = [func(*cols, **kwargs) for cols in zip(*iterators, strict=True)]
        results_nf = NestedFrame(results, index=self.index)

        if infer_nesting:
            # find potential nested structures from columns
            nested_cols = list(
                np.unique(
                    [
                        column.split(".", 1)[0]
                        for column in results_nf.columns
                        if isinstance(column, str) and "." in column
                    ]
                )
            )

            # pack results into nested structures
            for layer in nested_cols:
                prefix = f"{layer}."
                # Result columns may carry non-string labels (e.g. enumerated
                # outputs); only string labels can belong to a layer.
                in_layer = [isinstance(col, str) and col.startswith(prefix) for col in results_nf.columns]
                layer_cols = [col for col, member in zip(results_nf.columns, in_layer) if member]
                other_cols = [col for col, member in zip(results_nf.columns, in_layer) if not member]
                rename_df = results_nf[layer_cols].rename(columns=lambda x: x.split(".", 1)[1])
                nested_col = pack_lists(rename_df, name=layer)
                results_nf = results_nf[other_cols]
                results_nf[layer] = nested_col

        if append_columns:
            # Append the results to the original NestedFrame
            return pd.concat([self, results_nf], axis=1)

        # Otherwise, return the results as a new NestedFrame
        return results_nf

    def _apply_njit_map_rows(self, requested_columns, func):
        """
        Apply njit map_rows to njit custom function with requested_columns.
        Currently only supports 1 or 2 arguments custom function.

        Parameters
        ----------
        requested_columns : sequence of (layer, column) pairs
            One or two pairs, where `layer` is ``"base"`` for a base column or
            the name of a nested column otherwise.
        func : callable
            The user function, handed on to the matching helper in
            ``njit_funcs`` (presumably it must be numba-compilable -- confirm
            against ``njit_funcs``).

        Returns
        -------
        list
            The helper's result converted with ``.tolist()``.

        Raises
        ------
        ImportError
            If numba is not installed.
        ValueError
            If the number of requested columns is not 1 or 2.
        """
        try:
            import numba  # noqa
        except ImportError as err:
            raise ImportError(
                "njit=True requires numba, please install with pip install numba "
                "or conda install conda-forge::numba"
            ) from err

        from . import njit_funcs

        n_columns = len(requested_columns)
        if n_columns not in (1, 2):
            # Fail loudly rather than indexing past the end or silently
            # dropping every column after the second.
            raise ValueError(f"njit=True supports 1 or 2 requested columns, but {n_columns} were given")

        def _base_values(col_name):
            """Return a base column as a numpy array."""
            return np.asarray(self[col_name])

        def _nested_values(layer, col_name):
            """Return (list offsets, flat field values) of a nested field as numpy arrays."""
            nested_series = self[layer]
            offsets = np.asarray(nested_series.array.list_offsets)
            values = np.asarray(nested_series[col_name])
            return offsets, values

        if n_columns == 1:
            layer, col_name = requested_columns[0]
            if layer == "base":
                results = njit_funcs._map_rows_njit1_base(func, _base_values(col_name))
            else:
                offsets, nested_col = _nested_values(layer, col_name)
                results = njit_funcs._map_rows_njit1_nested(func, offsets, nested_col)
        else:
            # 2 requested columns for 2-arg custom function
            (layer1, col1_name), (layer2, col2_name) = requested_columns

            if layer1 == "base" and layer2 == "base":
                base_col1 = _base_values(col1_name)
                base_col2 = _base_values(col2_name)
                results = njit_funcs._map_rows_njit2_base_base(func, base_col1, base_col2)
            elif layer1 == "base":
                base_col1 = _base_values(col1_name)
                offsets, col2 = _nested_values(layer2, col2_name)
                results = njit_funcs._map_rows_njit2_base_nest(func, base_col1, offsets, col2)
            elif layer2 == "base":
                offsets, col1 = _nested_values(layer1, col1_name)
                base_col2 = _base_values(col2_name)
                results = njit_funcs._map_rows_njit2_nest_base(func, offsets, col1, base_col2)
            else:
                offsets1, col1 = _nested_values(layer1, col1_name)
                offsets2, col2 = _nested_values(layer2, col2_name)
                results = njit_funcs._map_rows_njit2_nest_nest(func, offsets1, offsets2, col1, col2)

        return results.tolist()


[docs]
    def map_rows(
        self,
        func: Callable[..., Any],
        columns: None | str | list[str] = None,
        *,
        row_container: Literal["dict"] | Literal["args"] = "dict",
        output_names: None | str | list[str] = None,
        infer_nesting: bool = True,
        append_columns: bool = False,
        njit: bool = False,
        **kwargs,
    ) -> NestedFrame:  # type: ignore[override]
        """
        Takes a function and applies it to each top-level row of the NestedFrame.

        Nested columns are packaged alongside base columns and available for function use, where base columns
        are passed as scalars and nested columns are passed as numpy arrays. The way in which the row data is
        packaged is configurable (by default, a dictionary) and controlled by the `row_container` argument.

        Parameters
        ----------
        func : callable
            Function to apply to each nested dataframe. The first arguments to `func` should be which
            columns to apply the function to. See the Notes for recommendations
            on writing func outputs.
        columns : None | str | list of str
            Specifies which columns to pass to the function in the row_container format.
            If None, all columns are passed. If list of str, those columns are passed.
            If str, a single column is passed or if the string is a nested column, then all nested sub-columns
            are passed (e.g. columns="nested" passes all columns of the nested dataframe "nested"). To pass
            individual nested sub-columns, use the hierarchical column name (e.g. columns=["nested.t",...]).
        row_container : 'dict' or 'args', default 'dict'
            Specifies how the row data will be packaged when passed as an input to the function.
            If 'dict', the function will be called as `func({"col1": value, ...}, **kwargs)`, so func should
            expect a single dictionary input with keys corresponding to column names.
            If 'args', the function will be called as `func(value, ..., **kwargs)`, so func should expect
            positional arguments corresponding to the columns specified in `args`.
        output_names : None | str | list of str
            Specifies the names of the output columns in the resulting NestedFrame. If None, the function
            will return whatever names the user function returns. If specified will override any names
            returned by the user function provided the number of names matches the number of outputs. When not
            specified and the user function returns values without names (e.g. a list or tuple), the output
            columns will be enumerated (e.g. "0", "1", ...).
        infer_nesting : bool, default True
            If True, the function will pack output columns into nested
            structures based on column names adhering to a nested naming
            scheme. E.g. "nested.b" and "nested.c" will be packed into a column
            called "nested" with columns "b" and "c". If False, all outputs
            will be returned as base columns. Note that this will trigger off of names specified in
            `output_names` in addition to names returned by the user function.
        append_columns : bool, default False
            If True, the output columns are appended to those in the original NestedFrame.
            The output columns can contain nested sub-columns, which should be specified using their
            hierarchical column name (e.g. "nested.x"). If their base nested column exists in the
            original NestedFrame, the new output sub-columns will be added into the frame of the
            existing nested column. See an example below.
        njit : bool, default False
            If Ture, the function will try to use numba's njit to speed up the execution.
            This will only work if the custom function is compatible with njit and the requested columns
            are at most two.

            Note that using njit will disable support for `row_container="dict"`.
        kwargs : keyword arguments, optional
            Keyword arguments to pass to the function.

        Returns
        -------
        `NestedFrame`
            `NestedFrame` with the results of the function applied to the columns of the frame.

        Examples
        --------

        >>> from nested_pandas.datasets.generation import generate_data
        >>> import numpy as np
        >>> nf = generate_data(5,5, seed=1)
        >>> # define a custom user function
        >>> # map_rows will return a NestedFrame with two columns
        >>> def example_func(row):
        ...     return np.mean(row["nested.t"]), np.mean(row["nested.t"]) - row["a"]

        >>> # apply the function
        >>> nf.map_rows(example_func, output_names=["mean", "mean_minus_base"])
                mean  mean_minus_base
        0  11.533440        11.116418
        1  10.307751         9.587426
        2   8.294042         8.293928
        3   9.655291         9.352958
        4  10.687591        10.540836

        We can pass along only the columns we need for the function using the `columns` argument, which
        removes the performance overhead of packaging all columns for each row:

        >>> nf.map_rows(example_func, columns=["a", "nested.t"], output_names=["mean", "mean_minus_base"])
                mean  mean_minus_base
        0  11.533440        11.116418
        1  10.307751         9.587426
        2   8.294042         8.293928
        3   9.655291         9.352958
        4  10.687591        10.540836

        Alternatively, we can pass along the row data as positional arguments
        instead of a dictionary by setting `row_container="args"` and adjusting
        our function signature accordingly:

        >>> def example_func(a, time):
        ...     return np.mean(time), np.mean(time) - a

        >>> nf.map_rows(example_func,
        ...             columns=["a", "nested.t"],
        ...             output_names=["mean", "mean_minus_base"],
        ...             row_container="args")
                mean  mean_minus_base
        0  11.533440        11.116418
        1  10.307751         9.587426
        2   8.294042         8.293928
        3   9.655291         9.352958
        4  10.687591        10.540836

        Additional arguments that don't depend on row data can be passed as kwargs:

        >>> def example_func(row, scale):
        ...     return np.mean(row["nested.t"]) * scale

        >>> nf.map_rows(example_func, columns=["nested.t"], output_names="mean", scale=1)
                mean
        0  11.533440
        1  10.307751
        2   8.294042
        3   9.655291
        4  10.687591

        Functions that target a single nested structure can just pass along
        the nested column name and all sub-columns will be available:

        >>> def first_val(row):
        ...     return {"first_"+key.split(".")[1]:row[key][0] for key in row.keys()}

        >>> nf.map_rows(first_val, columns="nested")
             first_t  first_flux  first_flux_error first_band
        0   8.383890   31.551563               1.0          r
        1  13.704390   68.650093               1.0          g
        2   4.089045   83.462567               1.0          g
        3  17.562349    1.828828               1.0          g
        4   0.547752   75.014431               1.0          g

        You may want the result of a `map_rows` call to have nested structure,
        we can achieve this by using the `infer_nesting` kwarg:

        >>> # define a custom user function that returns nested structure
        >>> def example_func(row):
        ...     '''map_rows will return a NestedFrame with nested structure'''
        ...     return {"offsets.t_a": row["nested.t"] - row["a"],
        ...             "offsets.t_b": row["nested.t"] - row["b"]}

        By giving both output columns the prefix "offsets.", we signal
        to map_rows to infer that these should be packed into a nested column
        called "offsets".

        >>> # apply the function with `infer_nesting` (True by default)
        >>> nf.map_rows(example_func, columns=["a", "b", "nested.t"], infer_nesting=True)
                                                  offsets
        0    [{t_a: 7.966868, t_b: 8.199213}; …] (5 rows)
        1   [{t_a: 12.984066, t_b: 13.33187}; …] (5 rows)
        2    [{t_a: 4.088931, t_b: 3.397924}; …] (5 rows)
        3  [{t_a: 17.260016, t_b: 16.768814}; …] (5 rows)
        4   [{t_a: 0.400996, t_b: -0.529882}; …] (5 rows)

        You may also want to append the output columns to the original NestedFrame.
        We can achieve this by using the `append_columns` kwarg:

        >>> # define a custom user function that creates a nested sub-column
        >>> def example_func(row):
        ...     '''map_rows will return a sub-column for the existing 'nested' column'''
        ...     return row["nested.t"] - row["a"]

        >>> # apply the function with `append_columns` (False by default)
        >>> nf.map_rows(example_func,
        ...             columns=["a", "nested.t"],
        ...             output_names=["nested.t_a"],
        ...             append_columns=True)
                  a         b                                             nested
        0  0.417022  0.184677  [{t: 8.38389, flux: 31.551563, flux_error: 1.0...
        1  0.720324  0.372520  [{t: 13.70439, flux: 68.650093, flux_error: 1....
        2  0.000114  0.691121  [{t: 4.089045, flux: 83.462567, flux_error: 1....
        3  0.302333  0.793535  [{t: 17.562349, flux: 1.828828, flux_error: 1....
        4  0.146756  1.077633  [{t: 0.547752, flux: 75.014431, flux_error: 1....


        Notes
        -----
        If concerned about performance, specify `columns` to only include the columns
        needed for the function, as this will avoid the overhead of packaging
        all columns for each row.

        By default, `map_rows` will produce a `NestedFrame` with enumerated
        column names for each returned value of the function. It's recommended
        to either specify `output_names` or have `func` return a dictionary
        where each key is an output column of the dataframe returned by
        `map_rows` (as shown above).

        >>> def example_func(row):
        ...     return np.mean(row["nested.t"]), np.mean(row["nested.t"]) - row["a"]

        >>> # first output column will be named "0", second "1"
        >>> nf.map_rows(example_func)
                   0          1
        0  11.533440  11.116418
        1  10.307751   9.587426
        2   8.294042   8.293928
        3   9.655291   9.352958
        4  10.687591  10.540836
        """
        # Determine args
        if columns is None:
            # If None, pass all columns, with nested columns expanded to sub-columns
            columns = self.base_columns + self.get_subcolumns(nested_columns="all")
        elif isinstance(columns, str):
            # If it's a nested column, grab all sub-columns
            columns = self.get_subcolumns(columns) if columns in self.nested_columns else [columns]

        # Check arg validity
        requested_columns = []
        for arg in columns:
            if not isinstance(arg, str):
                raise TypeError(
                    f"Received an argument '{arg}' that is not a string. "
                    "All arguments to `map_rows` must be strings corresponding to"
                    " column names to pass along to the function."
                )
            components = self._parse_hierarchical_components(arg)
            if not self._is_known_column(components):
                raise ValueError(
                    f"Received a string argument '{arg}' that was not found in the columns list. "
                    "All arguments to `map_rows` must be strings corresponding to"
                    " column names to pass along to the function."
                )
            layer = "base" if len(components) < 2 else components[0]
            col = components[-1]
            requested_columns.append((layer, col))

        # Construct row containers and apply
        results = []

        if row_container == "dict":
            if njit:
                raise ValueError(
                    "njit execution is not supported for `row_container='dict'`, "
                    "use `row_container='args'` instead."
                )

            arg_dict = {}
            for layer, col in requested_columns:
                if layer == "base":
                    arg_dict[col] = self[col]
                else:
                    arg_dict[".".join([layer, col])] = self[layer].array.iter_field_lists(col)
            results = [
                func(
                    {col: val for col, val in zip(arg_dict.keys(), row, strict=True)},
                    **kwargs,
                )
                for row in zip(*arg_dict.values(), strict=True)
            ]

        elif row_container == "args":
            if njit:
                try:
                    results = self._apply_njit_map_rows(requested_columns, func)
                # except Exception as err:
                except Exception as err:
                    raise ValueError(
                        "njit execution for map_rowsis only supported for "
                        "numba.jit decorated functions with at most 2 arguments"
                    ) from err
            else:
                # Default python execution
                iterators = []
                for layer, col in requested_columns:
                    if layer == "base":
                        iterators.append(self[col])
                    else:
                        iterators.append(self[layer].array.iter_field_lists(col))

                results = [func(*cols, **kwargs) for cols in zip(*iterators, strict=True)]

        # If the func returns a single array per row wrap results in a `NestedSeries`.
        # Otherwise, Pandas will try to expand array elements into separate columns.
        if results and isinstance(results[0], np.ndarray):
            results_nf = NestedFrame(NestedSeries(results, index=self.index))
        else:
            results_nf = NestedFrame(results, index=self.index)

        # Override output names if specified
        if output_names is not None:
            if isinstance(output_names, str):
                output_names = [output_names]
            if len(output_names) != len(results_nf.columns):
                raise ValueError(
                    f"Number of output names ({len(output_names)}) does not match "
                    f"the number of outputs from the function ({len(results_nf.columns)})"
                )
            results_nf.columns = output_names

        if infer_nesting:
            # find potential nested structures from columns
            nested_cols = list(
                np.unique(
                    [
                        column.split(".", 1)[0]
                        for column in results_nf.columns
                        if isinstance(column, str) and "." in column
                    ]
                )
            )

            # pack results into nested structures
            for layer in nested_cols:
                layer_cols = [col for col in results_nf.columns if col.startswith(f"{layer}.")]
                rename_df = results_nf[layer_cols].rename(columns=lambda x: x.split(".", 1)[1])
                nested_col = pack_lists(rename_df, name=layer)
                results_nf = results_nf[
                    [col for col in results_nf.columns if not col.startswith(f"{layer}.")]
                ]
                results_nf[layer] = nested_col

        if append_columns:
            # Append sub-columns to existing nested columns
            self_nested_cols = [col for col in results_nf.nested_columns if col in self.nested_columns]
            for col in self_nested_cols:
                sub_columns = results_nf.get_subcolumns(col)
                for sub_col in sub_columns:
                    self = self.assign(**{f"{sub_col}": results_nf[sub_col]})
            # Append other base and nested columns
            base_results_nf = results_nf.drop(columns=self_nested_cols)
            return pd.concat([self, base_results_nf], axis=1)

        # Otherwise, return the results as a new NestedFrame
        return results_nf



[docs]
    def to_pandas(self, list_struct=False, large_list=False) -> pd.DataFrame:
        """Return an ordinary pandas DataFrame that has no NestedDtype columns.

        A plain ``pd.DataFrame`` is built from this frame, and every nested
        column in it is replaced by the result of the nested array's
        ``to_arrow_ext_array`` conversion (NestedDtype is cast to
        pd.ArrowDtype). Columns that are not nested are carried over as-is.

        Parameters
        ----------
        list_struct : bool
            Layout requested for the converted nested columns; forwarded
            unchanged to ``to_arrow_ext_array``. If False (default), nested
            columns are cast to struct-list arrow extension array columns,
            i.e. one struct per row whose fields hold arrays (this is what
            the example below shows). If True, nested columns are cast to
            list-struct array columns instead.
        large_list : bool
            If False (default), use regular ``list_`` (int32 offsets). Set to
            True to use ``large_list`` (int64 offsets), which is required when
            the total number of nested elements across all rows exceeds
            ``2**31 - 1``.

        Returns
        -------
        pd.DataFrame
            Ordinary pandas DataFrame.

        Examples
        --------
        >>> from nested_pandas.datasets.generation import generate_data
        >>> nf = generate_data(5,5, seed=1)
        >>> nf.to_pandas()
                  a         b                                             nested
        0  0.417022  0.184677  {'t': array([ 8.38389029, 13.4093502 , 16.0148...
        1  0.720324  0.372520  {'t': array([13.70439001,  8.34609605, 19.3652...
        2  0.000114  0.691121  {'t': array([ 4.08904499, 11.17379657,  6.2684...
        3  0.302333  0.793535  {'t': array([17.56234873,  2.80773877, 13.8464...
        4  0.146756  1.077633  {'t': array([ 0.54775186,  3.96202978, 17.5277...
        """
        # Re-wrap the data in the base class so the result is not a NestedFrame.
        plain = pd.DataFrame(self)

        # Swap each nested column for its arrow-backed counterpart.
        for name in self.nested_columns:
            nested_array = plain[name].array
            plain[name] = nested_array.to_arrow_ext_array(
                list_struct=list_struct,
                large_list=large_list,
            )

        return plain



[docs]
    def to_parquet(self, path, large_list=False, **kwargs) -> None:
        """Write the data of a NestedFrame to a parquet file.

        The frame is first converted with ``to_pandas(list_struct=False, ...)``,
        so each nested dataset is packed into its own column, and the result
        is written to ``path`` with a single ``pyarrow.parquet.write_table``
        call.

        Note that here we always opt to use the pyarrow engine for writing
        parquet files.

        Parameters
        ----------
        path : str
            The path to the parquet file
        large_list : bool
            If False (default), use regular ``list_`` (int32 offsets). Set to
            True to use ``large_list`` (int64 offsets), which is required when
            the total number of nested elements across all rows exceeds
            ``2**31 - 1``.
        kwargs : keyword arguments, optional
            Keyword arguments to pass to
            `pyarrow.parquet.write_table
            <https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html>`_

        Returns
        -------
        None

        Examples
        --------
        >>> from nested_pandas.datasets.generation import generate_data
        >>> nf = generate_data(5,5, seed=1)
        >>> nf.to_parquet("nestedframe.parquet")  # doctest: +SKIP
        """
        frame = self.to_pandas(list_struct=False, large_list=large_list)

        # Go through pyarrow for the write; this step is potentially not
        # zero-copy. preserve_index=None is used for the best index handling,
        # but because the pandas metadata is discarded below, the index will
        # generally need to be set manually when the file is loaded.
        arrow_table = pa.Table.from_pandas(frame, preserve_index=None)

        # Rebuild the schema from its fields alone: this drops the pandas
        # metadata so that nested dtypes are not preserved in the file.
        fields_only_schema = pa.schema(list(arrow_table.schema))
        arrow_table = arrow_table.cast(fields_only_schema)

        return pq.write_table(arrow_table, path, **kwargs)