Source code for nested_pandas.series.dtype

from __future__ import annotations  # Self is not available in python 3.10

from collections.abc import Mapping

# We use Type, because we must use "type" as an attribute name
from typing import Type, cast  # noqa: UP035

import pandas as pd
import pyarrow as pa
from deprecated import deprecated
from pandas import ArrowDtype
from pandas.api.extensions import register_extension_dtype
from pandas.core.arrays import ExtensionArray
from pandas.core.dtypes.base import ExtensionDtype

from nested_pandas.series.utils import (
    is_pa_type_a_list,
    is_pa_type_is_list_struct,
    normalize_struct_list_type,
    transpose_list_struct_type,
    transpose_struct_list_type,
)

# Names exported by a star-import of this module.
__all__ = ["NestedDtype"]


@register_extension_dtype
class NestedDtype(ExtensionDtype):
    """Data type to handle packed time series data

    Parameters
    ----------
    pyarrow_dtype : pyarrow.StructType, pd.ArrowDtype, or Mapping[str, pa.DataType]
        The pyarrow data type to use for the nested type. It may be provided as
        a pyarrow.StructType, a pandas.ArrowDtype, or a mapping of column names
        to pyarrow data types (such as a dictionary).

    Examples
    --------
    >>> import pyarrow as pa
    >>> from nested_pandas import NestedDtype

    From pa.StructType:

    >>> dtype = NestedDtype(pa.struct([pa.field("a", pa.list_(pa.int64())),
    ...                                pa.field("b", pa.list_(pa.float64()))]))
    >>> dtype
    nested<a: [int64], b: [double]>

    From pd.ArrowDtype:

    >>> import pandas as pd
    >>> dtype = NestedDtype(pd.ArrowDtype(pa.struct([pa.field("a", pa.list_(pa.int64())),
    ...                                              pa.field("b", pa.list_(pa.float64()))])))
    >>> dtype
    nested<a: [int64], b: [double]>

    From mapping of column names to pyarrow data types:

    >>> dtype = NestedDtype({"a": pa.int64(), "b": pa.float64()})
    >>> dtype
    nested<a: [int64], b: [double]>
    """

    # ExtensionDtype overrides #

    _metadata = ("pyarrow_dtype",)
    """Attributes to use as metadata for __eq__ and __hash__"""

    @property
    def na_value(self):
        """The missing value for this dtype"""
        return pd.NA  # type: ignore[return-value]

    type = pd.DataFrame
    """The type of the array's elements, always pd.DataFrame"""

    @property
    def name(self) -> str:
        """The string representation of the nested type"""
        # Replace pd.ArrowDtype with pa.DataType, because it has nicer __str__
        field_dtypes = {field: self.column_dtype(field) for field in self.column_dtypes}
        nice_dtypes = {
            field: dtype.pyarrow_dtype if isinstance(dtype, pd.ArrowDtype) else dtype
            for field, dtype in field_dtypes.items()
        }
        fields = ", ".join([f"{field}: [{dtype!s}]" for field, dtype in nice_dtypes.items()])
        return f"nested<{fields}>"

    @name.setter
    def name(self, value: str):
        # The name is derived from the pyarrow type, so it is read-only.
        raise TypeError("name cannot be changed")

    def __repr__(self) -> str:
        """Return the same string as the `name` attribute."""
        return self.name

    @classmethod
    def construct_array_type(cls) -> Type[ExtensionArray]:
        """Corresponded array type, always NestedExtensionArray"""
        # Imported lazily, inside the method, as in the original implementation.
        from nested_pandas.series.ext_array import NestedExtensionArray

        return NestedExtensionArray

    @classmethod
    def construct_from_string(cls, string: str) -> Self:  # type: ignore[name-defined] # noqa: F821
        """Construct NestedDtype from a string representation.

        This works only for simple types, i.e. non-parametric pyarrow types.

        Parameters
        ----------
        string : str
            The string representation of the nested type. For example,
            'nested<x: [int64], y: [float64]>'. It must be consistent with
            the string representation of the dtype given by the `name`
            attribute.

        Returns
        -------
        NestedDtype
            The constructed NestedDtype.

        Raises
        ------
        TypeError
            If the input is not a string, if the string is not a valid nested
            type string, or if the element types are parametric pyarrow types.
        """
        # Mirror the base ExtensionDtype behaviour: a non-string input must be
        # reported as TypeError, not as an AttributeError from `.startswith`.
        if not isinstance(string, str):
            raise TypeError(f"'construct_from_string' expects a string, got {type(string)}")

        if not string.startswith("nested<") or not string.endswith(">"):
            raise TypeError("Not a valid nested type string, expected 'nested<...>'")
        fields_str = string.removeprefix("nested<").removesuffix(">")

        field_strings = fields_str.split(", ")

        fields = {}
        for field_string in field_strings:
            try:
                field_name, field_type = field_string.split(": ", maxsplit=1)
            except ValueError as e:
                raise TypeError(
                    "Not a valid nested type string, expected 'nested<x: [type], ...>', got invalid field "
                    f"string '{field_string}'"
                ) from e
            if not field_type.startswith("[") or not field_type.endswith("]"):
                raise TypeError(
                    "Not a valid nested type string, expected 'nested<x: [type], ...>', got invalid field "
                    f"type string '{field_type}'"
                )

            value_type = field_type.removeprefix("[").removesuffix("]")
            # We follow ArrowDtype implementation here and do not try to parse complex types
            try:
                pa_value_type = pa.type_for_alias(value_type)
            except ValueError as e:
                raise TypeError(
                    f"Parsing pyarrow specific parameters in the string is not supported yet: {value_type}. "
                    "Please use NestedDtype() or NestedDtype.from_columns() instead."
                ) from e

            fields[field_name] = pa_value_type

        return cls.from_columns(fields)

    # ArrowDtype would return None, so we do the same
    def _get_common_dtype(self, dtypes: list) -> None:
        """Always return None, i.e. report no common dtype."""
        return None

    # Optional methods #

    def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> ExtensionArray:
        """Construct a NestedExtensionArray from a pyarrow array.

        Parameters
        ----------
        array : pa.Array | pa.ChunkedArray
            The input pyarrow array.

        Returns
        -------
        NestedExtensionArray
            The constructed NestedExtensionArray.
        """
        from nested_pandas.series.ext_array import NestedExtensionArray

        return NestedExtensionArray(array)

    # Additional methods and attributes #

    # Struct-list representation, first item returned by `_validate_dtype`
    pyarrow_dtype: pa.StructType
    # List-struct representation, second item returned by `_validate_dtype`
    list_struct_pa_dtype: pa.LargeListType

    def __init__(self, pyarrow_dtype: pa.DataType | Mapping) -> None:
        """Initialize the dtype from a pyarrow type, pd.ArrowDtype or mapping.

        Parameters
        ----------
        pyarrow_dtype : pa.DataType, pd.ArrowDtype, or Mapping[str, pa.DataType]
            A pyarrow struct or list type, a pd.ArrowDtype wrapping one, or
            a mapping of column names to list *item* types.

        Raises
        ------
        TypeError
            If the input (after unwrapping) is not a pyarrow.DataType.
        ValueError
            If the pyarrow type is neither a struct nor a list type.
        """
        # Allow pd.ArrowDtypes on init
        if isinstance(pyarrow_dtype, pd.ArrowDtype):
            pyarrow_dtype = pyarrow_dtype.pyarrow_dtype

        # Allow from_columns-style mapping inputs
        if isinstance(pyarrow_dtype, Mapping):
            pyarrow_dtype = pa.struct({col: pa.large_list(pa_type) for col, pa_type in pyarrow_dtype.items()})
        pyarrow_dtype = cast(pa.StructType, pyarrow_dtype)
        self.pyarrow_dtype, self.list_struct_pa_dtype = self._validate_dtype(pyarrow_dtype)

    @property
    @deprecated(
        version="0.6.0",
        reason="`struct_list_pa_dtype` will be removed in version 0.7.0, "
        "use `_struct_list_pa_dtype` instead.",
    )
    def struct_list_pa_dtype(self) -> pa.StructType:
        """Struct-list pyarrow type representing the nested type."""
        return self._struct_list_pa_dtype

    @property
    def _struct_list_pa_dtype(self) -> pa.StructType:
        """Struct-list pyarrow type representing the nested type."""
        return self.pyarrow_dtype

    @classmethod
    @deprecated(
        version="0.6.0",
        reason="`from_fields` will be removed in version 0.7.0, use `from_columns` instead.",
    )
    def from_fields(cls, fields: Mapping[str, pa.DataType]) -> Self:  # type: ignore[name-defined] # noqa: F821
        """Make NestedDtype from a mapping of field names and list item types.

        Parameters
        ----------
        fields : Mapping[str, pa.DataType]
            A mapping of field names and their item types. Since all fields are lists, the item types are
            inner types of the lists, not the list types themselves.

        Returns
        -------
        NestedDtype
            The constructed NestedDtype.

        Examples
        --------
        >>> dtype = NestedDtype.from_fields({"a": pa.float64(), "b": pa.int64()})
        >>> dtype
        nested<a: [double], b: [int64]>
        >>> assert (
        ...     dtype.pyarrow_dtype
        ...     == pa.struct({"a": pa.large_list(pa.float64()), "b": pa.large_list(pa.int64())})
        ... )
        """
        return cls.from_columns(fields)

    @classmethod
    def from_columns(cls, columns: Mapping[str, pa.DataType]) -> Self:  # type: ignore[name-defined] # noqa: F821
        """Make NestedDtype from a mapping of column names and list item types.

        Parameters
        ----------
        columns : Mapping[str, pa.DataType]
            A mapping of column names and their item types. Since all fields are lists, the item types are
            inner types of the lists, not the list types themselves.

        Returns
        -------
        NestedDtype
            The constructed NestedDtype.

        Examples
        --------
        >>> dtype = NestedDtype.from_columns({"a": pa.float64(), "b": pa.int64()})
        >>> dtype
        nested<a: [double], b: [int64]>
        >>> assert (
        ...     dtype.pyarrow_dtype
        ...     == pa.struct({"a": pa.large_list(pa.float64()), "b": pa.large_list(pa.int64())})
        ... )
        """
        pyarrow_dtype = pa.struct({column: pa.large_list(pa_type) for column, pa_type in columns.items()})
        pyarrow_dtype = cast(pa.StructType, pyarrow_dtype)
        return cls(pyarrow_dtype=pyarrow_dtype)

    @staticmethod
    def _validate_dtype(pyarrow_dtype: pa.DataType) -> tuple[pa.StructType, pa.LargeListType]:
        """Check that the given pyarrow type is castable to the nested type.

        Parameters
        ----------
        pyarrow_dtype : pa.DataType
            The pyarrow type to check and cast.

        Returns
        -------
        pa.StructType
            Struct-list pyarrow type representing the nested type.
        pa.LargeListType
            List-struct pyarrow type representing the nested type.

        Raises
        ------
        TypeError
            If the input is not a pyarrow.DataType.
        ValueError
            If the input is neither a struct type nor a list type.
        """
        if not isinstance(pyarrow_dtype, pa.DataType):
            raise TypeError(f"Expected a 'pyarrow.DataType' object, got {type(pyarrow_dtype)}")

        if pa.types.is_struct(pyarrow_dtype):
            struct_type = cast(pa.StructType, pyarrow_dtype)
            # Normalize list fields to large_list for backward compatibility
            # (callers may pass pa.list_ fields)
            struct_type = normalize_struct_list_type(struct_type)
            return struct_type, transpose_struct_list_type(struct_type)

        # Support pa.large_list (and pa.list_ for backward compatibility)
        if is_pa_type_a_list(pyarrow_dtype):
            if not pa.types.is_large_list(pyarrow_dtype):
                # Normalize regular list or fixed-size list to large_list
                pyarrow_dtype = pa.large_list(pyarrow_dtype.value_type)
            list_type = cast(pa.LargeListType, pyarrow_dtype)
            return transpose_list_struct_type(list_type), list_type

        raise ValueError(
            "NestedDtype can only be constructed with pa.StructType, pa.LargeListType, "
            f"or pa.ListType, got {pyarrow_dtype}"
        )

    @property
    @deprecated(
        version="0.6.0", reason="`fields` will be removed in version 0.7.0, use `column_dtypes` instead."
    )
    def fields(self) -> dict[str, pa.DataType]:
        """The mapping of field names and their item types."""
        return self.column_dtypes

    @property
    def column_dtypes(self) -> dict[str, pa.DataType]:
        """The mapping of column names and their item types."""
        return {column.name: column.type.value_type for column in self.pyarrow_dtype}

    @property
    @deprecated(
        version="0.6.0",
        reason="`field_names` will be removed in version 0.7.0, use the keys of `column_dtypes` instead.",
    )
    def field_names(self) -> list[str]:
        """The list of field names of the nested type"""
        return [field.name for field in self.pyarrow_dtype]

    @classmethod
    def from_pandas_arrow_dtype(cls, pandas_arrow_dtype: ArrowDtype) -> Self:  # type: ignore[name-defined] # noqa: F821
        """Construct NestedDtype from a pandas.ArrowDtype.

        Parameters
        ----------
        pandas_arrow_dtype : ArrowDtype
            The pandas.ArrowDtype to construct NestedDtype from.
            Must be struct-list or list-struct type.

        Returns
        -------
        NestedDtype
            The constructed NestedDtype.

        Raises
        ------
        ValueError
            If the given dtype is not a valid nested type.
        """
        return cls(pyarrow_dtype=pandas_arrow_dtype.pyarrow_dtype)

    def to_pandas_arrow_dtype(self, list_struct: bool = False) -> ArrowDtype:
        """Convert NestedDtype to a pandas.ArrowDtype.

        Parameters
        ----------
        list_struct : bool, default False
            If False (default) use pyarrow struct-list type,
            otherwise use pyarrow list-struct type.

        Returns
        -------
        ArrowDtype
            The corresponding pandas.ArrowDtype.
        """
        if list_struct:
            return ArrowDtype(self.list_struct_pa_dtype)
        return ArrowDtype(self.pyarrow_dtype)

    @deprecated(
        version="0.6.0",
        reason="`field_dtype` will be removed in version 0.7.0, use `column_dtype` instead.",
    )
    def field_dtype(self, field: str) -> pd.ArrowDtype | Self:  # type: ignore[name-defined] # noqa: F821
        """Pandas dtype of a field, pd.ArrowDType or NestedDtype.

        Parameters
        ----------
        field : str
            Field name

        Returns
        -------
        pd.ArrowDtype | NestedDtype
            If the field is a list-struct, return NestedDtype, else wrap it
            as a pd.ArrowDtype.
        """
        return self.column_dtype(field)

    def column_dtype(self, column: str) -> pd.ArrowDtype | Self:  # type: ignore[name-defined] # noqa: F821
        """Pandas dtype of a column, pd.ArrowDType or NestedDtype.

        Parameters
        ----------
        column : str
            Column name

        Returns
        -------
        pd.ArrowDtype | NestedDtype
            If the column is a list-struct, return NestedDtype, else wrap it
            as a pd.ArrowDtype.
        """
        list_type = self.pyarrow_dtype.field(column).type
        value_type = list_type.value_type
        if is_pa_type_is_list_struct(value_type):
            return type(self)(value_type)
        return pd.ArrowDtype(value_type)

    @property
    @deprecated(
        version="0.6.0",
        reason="`field_dtypes` will be removed in version 0.7.0, use `_struct_list_pa_dtype` instead.",
    )
    def field_dtypes(self) -> dict[str, pd.ArrowDtype | Self]:  # type: ignore[name-defined] # noqa: F821
        """Pandas dtypes of this dtype's fields."""
        # Use the non-deprecated accessors so that reading this property does
        # not additionally trigger the `field_names` / `field_dtype` warnings.
        return {column.name: self.column_dtype(column.name) for column in self.pyarrow_dtype}