from __future__ import annotations # Self is not available in python 3.10
from collections.abc import Mapping
# We use Type, because we must use "type" as an attribute name
from typing import Type, cast # noqa: UP035
import pandas as pd
import pyarrow as pa
from deprecated import deprecated
from pandas import ArrowDtype
from pandas.api.extensions import register_extension_dtype
from pandas.core.arrays import ExtensionArray
from pandas.core.dtypes.base import ExtensionDtype
from nested_pandas.series.utils import (
is_pa_type_a_list,
is_pa_type_is_list_struct,
normalize_struct_list_type,
transpose_list_struct_type,
transpose_struct_list_type,
)
__all__ = ["NestedDtype"]
@register_extension_dtype
class NestedDtype(ExtensionDtype):
    """Data type to handle packed time series data

    Parameters
    ----------
    pyarrow_dtype : pyarrow.StructType, pd.ArrowDtype, or Mapping[str, pa.DataType]
        The pyarrow data type to use for the nested type. It may be provided as
        a pyarrow.StructType, a pandas.ArrowDtype, or a mapping of column names to
        pyarrow data types (such as a dictionary). A pyarrow list type is
        also accepted and is transposed to the struct-list representation.

    Examples
    --------
    >>> import pyarrow as pa
    >>> from nested_pandas import NestedDtype

    From pa.StructType:

    >>> dtype = NestedDtype(pa.struct([pa.field("a", pa.list_(pa.int64())),
    ...                                pa.field("b", pa.list_(pa.float64()))]))
    >>> dtype
    nested<a: [int64], b: [double]>

    From pd.ArrowDtype:

    >>> import pandas as pd
    >>> dtype = NestedDtype(pd.ArrowDtype(pa.struct([pa.field("a", pa.list_(pa.int64())),
    ...                                              pa.field("b", pa.list_(pa.float64()))])))
    >>> dtype
    nested<a: [int64], b: [double]>

    From mapping of column names to pyarrow data types:

    >>> dtype = NestedDtype({"a": pa.int64(), "b": pa.float64()})
    >>> dtype
    nested<a: [int64], b: [double]>
    """

    # ExtensionDtype overrides #

    _metadata = ("pyarrow_dtype",)
    """Attributes to use as metadata for __eq__ and __hash__"""

    @property
    def na_value(self):
        """The missing value for this dtype"""
        return pd.NA  # type: ignore[return-value]

    type = pd.DataFrame
    """The type of the array's elements, always pd.DataFrame"""

    @property
    def name(self) -> str:
        """The string representation of the nested type"""
        # Replace pd.ArrowDtype with pa.DataType, because it has nicer __str__
        field_dtypes = {field: self.column_dtype(field) for field in self.column_dtypes}
        nice_dtypes = {
            field: dtype.pyarrow_dtype if isinstance(dtype, pd.ArrowDtype) else dtype
            for field, dtype in field_dtypes.items()
        }
        fields = ", ".join([f"{field}: [{dtype!s}]" for field, dtype in nice_dtypes.items()])
        return f"nested<{fields}>"

    @name.setter
    def name(self, value: str):
        raise TypeError("name cannot be changed")

    def __repr__(self) -> str:
        return self.name

    @classmethod
    def construct_array_type(cls) -> Type[ExtensionArray]:
        """Corresponding array type, always NestedExtensionArray"""
        # Imported lazily, at call time rather than at module import time.
        from nested_pandas.series.ext_array import NestedExtensionArray

        return NestedExtensionArray

    @classmethod
    def construct_from_string(cls, string: str) -> Self:  # type: ignore[name-defined] # noqa: F821
        """Construct NestedDtype from a string representation.

        This works only for simple types, i.e. non-parametric pyarrow types.

        Parameters
        ----------
        string : str
            The string representation of the nested type. For example,
            'nested<x: [int64], y: [float64]>'. It must be consistent with
            the string representation of the dtype given by the `name`
            attribute.

        Returns
        -------
        NestedDtype
            The constructed NestedDtype.

        Raises
        ------
        TypeError
            If the input is not a string, if the string is not a valid nested
            type string, or if the element types are parametric pyarrow types.
        """
        # Reject non-string inputs with TypeError (not AttributeError from
        # `.startswith`), matching the documented contract of this method.
        if not isinstance(string, str):
            raise TypeError(f"'construct_from_string' expects a string, got {type(string)}")

        if not string.startswith("nested<") or not string.endswith(">"):
            raise TypeError("Not a valid nested type string, expected 'nested<...>'")
        fields_str = string.removeprefix("nested<").removesuffix(">")

        field_strings = fields_str.split(", ")

        fields = {}
        for field_string in field_strings:
            try:
                field_name, field_type = field_string.split(": ", maxsplit=1)
            except ValueError as e:
                raise TypeError(
                    "Not a valid nested type string, expected 'nested<x: [type], ...>', got invalid field "
                    f"string '{field_string}'"
                ) from e
            if not field_type.startswith("[") or not field_type.endswith("]"):
                raise TypeError(
                    "Not a valid nested type string, expected 'nested<x: [type], ...>', got invalid field "
                    f"type string '{field_type}'"
                )

            value_type = field_type.removeprefix("[").removesuffix("]")
            # We follow ArrowDtype implementation here and do not try to parse complex types
            try:
                pa_value_type = pa.type_for_alias(value_type)
            except ValueError as e:
                raise TypeError(
                    f"Parsing pyarrow specific parameters in the string is not supported yet: {value_type}. "
                    "Please use NestedDtype() or NestedDtype.from_columns() instead."
                ) from e

            fields[field_name] = pa_value_type

        return cls.from_columns(fields)

    # ArrowDtype would return None so we do
    def _get_common_dtype(self, dtypes: list) -> None:
        return None

    # Optional methods #

    def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> ExtensionArray:
        """Construct a NestedExtensionArray from a pyarrow array.

        Parameters
        ----------
        array : pa.Array | pa.ChunkedArray
            The input pyarrow array.

        Returns
        -------
        NestedExtensionArray
            The constructed NestedExtensionArray.
        """
        from nested_pandas.series.ext_array import NestedExtensionArray

        return NestedExtensionArray(array)

    # Additional methods and attributes #

    # Struct-list representation; first element returned by `_validate_dtype`.
    pyarrow_dtype: pa.StructType
    # List-struct representation; second element returned by `_validate_dtype`.
    list_struct_pa_dtype: pa.LargeListType

    def __init__(
        self,
        pyarrow_dtype: pa.DataType | pd.ArrowDtype | Mapping[str, pa.DataType],
    ) -> None:
        # Allow pd.ArrowDtypes on init
        if isinstance(pyarrow_dtype, pd.ArrowDtype):
            pyarrow_dtype = pyarrow_dtype.pyarrow_dtype

        # Allow from_columns-style mapping inputs
        if isinstance(pyarrow_dtype, Mapping):
            pyarrow_dtype = pa.struct({col: pa.large_list(pa_type) for col, pa_type in pyarrow_dtype.items()})
        pyarrow_dtype = cast(pa.StructType, pyarrow_dtype)
        # `_validate_dtype` returns both representations of the same nested type.
        self.pyarrow_dtype, self.list_struct_pa_dtype = self._validate_dtype(pyarrow_dtype)

    @property
    @deprecated(
        version="0.6.0",
        reason="`struct_list_pa_dtype` will be removed in version 0.7.0, "
        "use `_struct_list_pa_dtype` instead.",
    )
    def struct_list_pa_dtype(self) -> pa.StructType:
        """Struct-list pyarrow type representing the nested type."""
        return self._struct_list_pa_dtype

    @property
    def _struct_list_pa_dtype(self) -> pa.StructType:
        """Struct-list pyarrow type representing the nested type."""
        return self.pyarrow_dtype

    @classmethod
    @deprecated(
        version="0.6.0",
        reason="`from_fields` will be removed in version 0.7.0, use `from_columns` instead.",
    )
    def from_fields(cls, fields: Mapping[str, pa.DataType]) -> Self:  # type: ignore[name-defined] # noqa: F821
        """Make NestedDtype from a mapping of field names and list item types.

        Parameters
        ----------
        fields : Mapping[str, pa.DataType]
            A mapping of field names and their item types. Since all fields are lists, the item types are
            inner types of the lists, not the list types themselves.

        Returns
        -------
        NestedDtype
            The constructed NestedDtype.

        Examples
        --------
        >>> dtype = NestedDtype.from_fields({"a": pa.float64(), "b": pa.int64()})
        >>> dtype
        nested<a: [double], b: [int64]>
        >>> assert (
        ...     dtype.pyarrow_dtype
        ...     == pa.struct({"a": pa.large_list(pa.float64()), "b": pa.large_list(pa.int64())})
        ... )
        """
        return cls.from_columns(fields)

    @classmethod
    def from_columns(cls, columns: Mapping[str, pa.DataType]) -> Self:  # type: ignore[name-defined] # noqa: F821
        """Make NestedDtype from a mapping of column names and list item types.

        Parameters
        ----------
        columns : Mapping[str, pa.DataType]
            A mapping of column names and their item types. Since all fields are lists, the item types are
            inner types of the lists, not the list types themselves.

        Returns
        -------
        NestedDtype
            The constructed NestedDtype.

        Examples
        --------
        >>> dtype = NestedDtype.from_columns({"a": pa.float64(), "b": pa.int64()})
        >>> dtype
        nested<a: [double], b: [int64]>
        >>> assert (
        ...     dtype.pyarrow_dtype
        ...     == pa.struct({"a": pa.large_list(pa.float64()), "b": pa.large_list(pa.int64())})
        ... )
        """
        pyarrow_dtype = pa.struct({column: pa.large_list(pa_type) for column, pa_type in columns.items()})
        pyarrow_dtype = cast(pa.StructType, pyarrow_dtype)
        return cls(pyarrow_dtype=pyarrow_dtype)

    @staticmethod
    def _validate_dtype(pyarrow_dtype: pa.DataType) -> tuple[pa.StructType, pa.LargeListType]:
        """Check that the given pyarrow type is castable to the nested type.

        Parameters
        ----------
        pyarrow_dtype : pa.DataType
            The pyarrow type to check and cast.

        Returns
        -------
        pa.StructType
            Struct-list pyarrow type representing the nested type.
        pa.LargeListType
            List-struct pyarrow type representing the nested type.

        Raises
        ------
        TypeError
            If the input is not a pyarrow.DataType.
        ValueError
            If the input is neither a struct type nor a list type.
        """
        if not isinstance(pyarrow_dtype, pa.DataType):
            raise TypeError(f"Expected a 'pyarrow.DataType' object, got {type(pyarrow_dtype)}")

        if pa.types.is_struct(pyarrow_dtype):
            struct_type = cast(pa.StructType, pyarrow_dtype)
            # Normalize list fields to large_list for backward compatibility
            # (callers may pass pa.list_ fields)
            struct_type = normalize_struct_list_type(struct_type)
            return struct_type, transpose_struct_list_type(struct_type)

        # Support pa.large_list (and pa.list_ for backward compatibility)
        if is_pa_type_a_list(pyarrow_dtype):
            if not pa.types.is_large_list(pyarrow_dtype):
                # Normalize regular list or fixed-size list to large_list
                pyarrow_dtype = pa.large_list(pyarrow_dtype.value_type)
            list_type = cast(pa.LargeListType, pyarrow_dtype)
            return transpose_list_struct_type(list_type), list_type

        raise ValueError(
            "NestedDtype can only be constructed with pa.StructType, pa.LargeListType, "
            f"or pa.ListType, got {pyarrow_dtype}"
        )

    @property
    @deprecated(
        version="0.6.0", reason="`fields` will be removed in version 0.7.0, use `column_dtypes` instead."
    )
    def fields(self) -> dict[str, pa.DataType]:
        """The mapping of field names and their item types."""
        return self.column_dtypes

    @property
    def column_dtypes(self) -> dict[str, pa.DataType]:
        """The mapping of column names and their item types."""
        return {column.name: column.type.value_type for column in self.pyarrow_dtype}

    @property
    @deprecated(
        version="0.6.0",
        reason="`field_names` will be removed in version 0.7.0, use `list(column_dtypes)` instead.",
    )
    def field_names(self) -> list[str]:
        """The list of field names of the nested type"""
        return [field.name for field in self.pyarrow_dtype]

    @classmethod
    def from_pandas_arrow_dtype(cls, pandas_arrow_dtype: ArrowDtype) -> Self:  # type: ignore[name-defined] # noqa: F821
        """Construct NestedDtype from a pandas.ArrowDtype.

        Parameters
        ----------
        pandas_arrow_dtype : ArrowDtype
            The pandas.ArrowDtype to construct NestedDtype from.
            Must be struct-list or list-struct type.

        Returns
        -------
        NestedDtype
            The constructed NestedDtype.

        Raises
        ------
        ValueError
            If the given dtype is not a valid nested type.
        """
        return cls(pyarrow_dtype=pandas_arrow_dtype.pyarrow_dtype)

    def to_pandas_arrow_dtype(self, list_struct: bool = False) -> ArrowDtype:
        """Convert NestedDtype to a pandas.ArrowDtype.

        Parameters
        ----------
        list_struct : bool, default False
            If False (default) use pyarrow struct-list type,
            otherwise use pyarrow list-struct type.

        Returns
        -------
        ArrowDtype
            The corresponding pandas.ArrowDtype.
        """
        if list_struct:
            return ArrowDtype(self.list_struct_pa_dtype)
        return ArrowDtype(self.pyarrow_dtype)

    @deprecated(
        version="0.6.0",
        reason="`field_dtype` will be removed in version 0.7.0, use `column_dtype` instead.",
    )
    def field_dtype(self, field: str) -> pd.ArrowDtype | Self:  # type: ignore[name-defined] # noqa: F821
        """Pandas dtype of a field, pd.ArrowDType or NestedDtype.

        Parameters
        ----------
        field : str
            Field name

        Returns
        -------
        pd.ArrowDtype | NestedDtype
            If the field is a list-struct, return NestedDtype, else wrap it
            as a pd.ArrowDtype.
        """
        return self.column_dtype(field)

    def column_dtype(self, column: str) -> pd.ArrowDtype | Self:  # type: ignore[name-defined] # noqa: F821
        """Pandas dtype of a column, pd.ArrowDType or NestedDtype.

        Parameters
        ----------
        column : str
            Column name

        Returns
        -------
        pd.ArrowDtype | NestedDtype
            If the column is a list-struct, return NestedDtype, else wrap it
            as a pd.ArrowDtype.
        """
        list_type = self.pyarrow_dtype.field(column).type
        value_type = list_type.value_type
        if is_pa_type_is_list_struct(value_type):
            return type(self)(value_type)
        return pd.ArrowDtype(value_type)

    @property
    @deprecated(
        version="0.6.0",
        reason="`field_dtypes` will be removed in version 0.7.0, use `_struct_list_pa_dtype` instead.",
    )
    def field_dtypes(self) -> dict[str, pd.ArrowDtype | Self]:  # type: ignore[name-defined] # noqa: F821
        """Pandas dtypes of this dtype's fields."""
        # Use the non-deprecated accessors so a single access does not also
        # trigger the `field_dtype` / `field_names` deprecation warnings.
        return {field.name: self.column_dtype(field.name) for field in self.pyarrow_dtype}