Source code for pandas_dataclasses.core.api

__all__ = ["asframe", "aspandas", "asseries"]


# standard library
from types import FunctionType
from typing import Any, Callable, Hashable, Iterable, Optional, overload


# dependencies
import numpy as np
import pandas as pd
from pandas.api.types import is_list_like
from typing_extensions import get_origin
from .specs import Field, Fields, Spec
from .tagging import Tag
from .typing import DataClass, DataClassOf, PAny, TFrame, TPandas, TSeries


@overload
def aspandas(obj: DataClassOf[TPandas, PAny], *, factory: None = None) -> TPandas: ...


@overload
def aspandas(obj: DataClass[PAny], *, factory: Callable[..., TPandas]) -> TPandas: ...



[docs]
def aspandas(obj: Any, *, factory: Any = None) -> Any:
    """Create a DataFrame or Series object from a dataclass object.

    Which data structure is created will be determined by a factory
    defined as the ``__pandas_factory__`` attribute in the original
    dataclass of ``obj`` or the ``factory`` argument. If a factory is
    a function, it must have an annotation of the return type.

    Args:
        obj: Dataclass object that should have attribute, column, data,
            and/or index fields. If the original dataclass has the
            ``__pandas_factory__`` attribute, it will be used as a
            factory for the data creation.

    Keyword Args:
        factory: Class or function for the DataFrame or Series creation.
            It must take the same parameters as ``pandas.DataFrame``
            or ``pandas.Series``, and return an object of it or its
            subclass. If it is a function, it must have an annotation
            of the return type. If passed, it will be preferentially
            used even if the original dataclass of ``obj`` has the
            ``__pandas_factory__`` attribute.

    Returns:
        DataFrame or Series object that complies with the original dataclass.

    Raises:
        ValueError: Raised if no factory is found or the return type
            cannot be inferred from a factory when it is a function.

    """
    spec = Spec.from_dataclass(type(obj)) @ obj

    if factory is None:
        factory = spec.factory

    if factory is None:
        raise ValueError("Could not find any factory.")

    if isinstance(factory, FunctionType):
        return_ = factory.__annotations__["return"]
    else:
        return_ = factory

    origin = get_origin(return_) or return_

    if issubclass(origin, pd.DataFrame):
        return asframe(obj, factory=factory)
    elif issubclass(origin, pd.Series):
        return asseries(obj, factory=factory)
    else:
        raise ValueError("Could not infer an object type.")



@overload
def asframe(obj: DataClassOf[TFrame, PAny], *, factory: None = None) -> TFrame: ...


@overload
def asframe(obj: DataClass[PAny], *, factory: Callable[..., TFrame]) -> TFrame: ...


@overload
def asframe(obj: DataClass[PAny], *, factory: None = None) -> pd.DataFrame: ...



[docs]
def asframe(obj: Any, *, factory: Any = None) -> Any:
    """Create a DataFrame object from a dataclass object.

    The return type will be determined by a factory defined as the
    ``__pandas_factory__`` attribute in the original dataclass of
    ``obj`` or the ``factory`` argument. If neither is specified,
    it defaults to ``pandas.DataFrame``.

    Args:
        obj: Dataclass object that should have attribute, column, data,
            and/or index fields. If the original dataclass has the
            ``__pandas_factory__`` attribute, it will be used as a
            factory for the DataFrame creation.

    Keyword Args:
        factory: Class or function for the DataFrame creation.
            It must take the same parameters as ``pandas.DataFrame``,
            and return an object of it or its subclass. If passed, it
            will be preferentially used even if the original dataclass
            of ``obj`` has the ``__pandas_factory__`` attribute.

    Returns:
        DataFrame object that complies with the original dataclass.

    """
    spec = Spec.from_dataclass(type(obj)) @ obj

    if factory is None:
        factory = spec.factory or pd.DataFrame

    dataframe = factory(
        data=get_data(spec),
        index=get_index(spec),
        columns=get_columns(spec),
    )

    dataframe.attrs.update(get_attrs(spec))
    return squeeze(dataframe)



@overload
def asseries(obj: DataClassOf[TSeries, PAny], *, factory: None = None) -> TSeries: ...


@overload
def asseries(obj: DataClass[PAny], *, factory: Callable[..., TSeries]) -> TSeries: ...


@overload
def asseries(obj: DataClass[PAny], *, factory: None = None) -> "pd.Series[Any]": ...



[docs]
def asseries(obj: Any, *, factory: Any = None) -> Any:
    """Create a Series object from a dataclass object.

    The return type will be determined by a factory defined as the
    ``__pandas_factory__`` attribute in the original dataclass of
    ``obj`` or the ``factory`` argument. If neither is specified,
    it defaults to ``pandas.Series``.

    Args:
        obj: Dataclass object that should have attribute, column, data,
            and/or index fields. If the original dataclass has the
            ``__pandas_factory__`` attribute, it will be used as a
            factory for the Series creation.

    Keyword Args:
        factory: Class or function for the Series creation.
            It must take the same parameters as ``pandas.Series``,
            and return an object of it or its subclass. If passed, it
            will be preferentially used even if the original dataclass
            of ``obj`` has the ``__pandas_factory__`` attribute.

    Returns:
        Series object that complies with the original dataclass.

    """
    spec = Spec.from_dataclass(type(obj)) @ obj

    if factory is None:
        factory = spec.factory or pd.Series

    data = get_data(spec)
    index = get_index(spec)

    if not data:
        series = factory(index=index)
    else:
        name, data = next(iter(data.items()))
        series = factory(data=data, index=index, name=name)

    series.attrs.update(get_attrs(spec))
    return squeeze(series)



def get_attrs(spec: Spec) -> dict[Hashable, Any]:
    """Derive attributes from a specification."""
    data: dict[Hashable, Any] = {}

    for field in spec.fields.of(Tag.ATTR):
        data.update(items(field))

    return data


def get_columns(spec: Spec) -> Optional[pd.MultiIndex]:
    """Derive columns from a specification."""
    if not (fields := spec.fields.of(Tag.DATA)):
        return None

    if (names := name(fields)) is None:
        return None

    return pd.MultiIndex.from_tuples(
        map(name, fields),
        names=names,
    )


def get_data(spec: Spec) -> dict[Hashable, Any]:
    """Derive data from a specification."""
    data: dict[Hashable, Any] = {}

    for field in spec.fields.of(Tag.DATA):
        for key, val in items(field):
            data[key] = ensure(val, field.dtype)

    return data


def get_index(spec: Spec) -> Optional[pd.MultiIndex]:
    """Derive index from a specification."""
    if not (fields := spec.fields.of(Tag.INDEX)):
        return None

    data: dict[Hashable, Any] = {}

    for field in fields:
        for key, val in items(field):
            data[key] = ensure(val, field.dtype)

    return pd.MultiIndex.from_arrays(
        np.broadcast_arrays(*data.values()),
        names=data.keys(),
    )


def ensure(data: Any, dtype: Optional[str]) -> Any:
    """Ensure data to be 1D and have given data type."""
    if not is_list_like(data):
        data = [data]

    if isinstance(data, (pd.Index, pd.Series)):
        return type(data)(data, dtype=dtype, copy=False)  # type: ignore
    else:
        return pd.array(data, dtype=dtype, copy=False)


def items(field: Field) -> Iterable[tuple[Hashable, Any]]:
    """Generate default(s) of a field specification."""
    if field.has(Tag.MULTIPLE):
        yield from field.default.items()
    else:
        yield (name(field), field.default)


@overload
def name(fields: Field) -> Hashable: ...


@overload
def name(fields: Fields) -> Optional[Hashable]: ...


def name(fields: Any) -> Any:
    """Derive name of a field(s) specification."""
    if isinstance(fields, Field):
        if isinstance(name := fields.name, dict):
            return tuple(name.values())
        else:
            return name

    if isinstance(fields, Fields):
        for field in fields:
            if isinstance(name := field.name, dict):
                return tuple(name.keys())


def squeeze(data: TPandas) -> TPandas:
    """Drop levels of an index and columns if possible."""
    if data.index.nlevels == 1:
        data.index = data.index.get_level_values(0)

    if isinstance(data, pd.Series):
        return data  # type: ignore

    if data.columns.nlevels == 1:
        data.columns = data.columns.get_level_values(0)

    return data