Source code for dios.dios

from .base import _DiosBase, _is_dios_like
from .lib import Opts, OptsFields, dios_options
from .lib import _find_least_common_itype
from . import pandas_bridge as pdextra

import functools as ftools
import pandas as pd
import numpy as np


[docs]class DictOfSeries(_DiosBase):
    """ A data frame where every column has its own index.

    DictOfSeries is a collection of pd.Series objects that aims to behave as much like
    pd.DataFrame as possible. The advantage over pd.DataFrame is that every `column` has its
    own row-index, whereas a pd.DataFrame provides a single row-index for all columns. This
    solves problems with unaligned data and data which varies widely in length.

    Indexing with ``di[]``, ``di.loc[]`` and ``di.iloc[]``  should work analogous to these methods
    from pd.DataFrame. The indexer can be a single label, a slice, a list-like, a boolean list-like,
    or a boolean DictOfSeries/pd.DataFrame and can be used to selectively get or set data.

    Parameters
    ----------
    data : array-like, Iterable, dict, or scalar value
        Contains data stored in Series.

    columns : array-like
        Column labels to use for resulting frame. Will default to
        RangeIndex(0, 1, 2, ..., n) if no column labels are provided.

    index : Index or array-like
        Index to use to reindex every given series during init. Ignored if omitted.

    itype : Itype, pd.Index, Itype-string-repr or type
        Every series that is inserted must have an index of this type or of any
        of this type's subtypes.
        If None, the itype is inferred as soon as the first non-empty series is inserted.

    cast_policy : {'save', 'force', 'never'}, default 'save'
        Policy used for (down-)casting the index of a series if its type does not match
        the ``itype``.
    """

    def __init__(self, data=None, columns=None, index=None, itype=None, cast_policy='save', fastpath=False):
        """ Initialise the object by forwarding every argument, by keyword, to the parent class. """
        init_kwargs = dict(
            data=data,
            columns=columns,
            index=index,
            itype=itype,
            cast_policy=cast_policy,
            fastpath=fastpath,
        )
        super().__init__(**init_kwargs)

    @property
    def _constructor(self):
        """ Return the DictOfSeries class itself.

        The parent class uses this to build new objects, because importing
        DictOfSeries there would be a cyclic import.
        """
        cls = DictOfSeries
        return cls

    @property
    def indexes(self):
        """ Return a pandas.Series that holds the index of every column. """
        per_column_index = self.for_each('index')
        return per_column_index

    @property
    def values(self):
        """ Return a numpy.array of numpy.arrays with the values of all columns.

        The outer array has one entry per column, each entry holds the
        values of that column.
        """
        return self.for_each('values').values

    @property
    def dtypes(self):
        """ Return a pandas.Series that holds the dtype of every column. """
        per_column_dtype = self.for_each('dtype')
        return per_column_dtype

    @property
    def lengths(self):
        """ Return a pandas.Series with the length of every column. """
        column_store = self._data
        return column_store.apply(len)

    @property
    def size(self):
        """ Return the sum of the lengths of all columns. """
        total = self.lengths.sum()
        return total

    # ------------------------------------------------------------------------------
    # Dict-like methods

[docs]    def clear(self):
        d = self._data
        self._data = pd.Series(dtype=d.dtype, index=type(d.index)([]))

[docs]    def get(self, key, default=None):
        return self._data.get(key, default)

[docs]    def items(self):
        return self._data.items()

[docs]    def keys(self):
        return self.columns

[docs]    def pop(self, *args):
        # We support a default value, like dict, in contrary to pd.
        # Therefore we need to handle args manually, because dict-style pop()
        # differ between a single arg and a tuple-arg, with arg and default,
        # where the second arg can be anything, including None. If the key is
        # not present, and a single arg is given, a KeyError is raised, but
        # with a given default value, it is returned instead.
        if len(args) == 0:
            raise TypeError("pop expected at least 1 arguments, got 0")
        if len(args) > 2:
            raise TypeError(f"pop expected at most 2 arguments, got {len(args)}")
        key, *rest = args
        if key in self.columns:
            return self._data.pop(key)
        elif rest:
            return rest.pop()
        raise KeyError(key)

[docs]    def popitem(self):
        last = self.columns[-1]
        return last, self._data.pop(last)

[docs]    def setdefault(self, key, default=None):
        if key not in self.columns:
            self._insert(key, default)
        return self._data[key]

[docs]    def update(self, other):
        if not _is_dios_like(other):
            other = to_dios(other)
        self.aloc[other, ...] = other

    # ------------------------------------------------------------------------------
    # High-Level Iteration

[docs]    def iteritems(self):
        yield from self.items()

[docs]    def iterrows(self, fill_value=np.nan, squeeze=True):
        """
        Iterate over DictOfSeries rows as (index, pandas.Series/DictOfSeries) pairs.
        **MAY BE VERY PERFORMANCE AND/OR MEMORY EXPENSIVE**

        Parameters
        ----------
        fill_value: scalar, default numpy.nan
            Fill value for row entry, if the column does not have an entry
            at the current index location. This ensures that the returned
            Row always contain all columns. If ``None`` is given no value
            is filled.

            If ``fill_value=None`` and ``squeeze=True`` the resulting Row
            (a pandas.Series) may differ in length between iterator calls.
            That's because an entry, that is not present in a column, will
            also not be present in the resulting Row.

        squeeze: bool, default False
            * ``True`` : A pandas.Series is returned for each row.
            * ``False`` : A single-rowed DictOfSeries is returned for each row.

        Yields
        ------
        index : label
            The index of the row.
        data : Series or DictOfSeries
            The data of the row as a Series if squeeze is True, as
            a DictOfSeries otherwise.

        See Also
        --------
        DictOfSeries.iteritems : Iterate over (column name, Series) pairs.
        """

        # todo: 2nd posibility for fill_value=Any, squeeze=False
        #   do it like in case fill_value=None ->
        #       1. row = aloc the row
        #       2. e = row.isempty()
        #       3. row.loc[idx,e] = fill_value
        #   This approach could be much better, because the dtype of
        #   the columns is preserved.

        # PROBABLY PERFORMANCE EXPENSIVE
        if fill_value is None:
            allidx = self.index_of('all')
            if squeeze:
                for i in allidx:
                    yield i, self.aloc[i:i].dropempty().squeeze(axis=0)
            else:
                for i in allidx:
                    yield self.aloc[i:i]

        # PROBABLY MEMORY EXPENSIVE
        else:
            if fill_value is np.nan:
                df = self.to_df()
            else:
                nans = self.isna().to_df().fillna(False)
                df = self.to_df().fillna(fill_value)
                df[nans] = np.nan
            if squeeze:
                yield from df.iterrows()
            else:
                for idx, row in df.iterrows():
                    yield idx, DictOfSeries(data=row.to_dict(), index=[idx])

    # ------------------------------------------------------------------------------
    # Broadcasting and Reducing

[docs]    def for_each(self, attr_or_callable, **kwds):
        """
        Apply a callable or a pandas.Series method or property on each column.

        Parameters
        ----------
        attr_or_callable: Any
            A pandas.Series attribute or any callable, to apply on each column.
            A series attribute can be any property, field or method and also
            could be specified as string. If a callable is given it must take
            pandas.Series as the only positional argument.

        **kwds: any
            kwargs to passed to callable

        Returns
        -------
        pandas.Series
            A series with the results, indexed by the column labels.

        See Also
        --------
        DictOfSeries.apply : Apply functions to columns and convert
                             result to DictOfSeries.

        Examples
        --------
        >>> d = DictOfSeries([range(3), range(4)], columns=['a', 'b'])
        >>> d
           a |    b |
        ==== | ==== |
        0  0 | 0  0 |
        1  1 | 1  1 |
        2  2 | 2  2 |
             | 3  3 |

        Use with a callable..

        >>> d.for_each(max)
        columns
        a    2
        b    3
        dtype: object

        ..or with a string, denoting a pd.Series attribute and
        therefor is the same as giving the latter.

        >>> d.for_each('max')
        columns
        a    2
        b    3
        dtype: object

        >>> d.for_each(pd.Series.max)
        columns
        a    2
        b    3
        dtype: object

        Both also works with properties:

        >>> d.for_each('dtype')
        columns
        a    int64
        b    int64
        dtype: object
        """
        attrOcall = attr_or_callable
        if isinstance(attrOcall, str):
            attrOcall = getattr(pd.Series, attrOcall)
        call = callable(attrOcall)
        if not call:
            attrOcall = attr_or_callable
        data = pd.Series(dtype='O', index=self.columns)
        for c in self.columns:
            dat = self._data.at[c]
            if call:
                data.at[c] = attrOcall(dat, **kwds)
            else:
                data.at[c] = getattr(dat, attrOcall)
        return data

[docs]    def apply(self, func, axis=0, raw=False, args=(), **kwds):
        """
        Apply a function along an axis of the DictOfSeries.

        Parameters
        ----------
        func : callable
            Function to apply on each column.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Axis along which the function is applied:

            * 0 or 'index': apply function to each column.
            * 1 or 'columns': NOT IMPLEMENTED

        raw : bool, default False
            Determines if row or column is passed as a Series or ndarray object:

            * ``False`` : passes each row or column as a Series to the
              function.
            * ``True`` : the passed function will receive ndarray objects
              instead.
              If you are just applying a NumPy reduction function this will
              achieve much better performance.

        args : tuple
            Positional arguments to pass to `func` in addition to the
            array/series.
        **kwds
            Additional keyword arguments to pass as keywords arguments to
            `func`.

        Returns
        -------
        Series or DataFrame
            Result of applying ``func`` along the given axis of the
            DataFrame.

        Raises
        ------
        NotImplementedError
            * if axis is 'columns' or 1

        See Also
        --------
        DictOfSeries.for_each: apply pd.Series methods or properties to each column

        Examples
        --------

        We use the example DictOfSeries from :ref:`indexing <doc_indexing:Example dios>`.

        >>> di = di[:5]
            a |    b |     c |     d |
        ===== | ==== | ===== | ===== |
        0   0 | 2  5 | 4   7 | 6   0 |
        1   7 | 3  6 | 5  17 | 7   1 |
        2  14 | 4  7 | 6  27 | 8   2 |
        3  21 | 5  8 | 7  37 | 9   3 |
        4  28 | 6  9 | 8  47 | 10  4 |

        >>> di.apply(max)
        columns
        a    28
        b     9
        c    47
        d     4
        dtype: int64

        >>> di.apply(pd.Series.count)
        columns
        a    5
        b    5
        c    5
        d    5
        dtype: int64

        One can pass keyword arguments directly..

        >>> di.apply(pd.Series.value_counts, normalize=True)
              a |      b |       c |      d |
        ======= | ====== | ======= | ====== |
        7   0.2 | 7  0.2 | 7   0.2 | 4  0.2 |
        14  0.2 | 6  0.2 | 37  0.2 | 3  0.2 |
        21  0.2 | 5  0.2 | 47  0.2 | 2  0.2 |
        28  0.2 | 9  0.2 | 27  0.2 | 1  0.2 |
        0   0.2 | 8  0.2 | 17  0.2 | 0  0.2 |

        Or define a own funtion..

        >>> di.apply(lambda s : 'high' if max(s) > 10 else 'low')
        columns
        a    high
        b     low
        c    high
        d     low
        dtype: object

        And also more advanced functions that return a list-like can be given. Note that
        the returned lists not necessarily must have the same length.

        >>> func = lambda s : ('high', max(s), min(s)) if min(s) > (max(s)//2) else ('low',max(s))
        >>> di.apply(func)
             a |       b |      c |      d |
        ====== | ======= | ====== | ====== |
        0  low | 0  high | 0  low | 0  low |
        1   28 | 1     9 | 1   47 | 1    4 |
               | 2     5 |        |        |
              """
        if axis in [1, 'columns']:
            raise NotImplementedError

        elif axis in [0, 'index']:
            # we cannot use self._data.apply(func=func, args=args, **kwds)
            # because this may return a pandas.DataFrame. Also we cannot
            # use pandas.Series.apply(), because this works on its values.
            need_dios = need_convert = False
            result = pd.Series(dtype='O', index=self.columns)
            for c in self.columns:
                dat = self._data.at[c].values if raw else self._data.at[c]
                s = func(dat, *args, **kwds)
                result.at[c] = s
                if pdextra.is_scalar(s):
                    need_convert = True
                else:
                    need_dios = True
                    if not isinstance(s, pd.Series):
                        need_convert = True
            if need_dios:
                if need_convert:
                    for c in result.index:
                        result.at[c] = pd.Series(result[c])
                itype = _find_least_common_itype(result)
                result = DictOfSeries(data=result, itype=itype, fastpath=True)
        else:
            raise ValueError(axis)
        return result

[docs]    def reduce_columns(self, func, initial=None, skipna=False):
        """
        Reduce all columns to a single pandas.Series by a given function.

        Apply a function of two pandas.Series as arguments, cumulatively to all
        columns, from left to right, so as to reduce the columns to a single
        pandas.Series. If initial is present, it is placed before the columns
        in the calculation, and serves as a default when the columns are empty.

        Parameters
        ----------
        func : function
            The function must take two identically indexed pandas.Series and should
            return a single pandas.Series with the same index.

        initial : column-label or pd.Series, default None
            The series to start with. If None a dummy series is created, with the
            indices of all columns and the first seen values.

        skipna : bool, default False
               If True, skip NaN values.

        Returns
        -------
        pandas.Series
            A series with the reducing result and the index of the start series,
            defined by ``initializer``.
        """
        if initial is None:
            value = pd.Series(index=self.index_of('all'))
            for d in self._data:
                value = value.combine_first(d)
        elif isinstance(initial, pd.Series):
            value = initial.copy()
        elif initial in self.columns:
            value = self._data.at[initial].copy()
        else:
            raise ValueError("initial must be pd.Series, a column label or None")

        if skipna:
            val = value.dropna()
            data = self.dropna()._data
        else:
            val = value
            data = self._data

        for d in data:
            idx = val.index & d.index
            if len(idx) > 0:
                l, r = val.loc[idx], d.loc[idx]
                val.loc[idx] = func(l, r)

        if skipna:
            value.loc[val.index] = val
        return value

    # ------------------------------------------------------------------------------
    # Merging and Joining

[docs]    def combine_first(self, other, keepna=False):
        """
        Update null elements with value in the same location in other.
        
        Combine two DictOfSeries objects by filling null values in one DictOfSeries with
        non-null values from other DictOfSeries. The row and column indexes of the resulting
        DictOfSeries will be the union of the two.

        Parameters
        ----------
        keepna : bool, default False
            By default Nan's are updated by other and new value-index pairs from other are
            inserted. If set to True, NaN's are not updated and only new value-index pair are inserted.

        other : DictOfSeries
            Provided DictOfSeries to use to fill null values.

        Returns
        -------
        DictOfSeries
        """
        if keepna:
            nans = self.isna()

        new: DictOfSeries = self.copy()
        for c in other.columns:
            if c in self.columns:
                col = self._data.at[c].combine_first(other[c])
            else:
                col = other[c]
            new._data.at[c] = col

        if keepna:
            new.aloc[nans] = np.nan

        return new

    # ------------------------------------------------------------------------------
    # Misc methods

[docs]    def index_of(self, method='all'):
        """ Return an single index with indices from all columns.

        Parameters
        ----------
        method : string, default 'all'
            * 'all' : get all indices from all columns
            * 'union' : alias for 'all'
            * 'shared' : get indices that are present in every columns
            * 'intersection' : alias for 'shared'
            * 'uniques' : get indices that are only present in a single column
            * 'non-uniques' : get indices that are present in more than one column

        Returns
        -------
        pd.Index
            A single duplicate-free index, somehow representing indices of all columns.

        Examples
        --------
        We use the example DictOfSeries from :ref:`indexing <doc_indexing:Example dios>`.

        >>> di
            a |      b |      c |     d |
        ===== | ====== | ====== | ===== |
        0   0 | 2    5 | 4    7 | 6   0 |
        1   7 | 3    6 | 5   17 | 7   1 |
        2  14 | 4    7 | 6   27 | 8   2 |
        3  21 | 5    8 | 7   37 | 9   3 |
        4  28 | 6    9 | 8   47 | 10  4 |
        5  35 | 7   10 | 9   57 | 11  5 |
        6  42 | 8   11 | 10  67 | 12  6 |
        7  49 | 9   12 | 11  77 | 13  7 |
        8  56 | 10  13 | 12  87 | 14  8 |
        9  63 | 11  14 | 13  97 | 15  9 |

        >>> di.index_of()
        RangeIndex(start=0, stop=16, step=1)

        >>> di.index_of("shared")
        Int64Index([6, 7, 8, 9], dtype='int64')

        >>> di.index_of("uniques")
        Int64Index([0, 1, 14, 15], dtype='int64')
        """
        indexes = self.indexes
        if len(indexes) <= 1:
            return indexes.squeeze()

        if method in ['union', 'all']:
            res = ftools.reduce(pd.Index.union, indexes)
        elif method in ['intersection', 'shared']:
            res = ftools.reduce(pd.Index.intersection, indexes)
        elif method in ['uniques', 'non-uniques']:
            res = ftools.reduce(pd.Index.append, indexes)
            res = res.value_counts(sort=False, dropna=False)
            if method == 'uniques':
                res = res[res == 1].index
            else:
                res = res[res > 1].index
        else:
            raise ValueError(method)
        return res if res.is_unique else res.unique()

[docs]    def squeeze(self, axis=None):
        """ Squeeze a 1-dimensional axis objects into scalars. """
        if axis in [0, 'index']:
            if (self.lengths == 1).all():
                return self._data.apply(pd.Series.squeeze)
            return self
        elif axis in [1, 'columns']:
            if len(self) == 1:
                return self._data.squeeze()
            return self
        elif axis is None:
            if len(self) == 1:
                return self._data.squeeze().squeeze()
            if (self.lengths == 1).all():
                return self._data.apply(pd.Series.squeeze).squeeze()
            return self
        raise ValueError(axis)

[docs]    def dropna(self, inplace=False):
        """ Return a bolean array that is `True` if the value is a Nan-value """
        data = self.for_each('dropna', inplace=False)  # never pass inplace=True
        if inplace:
            self._data = data
        else:
            return DictOfSeries(data=data, itype=self.itype, cast_policy=self._policy, fastpath=True)

[docs]    def dropempty(self):
        """ Drop empty columns. Return copy. """
        return self.loc[:, self.notempty()]

[docs]    def astype(self, dtype, copy=True, errors='raise'):
        """ Cast the data to the given data type. """
        data = self.for_each('astype', dtype=dtype, copy=copy, errors=errors)
        return DictOfSeries(data=data, itype=self.itype, cast_policy=self._policy, fastpath=True)

    def _mask_or_where(self, cond, other=np.nan, inplace=False, mask=True):
        """ Shared implementation of ``mask`` and ``where``.

        With ``mask=True`` the locations where `cond` is True are overwritten
        with `other`, with ``mask=False`` the locations of the inverted `cond`.
        A callable `other` or `cond` is called with the data to modify. Works
        on `self` if `inplace` is True (None is returned then), on a copy
        otherwise.
        """
        target = self if inplace else self.copy()

        if callable(other):
            other = other(target)

        if callable(cond):
            cond = cond(target)
        # whether a DictOfSeries is boolean is already checked in aloc,
        # so only the other indexer types need to be checked here
        elif not _is_dios_like(cond) and not pdextra.is_bool_indexer(cond):
            raise ValueError("Object with boolean entries only expected for the condition")

        selector = cond if mask else ~cond
        target.aloc[selector] = other

        return None if inplace else target

[docs]    def where(self, cond, other=np.nan, inplace=False):
        """
        Replace values where the condition is False.

        Parameters
        ----------
        cond : bool DictOfSeries, Series, array-like, or callable
            Where cond is True, keep the original value. Where False, replace
            with corresponding value from other. If cond is callable, it is computed
            on the DictOfSeries and should return boolean DictOfSeries or array.
            The callable must not change input DictOfSeries (though dios doesn’t check it).
            If cond is a bool Series, every column is (row-)aligned against it, before the
            boolean values are evaluated. Missing indices are treated like False values.

        other : scalar, Series, DictOfSeries, or callable
            Entries where cond is False are replaced with corresponding value from other.
            If other is callable, it is computed on the DictOfSeries and should return scalar
            or DictOfSeries. The callable must not change input DictOfSeries (though dios doesn’t check it).
            If other is a Series, every column is (row-)aligned against it, before the values
            are written. NAN's are written for missing indices.

        inplace : bool, default False
            Whether to perform the operation in place on the data.

        Returns
        -------
        DictOfSeries

        See Also
        --------
        mask: Mask data where condition is True
        """
        return self._mask_or_where(cond=cond, other=other, inplace=inplace, mask=False)

[docs]    def mask(self, cond, other=np.nan, inplace=False):
        """
        Replace values where the condition is True.

        Parameters
        ----------
        cond : bool DictOfSeries, Series, array-like, or callable
            Where cond is False, keep the original value. Where True, replace
            with corresponding value from other. If cond is callable, it is computed
            on the DictOfSeries and should return boolean DictOfSeries or array.
            The callable must not change input DictOfSeries (though dios doesn’t check it).
            If cond is a bool Series, every column is (row-)aligned against it, before the
            boolean values are evaluated. Missing indices are treated like False values.

        other : scalar, Series, DictOfSeries, or callable
            Entries where cond is True are replaced with corresponding value from other.
            If other is callable, it is computed on the DictOfSeries and should return scalar
            or DictOfSeries. The callable must not change input DictOfSeries (though dios doesn’t check it).
            If other is a Series, every column is (row-)aligned against it, before the values
            are written. NAN's are written for missing indices.

        inplace : bool, default False
            Whether to perform the operation in place on the data.

        Returns
        -------
        DictOfSeries

        See Also
        --------
        mask: Mask data where condition is False
        """
        return self._mask_or_where(cond=cond, other=other, inplace=inplace, mask=True)

[docs]    def memory_usage(self, index=True, deep=False):
        return self.for_each(pd.Series.memory_usage, index=index, deep=deep).sum()

[docs]    def to_df(self, how='outer'):
        """
        Transform DictOfSeries to a pandas.DataFrame.

        Because a pandas.DataFrame can not handle Series of different
        length, but DictOfSeries can, the missing data is filled with
        NaNs or is dropped, depending on the keyword `how`.

        Parameters
        ----------
        how: {'outer', 'inner'}, default 'outer'
            define how the resulting DataFrame index is generated:
            * 'outer': The indices of all columns, merged into one index is used.
                If a column misses values at the new index location, `NaN`s are filled.
            * 'inner': Only indices that are present in all columns are used, filling
                logic is not needed, but values are dropped, if a column has indices
                that are not known to all other columns.

        Returns
        -------
        pandas.DataFrame: transformed data
        
        Examples
        --------

        Missing data locations are filled with NaN's

        >>> a = pd.Series(11, index=range(2))
        >>> b = pd.Series(22, index=range(3))
        >>> c = pd.Series(33, index=range(1,9,3))
        >>> di = DictOfSeries(dict(a=a, b=b, c=c))
        >>> di
            a |     b |     c |
        ===== | ===== | ===== |
        0  11 | 0  22 | 1  33 |
        1  11 | 1  22 | 4  33 |
              | 2  22 | 7  33 |

        >>> di.to_df()
        columns     a     b     c
        0        11.0  22.0   NaN
        1        11.0  22.0  33.0
        2         NaN  22.0   NaN
        4         NaN   NaN  33.0
        7         NaN   NaN  33.0
        
        or is dropped if `how='inner'`
        
        >>> di.to_df(how='inner')
        columns   a   b   c
        1        11  22  33
        """
        if how == 'inner':
            how = 'shared'
        elif how == 'outer':
            how = 'all'
        else:
            raise ValueError(how)

        index = self.index_of(how)
        df = pd.DataFrame(columns=self.columns, index=index)
        for c in self.columns:
            # this automatically respects the df-index, that
            # was set before. Missing locations are already
            # nans, present locations are set.
            df[c] = self._data.at[c]
        return df

    @property
    def debugDf(self):
        """ Property alias for ``to_df()`` with default arguments, meant for debugging. """
        frame = self.to_df()
        return frame

[docs]    def min(self, axis=0, skipna=True):
        if axis in [0, 'index', None]:
            return self.for_each(pd.Series.min, skipna=skipna)
        elif axis in [1, 'columns']:
            func = lambda s1, s2: s1.where(s1 < s2, s2)
            return self.reduce_columns(func, skipna=skipna)
        raise ValueError(axis)

[docs]    def max(self, axis=None, skipna=None):
        if axis in [0, 'index', None]:
            return self.for_each(pd.Series.min, skipna=skipna)
        elif axis in [1, 'columns']:
            func = lambda s1, s2: s1.where(s1 > s2, s2)
            return self.reduce_columns(func, skipna=skipna)
        raise ValueError(axis)

    # ----------------------------------------------------------------------
    # Boolean and empty stuff

[docs]    def equals(self, other):
        """
        Test whether two DictOfSeries contain the same elements.
        
        This function allows two DictOfSeries to be compared against each other to see
        if they have the same shape and elements. NaNs in the same location are considered equal.
        The column headers do not need to have the same type, but the elements within the columns
        must be the same dtype.

        Parameters
        ----------
        other: DictOfSeries
            The other DictOfSeries to compare with.

        Returns
        -------
        bool
            True if all elements are the same in both DictOfSeries, False otherwise.
        """
        if not isinstance(other, DictOfSeries):
            return False
        try:
            eq_nans = (self.isna() == other.isna()).all(None)
            eq_data = (self.dropna() == other.dropna()).all(None)
            eq_dtypes = (self.dtypes == other.dtypes).all()
            return eq_nans and eq_dtypes and eq_data
        except Exception:
            return False

[docs]    def isin(self, values):
        """ Return a boolean dios, that indicates if the corresponding value is in the given array-like. """
        data = self.for_each('isin', values=values)
        return DictOfSeries(data=data, itype=self.itype, cast_policy=self._policy, fastpath=True)

[docs]    def all(self, axis=0):
        """
        Return whether all elements are True, potentially over an axis.

        Returns True unless there at least one element within a series
        or along a DictOfSeries axis that is False or equivalent (e.g. zero or empty).

        Parameters
        ----------
        axis : {0 or ‘index’, 1 or ‘columns’, None}, default 0
            Indicate which axis or axes should be reduced.
             * 0 / ‘index’ : reduce the index, return a Series whose index is the original column labels.
             * 1 / ‘columns’ : reduce the columns, return a Series whose index is the union of all columns indexes.
             * None : reduce all axes, return a scalar.

        Returns
        -------
        pandas.Series

        See Also
        --------
        pandas.Series.all: Return True if all elements are True.
        any: Return True if one (or more) elements are True.
        """
        if axis in [0, 'index']:
            return self._data.apply(all)
        elif axis in [1, 'columns']:
            func = lambda s1, s2: s1.astype(bool) & s2.astype(bool)
            init = pd.Series(True, dtype=bool, index=self.index_of('all'))
            return self.reduce_columns(func, init)
        elif axis is None:
            return self._data.apply(all).all()
        raise ValueError(axis)

[docs]    def any(self, axis=0):
        """
        Return whether any element is True, potentially over an axis.

        Returns False unless there at least one element within a series
        or along a DictOfSeries axis that is True or equivalent (e.g. non-zero or non-empty).

        Parameters
        ----------
        axis : {0 or ‘index’, 1 or ‘columns’, None}, default 0
            Indicate which axis or axes should be reduced.
             * 0 / ‘index’ : reduce the index, return a Series whose index is the original column labels.
             * 1 / ‘columns’ : reduce the columns, return a Series whose index is the union of all columns indexes.
             * None : reduce all axes, return a scalar.

        Returns
        -------
        pandas.Series

        See Also
        --------
        pandas.Series.any: Return whether any element is True.
        all: Return True if all elements are True.
        """
        if axis in [0, 'index']:
            return self._data.apply(any)
        elif axis in [1, 'columns']:
            func = lambda s1, s2: s1.astype(bool) | s2.astype(bool)
            init = pd.Series(False, dtype=bool, index=self.index_of('all'))
            return self.reduce_columns(func, init)
        elif axis is None:
            return self._data.apply(any).any()
        raise ValueError(axis)

[docs]    def isna(self, drop_empty=False):
        """
        Return a boolean DictOfSeries which indicates NA positions.
        """
        data = self.dropempty() if drop_empty else self
        data = data.for_each('isna')
        return DictOfSeries(data=data, itype=self.itype, cast_policy=self._policy, fastpath=True)

[docs]    def notna(self, drop_empty=False):
        """
        Return a boolean DictOfSeries which indicates non-NA positions.
        """
        data = self.dropempty() if drop_empty else self
        data = data.for_each('notna')
        return DictOfSeries(data=data, itype=self.itype, cast_policy=self._policy, fastpath=True)

[docs]    def hasnans(self, axis=0, drop_empty=False):
        """
        Returns a boolean Series along an axis, which indicates if it contains NA-entries.
        """
        data = self.dropempty() if drop_empty else self
        if axis in [0, 'index']:
            return data.for_each('hasnans')
        elif axis in [1, 'columns']:
            func = lambda s1, s2: s1.isna() | s2.isna()
            init = pd.Series(False, dtype=bool, index=self.index_of('all'))
            return data.reduce_columns(func, init)
        elif axis is None:
            return self.isna(drop_empty=drop_empty)
        raise ValueError(axis)

[docs]    def isempty(self):
        """ Returns a boolean Series, which indicates if an column is empty """
        return self.for_each('empty').astype(bool)

[docs]    def notempty(self):
        """ Returns a boolean Series, which indicates if an column is not empty """
        return ~ self.isempty()

[docs]    def isdata(self):
        """ Alias for ``notna(drop_empty=True)``. """
        return self.notna(drop_empty=True)

[docs]    def isnull(self, drop_empty=False):
        """ Alias for ``isna()`` """
        return self.isna(drop_empty=drop_empty)

[docs]    def notnull(self, drop_empty=False):
        """ Alias, see ``notna()``. """
        return self.notna(drop_empty=drop_empty)

[docs]    def to_dios(self):
        """
        A dummy to allow unconditional to_dios calls
        on pd.DataFrame, pd.Series and dios.DictOfSeries
        """
        return self

    # ----------------------------------------------------------------------
    # Rendering Methods

    def __str__(self):
        """ Return the same text as ``__repr__``. """
        rendered = self.__repr__()
        return rendered

    def __repr__(self):
        """
        Render the dios with the representation method configured in ``dios_options``.

        The dimension footer is requested only if the longest column has more
        rows than the configured ``disp_max_rows``.
        """
        render_method = dios_options[OptsFields.dios_repr]
        longest = self.lengths.max()
        show_dims = longest > dios_options[OptsFields.disp_max_rows]
        return self.to_string(method=render_method, show_dimensions=show_dims)

[docs]    def to_string(self, max_rows=None, min_rows=None, max_cols=None,
                  na_rep='NaN', show_dimensions=False,
                  method=Opts.repr_indexed,
                  no_value=' ', empty_series_rep='no data',
                  col_delim=' | ', header_delim='=', col_space=None, ):
        """ Pretty print a dios.

        if `method` == `indexed` (default):
            every column is represented by a own index and corresponding values

        if `method` == `aligned` [2]:
            one(!) global index is generated and values from a column appear at
            the corresponding index-location.

        Parameters
        ---------

        max_cols :
            not more column than `max_cols` are printed [1]

        max_rows :
            see `min_rows` [1]

        min_rows :
            not more rows than `min_rows` are printed, if rows of any series exceed `max_rows` [1]

        na_rep :
            all NaN-values are replaced by `na_rep`. Default `NaN`

        empty_series_rep :
            Ignored if not `method='indexed'`.
            Empty series are represented by the string in `empty_series_rep`

        col_delim : str
            Ignored if not `method='indexed'`.
            between all columns `col_delim` is inserted.

        header_delim :
            Ignored if not `method='indexed'`.
            between the column names (header) and the data, `header_delim` is inserted,
            if not None. The string is repeated, up to the width of the column. (str or None).

        no_value :
            Ignored if not `method='aligned'`.
            value that indicates, that no entry in the underling series is present. Bear in mind
            that this should differ from `na_rep`, otherwise you cannot differ missing- from NaN- values.

        Notes
        -----
            [1]: defaults to the corresponding value in `dios_options`
            [2]: the common-params are directly passed to pd.DataFrame.to_string(..)
            under the hood, if method is `aligned`

        """
        if self.empty:
            return _empty_repr(self)

        max_cols = max_cols or dios_options[OptsFields.disp_max_cols] or 100
        max_rows = max_rows or dios_options[OptsFields.disp_max_rows] or 200
        min_rows = min_rows or dios_options[OptsFields.disp_min_rows] or 100

        kwargs = dict(max_rows=max_rows, min_rows=min_rows, max_cols=max_cols,
                      na_rep=na_rep, col_space=col_space, show_dimensions=show_dimensions)

        if method == Opts.repr_aligned:
            return _to_aligned_df(self, no_value=no_value).to_string(**kwargs)

        # add pprint relevant options
        kwargs.update(empty_series_rep=empty_series_rep,
                      col_delim=col_delim, header_delim=header_delim)

        return pprint_dios(self, **kwargs)

[docs]    def to_csv(self, *args, **kwargs):
        self.to_df().to_csv(*args, **kwargs)

    to_csv.__doc__ = pd.DataFrame.to_csv.__doc__


def _empty_repr(di):
    """ Return the two-line text used for an empty dios: a fixed title and the list of column labels. """
    column_labels = di.columns.to_list()
    return "Empty DictOfSeries\n" + f"Columns: {column_labels}"


def pprint_dios(dios,
                max_rows=None, min_rows=None, max_cols=None,
                na_rep="NaN", empty_series_rep='no data',
                col_space=None, show_dimensions=True,
                col_delim=' | ', header_delim='='
                ):
    """ Render a dios as text, every column side by side with its own index.

    Parameters
    ----------
    dios : DictOfSeries
        The object to print.

    max_rows : int or None
        If any printed series is longer than `max_rows`, all series are trimmed
        to `min_rows` rows. None disables row truncation.

    min_rows : int or None
        Number of rows to show once truncation applies. Values above `max_rows`
        are capped to `max_rows`; None means `max_rows`.

    max_cols : int or None
        If the dios has more columns, only the first and last `max_cols // 2`
        columns are printed, separated by a column of dots.
        None disables column truncation.

    na_rep :
        Representation of NaN values, converted with `str`.

    empty_series_rep :
        Representation of an empty series, converted with `str`.

    col_space : int or None
        Minimum column width. For non-empty series half of it is handed to the
        underlying frame formatter, for empty series the full value.

    show_dimensions : bool
        If True, a footer with the length of every printed series and the
        overall dimensions is appended.

    col_delim : str
        String appended after every column. A falsy value is replaced by a single space.

    header_delim : str or None
        If truthy, it is repeated to the column width and inserted as a line
        between the header and the values.

    Returns
    -------
    str
    """
    na_rep = str(na_rep)
    empty_series_rep = str(empty_series_rep)
    col_delim = col_delim or ' '

    # ``None`` limits mean "no truncation"; comparing them with min() would raise.
    if max_rows is not None:
        min_rows = max_rows if min_rows is None else min(max_rows, min_rows)

    if dios.empty:
        return _empty_repr(dios)

    maxlen = dios.lengths.max()
    data = dios._data

    trunc_cols = max_cols is not None and len(data) > max_cols
    half = max_cols // 2 if trunc_cols else None
    if trunc_cols:
        # pd.concat instead of Series.append, which was removed in pandas 2.0
        data = pd.concat([data.head(half), data.tail(half)])

    # now data only contains series that we want to print.

    # if any series exceed max_rows we trim all series to min_rows
    series_lengths = data.apply(len).to_list()
    # default=0: with max_cols < 2 no column is left to print
    series_maxlen = max(series_lengths, default=0)
    trunc_rows = max_rows is not None and series_maxlen > max_rows
    max_rows = min_rows if trunc_rows else series_maxlen

    # we make a list of lists, where the inner contains all
    # stringified values of the series up to max_rows+1, where
    # the additional row is the column-name
    outer = []
    for colname, s in data.items():
        isempty = s.empty
        if isempty:
            s = pd.Series(empty_series_rep)
            idx = False
            cspace = col_space
        else:
            idx = True
            cspace = col_space // 2 if col_space else col_space

        sstr = s.to_frame().to_string(col_space=cspace,
                                      header=[str(colname)],
                                      index=idx,
                                      na_rep=na_rep,
                                      max_rows=max_rows,
                                      min_rows=min_rows,
                                      )
        li = sstr.split('\n')

        # HACK: empty series produce an unnecessary space,
        # because the index is omitted
        if isempty:
            cstr, vstr = li
            if len(cstr.lstrip()) < len(vstr) and (cspace or 0) < len(vstr):
                li = [cstr[1:], vstr[1:]]

        outer.append(li)

    # now the length of every value-string per series is the same.
    # we need these lengths to know how many chars we need to fill,
    # once we exceed the length of the series, or if we insert whole
    # columns.
    valstr_len = [len(c[0]) for c in outer]

    rows = max_rows + 1  # colnames aka. header
    rows += 1 if trunc_rows else 0  # `...` in rows
    rows += 1 if header_delim else 0  # underline header

    if header_delim:
        for width, c in zip(valstr_len, outer):
            c.insert(1, (header_delim * width)[:width])

    dots = ' ... '
    if trunc_cols:
        # keep all three per-column lists aligned with the inserted dot-column
        outer.insert(half, [dots] * rows)
        valstr_len.insert(half, len(dots))
        series_lengths.insert(half, rows)

    # assemble row by row; columns that ran out of lines are padded with blanks
    lines = []
    for r in range(rows):
        cells = [
            c[r] if r < len(c) else ' ' * valstr_len[i]
            for i, c in enumerate(outer)
        ]
        lines.append(''.join(cell + col_delim for cell in cells) + '\n')
    txt = ''.join(lines)

    # add footer
    if show_dimensions:
        footer = []
        for i in range(len(outer)):
            # ignore the dot-column
            if trunc_cols and i == half:
                footer.append(dots + ' ' * len(col_delim))
            else:
                footer.append(f"[{series_lengths[i]}]".ljust(valstr_len[i] + len(col_delim)))

        txt += ''.join(footer)
        txt += f'\n\nmax: [{maxlen} rows x {len(dios.columns)} columns]'
        txt += '\n'

    return txt


def _to_aligned_df(dios, no_value=' '):
    """
    Convert a dios to a DataFrame in which absent entries and real NaNs stay distinguishable.

    Positions that are NA only in the frame returned by ``to_df()`` are filled
    with `no_value`, while NaNs that are present in the original columns are kept as NaN.

    Parameters
    ----------
    dios : DictOfSeries
    no_value :
        Filler for positions at which a column has no entry.

    Returns
    -------
    pandas.DataFrame
    """
    if dios.empty:
        return pd.DataFrame(columns=dios.columns)

    def nan_labels(series):
        # index labels at which the series itself holds a NaN
        flags = series.isna()
        return flags[flags].index

    # remember the real NaNs before the frame conversion introduces new ones
    real_nans = {col: nan_labels(dios[col]) for col in dios}

    frame = dios.to_df()
    frame[frame.isna()] = no_value

    # the fill above also overwrote the real NaNs, put them back
    for col in frame:
        frame.loc[real_nans[col], col] = np.nan

    return frame


def to_dios(obj) -> DictOfSeries:
    """
    Return `obj` as a DictOfSeries.

    A DictOfSeries is handed back as it is (no copy), anything else is
    passed as ``data`` to the DictOfSeries constructor.
    """
    return obj if isinstance(obj, DictOfSeries) else DictOfSeries(data=obj)


def __monkey_patch_pandas():
    """ Attach a ``to_dios()`` method to pd.Series and pd.DataFrame. """

    def to_dios(self):
        return DictOfSeries(data=self)

    # both pandas types share the very same function object
    for pandas_type in (pd.Series, pd.DataFrame):
        pandas_type.to_dios = to_dios


# executed once at import time
__monkey_patch_pandas()