Source code for ibex._base

from __future__ import absolute_import


import string
import collections
import functools
import itertools
import os
import threading

import pandas as pd
from sklearn import base
from sklearn import pipeline
from sklearn.externals import joblib
try:
    from sklearn.exceptions import NotFittedError
except ImportError:
    from sklearn.utils.validation import NotFittedError # Older Versions

from ._utils import verify_x_type, verify_y_type


__all__ = []


def _get_iris_example_doc_preamble_(
        is_regressor,
        is_classifier,
        is_transformer,
        is_clusterer,
        indent):

    if is_classifier:
        return  """
        Example:

            >>> import numpy as np
            >>> from sklearn import datasets
            >>> import pandas as pd
            >>>
            >>> iris = datasets.load_iris()
            >>> features, targets = iris['feature_names'], iris['target_names']
            >>> iris = pd.DataFrame(
            ...     np.c_[iris['data'], iris['target']],
            ...     columns=features+['class'])
            >>> iris['class'] = iris['class'].map(pd.Series(targets))
            >>>
            >>> iris.head()
               sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm) \
            0                5.1               3.5                1.4               0.2
            1                4.9               3.0                1.4               0.2
            2                4.7               3.2                1.3               0.2
            3                4.6               3.1                1.5               0.2
            4                5.0               3.6                1.4               0.2
            <BLANKLINE>
                class
            0  setosa
            1  setosa
            2  setosa
            3  setosa
            4  setosa

        """

    return """
        Example:

            >>> import pandas as pd
            >>> import numpy as np
            >>> from ibex.sklearn import datasets
            >>> from ibex.sklearn.linear_model import LinearRegression as PdLinearRegression

            >>> iris = datasets.load_iris()
            >>> features = iris['feature_names']
            >>> iris = pd.DataFrame(
            ...     np.c_[iris['data'], iris['target']],
            ...     columns=features+['class'])

            >>> iris[features]
               sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
            0                5.1               3.5                1.4               0.2
            1                4.9               3.0                1.4               0.2
            2                4.7               3.2                1.3               0.2
            3                4.6               3.1                1.5               0.2
            4                5.0               3.6                1.4               0.2
            ...

    """


def _get_fit_doc(
        orig,
        name,
        est,
        kwargs,
        is_regressor,
        is_classifier,
        is_transformer,
        is_clusterer,
        has_dataframe_y):

    doc = _get_iris_example_doc_preamble_(
        is_regressor,
        is_classifier,
        is_transformer,
        is_clusterer,
        indent=0)
    doc += string.Template(r"""
        >>>
        >>> from ibex.sklearn import $orig as pd_$orig
        >>>
        >>> prd = pd_$orig.$name($kwargs).fit(iris[features], iris['class'])

    """).substitute({
        'orig': orig,
        'name': name,
        'est': est,
        'kwargs': kwargs,
        'is_regressor': is_regressor,
        'is_classifier': is_classifier,
        'is_transformer': is_transformer,
        'is_clusterer': is_clusterer})


    if has_dataframe_y:
        doc += string.Template(r"""

    Example:

        >>> from ibex.sklearn import $orig as pd_$orig
        >>>
        >>> prd = pd_$orig.$name($kwargs).fit(iris[features], iris['class'])

    """).substitute({
        'orig': orig,
        'name': name,
        'est': est,
        'kwargs': kwargs,
        'is_regressor': is_regressor,
        'is_classifier': is_classifier,
        'is_transformer': is_transformer,
        'is_clusterer': is_clusterer})

    return doc
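

# A minimal sketch (not part of the original module) of how the two helpers
# above compose a docstring; the argument values here are illustrative only.
def _example_get_fit_doc():
    doc = _get_fit_doc(
        orig='linear_model',
        name='LinearRegression',
        est=None,
        kwargs='',
        is_regressor=True,
        is_classifier=False,
        is_transformer=False,
        is_clusterer=False,
        has_dataframe_y=False)
    # The result is the iris preamble followed by a short fit example.
    assert '>>> iris = datasets.load_iris()' in doc
    return doc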


# Tmp Ami - unit tests, docs
def _make_pipeline_steps(objs):
    # Build (name, estimator) step tuples, deriving each name from the
    # estimator's lowercased class name. Names that would otherwise repeat
    # get a numeric suffix ('_0', '_1', ...) to keep them unique.
    names = [type(o).__name__.lower() for o in objs]
    name_counts = collections.Counter(names)
    name_inds = name_counts.copy()
    unique_names = []
    for name in names:
        if name_counts[name] > 1:
            unique_names.append(name + '_' + str(name_counts[name] - name_inds[name]))
            name_inds[name] -= 1
        else:
            unique_names.append(name)

    return list(zip(unique_names, objs))
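

# Illustrative sketch (not in the original source): how duplicate estimator
# types receive suffixed step names while unique types keep their plain name.
def _example_make_pipeline_steps():
    from sklearn.preprocessing import StandardScaler, MaxAbsScaler

    steps = _make_pipeline_steps([StandardScaler(), StandardScaler(), MaxAbsScaler()])
    names = [name for name, _ in steps]
    # The two StandardScaler objects collide, so they become 'standardscaler_0'
    # and 'standardscaler_1'; the single MaxAbsScaler keeps 'maxabsscaler'.
    assert names == ['standardscaler_0', 'standardscaler_1', 'maxabsscaler']
    return steps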


class FrameMixin(object):
    """
    A base class for steps taking pandas entities, not numpy entities.

    Subclass this step to indicate that a step takes pandas entities.

    Example:

        This is a simple, illustrative "identity" transformer, which simply
        relays its input.

        >>> import pandas as pd
        >>> from sklearn import base
        >>> import ibex
        >>>
        >>> class Id(
        ...     base.BaseEstimator, # (1)
        ...     base.TransformerMixin, # (2)
        ...     ibex.FrameMixin): # (3)
        ...
        ...     def fit(self, X, y=None):
        ...         self.x_columns = X.columns # (4)
        ...         if y is not None and isinstance(y, pd.DataFrame):
        ...             self.y_columns = y.columns
        ...         return self
        ...
        ...     def transform(self, X, *args, **kwargs):
        ...         return X[self.x_columns] # (5)

        Note the following general points:

        1. We subclass :class:`sklearn.base.BaseEstimator`, as this is an
           estimator.

        2. We subclass :class:`sklearn.base.TransformerMixin`, as, in this
           case, this is specifically a transformer.

        3. We subclass :class:`ibex.FrameMixin`, as this estimator deals with
           ``pandas`` entities.

        4. In ``fit``, we make sure to set :py:attr:`ibex.FrameMixin.x_columns`
           and, if relevant, :py:attr:`ibex.FrameMixin.y_columns` (if ``y`` is
           a :class:`pandas.DataFrame`); this will ensure that the transformer
           will "remember" the columns it should see in further calls.

        5. In ``transform``, we first use ``x_columns``. This will verify the
           columns of ``X``, and also reorder them according to the original
           order seen in ``fit`` (if needed).

        Suppose we define two :class:`pandas.DataFrame` objects, ``X_1`` and
        ``X_2``, with different columns:

        >>> import pandas as pd
        >>>
        >>> X_1 = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]})
        >>> X_2 = X_1.rename(columns={'b': 'd'})

        The following ``fit``-``transform`` combination will work:

        >>> Id().fit(X_1).transform(X_1)
           a  b
        0  1  3
        1  2  4
        2  3  5

        The following ``fit``-``transform`` combination will fail:

        >>> try:
        ...     Id().fit(X_1).transform(X_2)
        ... except KeyError:
        ...     print('caught')
        caught

        The following ``transform`` will fail, as the estimator was not fitted:

        >>> try:
        ...     from sklearn.exceptions import NotFittedError
        ... except ImportError:
        ...     from sklearn.utils.validation import NotFittedError # older versions
        >>> try:
        ...     Id().transform(X_2)
        ... except NotFittedError:
        ...     print('caught')
        caught

        Steps can be piped into each other:

        >>> (Id() | Id()).fit(X_1).transform(X_1)
           a  b
        0  1  3
        1  2  4
        2  3  5

        Steps can be added:

        >>> (Id() + Id()).fit(X_1).transform(X_1)
          id_0    id_1
             a  b    a  b
        0    1  3    1  3
        1    2  4    2  4
        2    3  5    3  5
    """

    @property
    def x_columns(self):
        """
        The ``X`` columns set in the last call to ``fit``.

        Set this property in ``fit``, and query it in other methods.
        """
        try:
            return self.__x_cols
        except AttributeError:
            raise NotFittedError()

    @x_columns.setter
    def x_columns(self, columns):
        self.__x_cols = columns

    @property
    def y_columns(self):
        """
        The ``y`` columns set in the last call to ``fit``.

        Set this property in ``fit``, and query it in other methods.

        .. versionadded:: 0.1.2
        """
        try:
            return self.__y_cols
        except AttributeError:
            raise NotFittedError()

    @y_columns.setter
    def y_columns(self, columns):
        self.__y_cols = columns
    def __or__(self, other):
        """
        Pipes the result of this step to ``other``.

        Arguments:
            other: A different step object whose class subclasses this one.

        Returns:
            :py:class:`ibex.sklearn.pipeline.Pipeline`
        """
        if isinstance(self, Pipeline):
            selfs = [e[1] for e in self.steps]
        else:
            selfs = [self]

        if isinstance(other, Pipeline):
            others = [e[1] for e in other.steps]
        else:
            others = [other]

        combined = selfs + others

        return Pipeline(_make_pipeline_steps(combined))
    def __add__(self, other):
        """
        Combines this step with ``other`` into a feature union.

        Returns:
            :py:class:`ibex.sklearn.pipeline.FeatureUnion`
        """
        if isinstance(self, FeatureUnion):
            self_features = [e[1] for e in self.transformer_list]
        else:
            self_features = [self]

        if isinstance(other, FeatureUnion):
            other_features = [e[1] for e in other.transformer_list]
        else:
            other_features = [other]

        combined = self_features + other_features

        return FeatureUnion(_make_pipeline_steps(combined))
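

# A hedged sketch (not part of the original source) of the operator overloads
# above, reusing the illustrative Id transformer from the FrameMixin docstring:
# `|` flattens existing pipelines instead of nesting them, so chaining three
# steps yields a single three-step Pipeline.
def _example_operator_composition():
    import pandas as pd
    from sklearn import base

    class Id(base.BaseEstimator, base.TransformerMixin, FrameMixin):
        def fit(self, X, y=None):
            self.x_columns = X.columns
            return self

        def transform(self, X, *args, **kwargs):
            return X[self.x_columns]

    X = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]})
    pipe = Id() | Id() | Id()
    # The result is one Pipeline with three uniquely-named steps, not a
    # Pipeline nested inside another Pipeline.
    assert len(pipe.steps) == 3
    assert [name for name, _ in pipe.steps] == ['id_0', 'id_1', 'id_2']
    return pipe.fit(X).transform(X)

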
__all__ += ['FrameMixin']


def _transform(transformer, weight, X, *args, **kwargs):
    # Applies the transformer, scaling the result by the (optional)
    # per-transformer weight.
    res = transformer.transform(X, *args, **kwargs)
    if weight is not None:
        res *= weight
    return res


def _fit_transform(transformer, weight, X, y, *args, **kwargs):
    # Fits and transforms in one shot if the transformer supports it,
    # otherwise falls back to fit followed by transform; the result is
    # scaled by the (optional) per-transformer weight.
    if hasattr(transformer, 'fit_transform'):
        res = transformer.fit_transform(X, y, *args, **kwargs)
    else:
        res = transformer.fit(X, y, *args, **kwargs).transform(X)
    if weight is not None:
        res *= weight
    return res
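

# Small sketch (assumption: a plain numpy result from a scikit-learn scaler)
# of the weighting performed by the helpers above: a weight of 2 simply
# doubles the transformed values.
def _example_weighted_transform():
    import numpy as np
    from sklearn.preprocessing import MaxAbsScaler

    X = np.array([[1.0], [2.0], [4.0]])
    unweighted = _fit_transform(MaxAbsScaler(), None, X, None)
    weighted = _fit_transform(MaxAbsScaler(), 2, X, None)
    assert np.allclose(weighted, 2 * unweighted)
    return weighted

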
class FeatureUnion(pipeline.FeatureUnion, base.TransformerMixin, FrameMixin):
    """
    Concatenates results of multiple transformer objects.

    This estimator applies a list of transformer objects in parallel to the
    input data, then concatenates the results. This is useful to combine
    several feature extraction mechanisms into a single transformer.

    Arguments:

        transformer_list: list of (string, transformer) tuples.
            List of transformer objects to be applied to the data.
            The first half of each tuple is the name of the transformer.

        n_jobs: int, optional.
            Number of jobs to run in parallel (default 1).

        transformer_weights: dict, optional.
            Multiplicative weights for features per transformer.
            Keys are transformer names, values the weights.

    Example:

        >>> import pandas as pd
        >>> X = pd.DataFrame({'a': [1, 2, 3], 'b': [10, -3, 4]})

        >>> from ibex.sklearn import preprocessing as pd_preprocessing
        >>> from ibex.sklearn import pipeline as pd_pipeline

        >>> trn = pd_pipeline.FeatureUnion([
        ...     ('std', pd_preprocessing.StandardScaler()),
        ...     ('abs', pd_preprocessing.MaxAbsScaler())])
        >>> trn.fit_transform(X)
                std                 abs
                  a         b         a    b
        0 -1.224745  1.192166  0.333333  1.0
        1  0.000000 -1.254912  0.666667 -0.3
        2  1.224745  0.062746  1.000000  0.4

        >>> from ibex import trans
        >>>
        >>> trn = pd_preprocessing.StandardScaler() + pd_preprocessing.MaxAbsScaler()
        >>> trn.fit_transform(X)
          standardscaler           maxabsscaler
                       a         b            a    b
        0      -1.224745  1.192166     0.333333  1.0
        1       0.000000 -1.254912     0.666667 -0.3
        2       1.224745  0.062746     1.000000  0.4

        >>> trn = trans(pd_preprocessing.StandardScaler(), out_cols=['std_scale_a', 'std_scale_b'])
        >>> trn += trans(pd_preprocessing.MaxAbsScaler(), out_cols=['max_scale_a', 'max_scale_b'])
        >>> trn.fit_transform(X)
          functiontransformer_0             functiontransformer_1
                    std_scale_a std_scale_b           max_scale_a max_scale_b
        0             -1.224745    1.192166              0.333333         1.0
        1              0.000000   -1.254912              0.666667        -0.3
        2              1.224745    0.062746              1.000000         0.4
    """

    # Tmp Ami - document as_index
    def __init__(self, transformer_list, n_jobs=1, transformer_weights=None, as_index=True):
        pipeline.FeatureUnion.__init__(
            self,
            transformer_list,
            n_jobs,
            transformer_weights)
        FrameMixin.__init__(self)

        self._as_index = as_index

    # Tmp Ami - get docstrings from sklearn.
    def fit_transform(self, X, y=None, **fit_params):
        """
        Fits the transformers using ``X`` (and possibly ``y``), transforms
        ``X`` using the transformers, then uses :func:`pandas.concat` to
        horizontally concatenate the results.

        Returns:
            The concatenated results, as a :class:`pandas.DataFrame`.
        """
        verify_x_type(X)
        verify_y_type(y)

        Xts = joblib.Parallel(n_jobs=self.n_jobs)(
            joblib.delayed(_fit_transform)(trans, weight, X, y, **fit_params)
            for _, trans, weight in self._iter())
        return self.__concat(Xts)
    def transform(self, X, *args, **kwargs):
        """
        Transforms ``X`` using the transformers, then uses
        :func:`pandas.concat` to horizontally concatenate the results.
        """
        verify_x_type(X)

        Xts = joblib.Parallel(n_jobs=self.n_jobs)(
            joblib.delayed(_transform)(trans, weight, X, *args, **kwargs)
            for _, trans, weight in self._iter())
        return self.__concat(Xts)
    def _iter(self):
        # Yields (name, transformer, weight) triples, with a weight of None
        # for transformers that have no entry in transformer_weights.
        weights = self.transformer_weights
        if weights is None:
            weights = {}
        return (
            (name, trans, weights.get(name, None))
            for name, trans in self.transformer_list)

    def __concat(self, Xts):
        conc = pd.concat(Xts, axis=1)

        # Normalize all column labels to tuples of equal length, so that
        # they can form a MultiIndex.
        cols = conc.columns
        tups = [(c, ) if not isinstance(c, tuple) else c for c in cols]
        max_tup_len = max(len(t) for t in tups)
        tups = [c + ('', ) * (max_tup_len - len(c)) for c in tups]

        if self._as_index:
            # Prefix each column with the name of the transformer that
            # produced it.
            names = [name for (name, _, _) in self._iter()]
            mults = [len(X.columns) for X in Xts]
            tup_heads = [(name, ) * m for (name, m) in zip(names, mults)]
            tup_heads = list(itertools.chain.from_iterable(tup_heads))
            tups = [(h, ) + t for h, t in zip(tup_heads, tups)]

        conc.columns = pd.MultiIndex.from_tuples(tups)

        return conc
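

# A hedged, standalone sketch (not the class's actual code path) of what
# FeatureUnion.__concat does with plain pandas: concatenate horizontally,
# then prefix each column with its transformer's name in a MultiIndex.
def _example_concat_with_names():
    import pandas as pd

    std = pd.DataFrame({'a': [0.1], 'b': [0.2]})   # stand-in for one transformer's output
    abs_ = pd.DataFrame({'a': [0.3], 'b': [0.4]})  # stand-in for another's
    conc = pd.concat([std, abs_], axis=1)
    tups = [('std', c) for c in std.columns] + [('abs', c) for c in abs_.columns]
    conc.columns = pd.MultiIndex.from_tuples(tups)
    # Columns are now ('std', 'a'), ('std', 'b'), ('abs', 'a'), ('abs', 'b').
    return conc

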
FeatureUnion.__name__ = 'FeatureUnion'

# Copy metadata (docstrings, in particular) for these methods from the
# corresponding sklearn.pipeline.FeatureUnion methods, where they exist.
_wrapped = [
    'get_feature_names',
    'get_params',
    'set_params',
]
for wrap in _wrapped:
    try:
        functools.update_wrapper(
            getattr(FeatureUnion, wrap),
            getattr(pipeline.FeatureUnion, wrap))
    except AttributeError:
        pass

__all__ += ['FeatureUnion']
class Pipeline(pipeline.Pipeline, FrameMixin):
    """
    A pandas-aware counterpart of :class:`sklearn.pipeline.Pipeline`.
    """
class InOpChecker(object):
    """
    Tracks which estimators are currently inside a wrapped operation.
    Membership is keyed by object identity (``id(est)``), not equality.
    """

    def __init__(self, f_name):
        flag = '_ibex_adapter_in_op_%s' % hash(os.path.abspath(f_name))
        self.__set = getattr(threading.local(), flag, set())

    def __contains__(self, est):
        return id(est) in self.__set

    def add(self, est):
        self.__set.add(id(est))

    def remove(self, est):
        self.__set.remove(id(est))


__all__ += ['Pipeline']
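

# Minimal sketch (not in the original source) of the InOpChecker contract:
# membership is tracked by id(), so any object can be registered, checked,
# and unregistered.
def _example_in_op_checker():
    checker = InOpChecker(__file__)
    est = object()  # any object; tracking is by identity, not equality
    assert est not in checker
    checker.add(est)
    assert est in checker
    checker.remove(est)
    assert est not in checker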