# Source code for skippa.transformers.base

"""
This contains base / utility classes and functions needed for defining/using transformers
"""
from __future__ import annotations

from typing import Optional, Union, List, Dict, Tuple, Callable
import re
import logging
import inspect

import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype, is_string_dtype, is_float_dtype
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_selector, make_column_transformer


class ColumnSelector:
    """Utility class for defining a column set; this is not a transformer.

    It wraps a callable that, given a dataframe, returns a list of column
    names. Selectors can be combined with ``+`` (ordered union) and ``-``
    (ordered difference); the combination is evaluated lazily, each time the
    resulting selector is called on a dataframe.
    """

    def __init__(self, selector: Callable) -> None:
        """Store the selector callable and derive a printable name from it.

        Args:
            selector (Callable): called with a dataframe, returns column names
        """
        self.selector = selector
        # The name is the callable's string form with everything except
        # letters, digits and underscores stripped out.
        self.name = re.sub('[^a-zA-Z0-9_]', '', f'select_{selector}')

    def __call__(self, df: pd.DataFrame) -> List[str]:
        """A ColumnSelector can be called on a dataframe.

        Args:
            df (pd.DataFrame): pandas df

        Returns:
            List[str]: A list of column names
        """
        return self.selector(df)

    def __add__(self, other: ColumnSelector) -> ColumnSelector:
        """Add two selectors.

        N.B. Adding means taking the union without introducing duplicates:
        the columns of this selector come first, in their own order, followed
        by the columns of ``other`` that were not selected yet, in the order
        in which ``other`` returns them.

        Args:
            other (ColumnSelector): Another column selector

        Returns:
            ColumnSelector: A new one with merged selector callables

        Raises:
            AssertionError: if ``other`` is not a ColumnSelector
        """
        # Raised explicitly (instead of an `assert` statement) so that the
        # check is not stripped when Python runs with -O.
        if not isinstance(other, ColumnSelector):
            raise AssertionError('Argument should be of type ColumnSelector')

        def _ordered_union(first, second):
            # list(...) makes the concatenation safe for any iterable result.
            merged = list(first)
            seen = set(merged)
            for name in second:
                if name not in seen:
                    # Track additions as well, so that repeats inside
                    # `second` are not appended twice.
                    seen.add(name)
                    merged.append(name)
            return merged

        return ColumnSelector(lambda df: _ordered_union(self(df), other(df)))

    def __sub__(self, other: ColumnSelector) -> ColumnSelector:
        """Subtract a selector: keep this selector's columns that ``other`` does not select.

        The order of this selector's columns is preserved.

        Args:
            other (ColumnSelector): Another column selector

        Returns:
            ColumnSelector: A new selector returning the remaining columns

        Raises:
            AssertionError: if ``other`` is not a ColumnSelector
        """
        if not isinstance(other, ColumnSelector):
            raise AssertionError('Argument should be of type ColumnSelector')

        def _ordered_difference(df):
            kept = self(df)
            # Evaluate the other selector once per dataframe rather than
            # once per candidate column.
            removed = set(other(df))
            return [name for name in kept if name not in removed]

        return ColumnSelector(_ordered_difference)

    def __str__(self) -> str:
        """Simple string representation

        Returns:
            str: This string is shown as a name in pipeline steps
        """
        return self.name


# Type aliases for use in type annotations.
# Transformation: either of the sklearn base types BaseEstimator / TransformerMixin.
Transformation = Union[BaseEstimator, TransformerMixin]
# ColumnExpression: a column set, given as a ColumnSelector or as a list of column names.
ColumnExpression = Union[ColumnSelector, List[str]]


def columns(
    *args,
    include: Optional[ColumnExpression] = None,
    exclude: Optional[ColumnExpression] = None,
    **kwargs
) -> ColumnSelector:
    """Helper function for creating a ColumnSelector

    Flexible arguments:
    - include or exclude lists: speak for themselves
    - dtype_include, dtype_exclude, pattern: dispatched to sklearn's make_column_selector
    - otherwise: a list to include, or an existing ColumnSelector

    Precedence: a single positional argument replaces ``include``; ``include``
    wins over ``exclude``; the keyword arguments are only used when neither
    ``include`` nor ``exclude`` is given. When zero or more than one
    positional arguments are passed, ``args`` is not used at all.

    A single string passed as ``include`` or ``exclude`` is treated as one
    column name.

    Args:
        *args: at most one positional argument is interpreted, as ``include``
        include (Optional[ColumnExpression], optional): column names to select
            (returned as given, in the given order), or an existing
            ColumnSelector (returned unchanged). Defaults to None.
        exclude (Optional[ColumnExpression], optional): column names to leave
            out of the dataframe's columns, or a ColumnSelector whose result
            is left out. Defaults to None.
        **kwargs: passed on to sklearn's make_column_selector

    Returns:
        ColumnSelector: A callable that returns columns names, when called on a df
    """
    if len(args) == 1:
        include = args[0]

    if isinstance(include, ColumnSelector):
        return include

    if include is not None:
        # A bare string is one column name, not an iterable of characters.
        # The names are snapshotted once, so a one-shot iterable still works
        # on repeated calls. They are returned in the given order without
        # checking that they exist in the dataframe.
        included = [include] if isinstance(include, str) else list(include)
        return ColumnSelector(lambda _: list(included))

    if exclude is not None:
        if isinstance(exclude, ColumnSelector):
            # A selector has to be evaluated on the dataframe before its
            # result can be used for membership tests.
            def _all_but_selected(df):
                dropped = set(exclude(df))
                return [c for c in df.columns if c not in dropped]

            return ColumnSelector(_all_but_selected)

        # Wrap a bare string so `in` is a name comparison rather than a
        # substring test.
        excluded = [exclude] if isinstance(exclude, str) else exclude
        return ColumnSelector(
            lambda df: [c for c in df.columns if c not in excluded]
        )

    return ColumnSelector(make_column_selector(**kwargs))


class SkippaMixin:
    """Utility class providing additional methods for custom Skippa transformers."""

    def _get_param_names(self) -> List[str]:
        """Get parameter names for the estimator.

        This overrides sklearn's BaseEstimator._get_param_names()
        These things are changed:
        - the first line doesn't look at cls.__init__, but super().__init__
        - it's no longer a class method but an instance method, because the super().__init__ depends on which class was instantiated
        - we manually add the 'cols' parameter, because it's always added to the skippa extension

        This is a fix for a problem using GridSearchCV (or any other sklearn hyperparam search)
        - When calling .fit on the search, it calls .get_params() for each pipeline step
        - this in turn calls the BaseEstimator._get_param_names() class method
        - since our steps are not the sklearn transformers, but Skippa extensions, they have a different init signature
        - this fix makes sure we look at the param signature of the sklearn class (and we manually add the 'cols' parameter)

        Returns:
            List[str]: the sorted parameter names of the next constructor in the
                MRO plus 'cols', or an empty list if that constructor is
                object's own

        Raises:
            RuntimeError: if that constructor accepts *args
        """
        # fetch the constructor or the original constructor before
        # deprecation wrapping if any
        parent_init = super().__init__
        init = getattr(parent_init, "deprecated_original", parent_init)
        # super().__init__ is a *bound* object, so it is never identical to
        # the unbound object.__init__; the bound wrapper of object's
        # constructor is recognised through its __objclass__ instead.
        if init is object.__init__ or getattr(init, "__objclass__", None) is object:
            # No explicit constructor to introspect
            return []

        # introspect the constructor arguments to find the model parameters
        # to represent
        init_signature = inspect.signature(init)
        # Consider the constructor parameters excluding 'self'
        parameters = [
            p
            for p in init_signature.parameters.values()
            if p.name != "self" and p.kind != p.VAR_KEYWORD
        ]
        if any(p.kind == p.VAR_POSITIONAL for p in parameters):
            raise RuntimeError(
                "scikit-learn estimators should always "
                "specify their parameters in the signature"
                " of their __init__ (no varargs)."
                " %s with constructor %s doesn't "
                " follow this convention." % (self, init_signature)
            )
        # Extract and sort argument names excluding 'self'
        return sorted([p.name for p in parameters] + ['cols'])

    def _set_columns(self, cols: ColumnSelector) -> None:
        """Store the column selector that is evaluated by _evaluate_columns."""
        self.cols = cols

    def _evaluate_columns(self, X: pd.DataFrame, check_dtypes: Optional[str] = None) -> List[str]:
        """Evaluate columns expression on given dataframe

        The result is also stored on the instance as ``_column_names``.
        A warning is logged when no columns are selected.

        Args:
            X (pd.DataFrame): dataframe the column selector is called on
            check_dtypes (str, optional): When filled, check if the datatypes are as expected.
                Recognised values are 'number', 'numeric', 'float' and 'string';
                any other value skips the check. Defaults to None.

        Raises:
            TypeError: if dtypes found in the columns are not as expected by the transformation

        Returns:
            List[str]: A list of column names
        """
        self._column_names = self.cols(X)
        if len(self._column_names) == 0:
            # logging.warn is a deprecated alias that was removed in Python 3.13
            logging.warning(f'No columns found for column selector {self.cols}')

        # Maps each accepted check_dtypes value to the dtype predicate and
        # the word used in the error message.
        dtype_checks = {
            'number': (is_numeric_dtype, 'numeric'),
            'numeric': (is_numeric_dtype, 'numeric'),
            'float': (is_float_dtype, 'float'),
            'string': (is_string_dtype, 'string'),
        }
        check = dtype_checks.get(check_dtypes)
        if check is not None:
            has_expected_dtype, label = check
            if not all(has_expected_dtype(t) for t in X[self._column_names].dtypes):
                raise TypeError(f'Transformation can only be applied to {label} columns')

        return self._column_names

    def _get_result(self, X, res) -> pd.DataFrame:
        """Write ``res`` into the selected columns of ``X`` and return ``X``.

        N.B. ``X`` itself is assigned to (no copy is made), and the column
        selector is evaluated again on ``X``.
        """
        column_names = self._evaluate_columns(X)
        X.loc[:, column_names] = res
        return X

    def _set_names(self, X) -> None:
        """Remember the columns of ``X`` as ``names``."""
        self.names = X.columns

    def _get_names(self):
        """Return the columns stored by _set_names."""
        return self.names