"""
This contains base / utility classes and functions needed for defining/using transformers
"""
from __future__ import annotations
from typing import Optional, Union, List, Dict, Tuple, Callable
import re
import logging
import inspect
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype, is_string_dtype, is_float_dtype
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_selector, make_column_transformer
class ColumnSelector:
    """Utility class (not a transformer) that defines a set of columns.

    It wraps a callable which receives a dataframe and returns column names.
    Selectors can be combined with ``+`` (ordered union) and ``-`` (difference).
    """

    def __init__(self, selector: Callable) -> None:
        """Store the selector callable and derive a printable name from it.

        Args:
            selector (Callable): called with a dataframe, returns the selected column names
        """
        self.selector = selector
        # Keep only [a-zA-Z0-9_] from the callable's string form, so the name
        # is a plain token (it is what __str__ returns).
        self.name = re.sub('[^a-zA-Z0-9_]', '', f'select_{selector}')

    def __call__(self, df: pd.DataFrame) -> List[str]:
        """A ColumnSelector can be called on a dataframe.

        Args:
            df (pd.DataFrame): pandas df

        Returns:
            List[str]: A list of column names
        """
        return self.selector(df)

    def __add__(self, other: "ColumnSelector") -> "ColumnSelector":
        """Add two selectors.

        N.B. Adding means taking the *union*: a column selected by both operands
        appears only once in the result.
        In order to preserve the order in existing selectors, the result is built
        as a list (a set is used for membership tests only): first all columns of
        this selector, then the columns of `other` that were not already selected.

        Args:
            other (ColumnSelector): Another column selector

        Returns:
            ColumnSelector: A new one with merged selector callables
        """
        assert isinstance(other, ColumnSelector), 'Argument should be of type ColumnSelector'

        def _ordered_union(first, second):
            # Coerce to list so concatenation also works when a selector
            # returns a tuple or a pandas Index instead of a list.
            first = list(first)
            already_selected = set(first)
            return first + [c for c in second if c not in already_selected]

        return ColumnSelector(
            lambda df: _ordered_union(self(df), other(df))
        )

    def __sub__(self, other: "ColumnSelector") -> "ColumnSelector":
        """Subtract a selector: keep this selector's columns that `other` does not select.

        The order of this selector's columns is preserved.

        Args:
            other (ColumnSelector): Another column selector

        Returns:
            ColumnSelector: A new selector returning the difference of both selections
        """
        assert isinstance(other, ColumnSelector), 'Argument should be of type ColumnSelector'

        def _ordered_difference(selected, excluded):
            return [c for c in selected if c not in excluded]

        # Both selectors are evaluated exactly once per call (as arguments),
        # rather than re-evaluating `other` for every candidate column.
        return ColumnSelector(
            lambda df: _ordered_difference(self(df), other(df))
        )

    def __str__(self) -> str:
        """Simple string representation

        Returns:
            str: This string is shown as a name in pipeline steps
        """
        return self.name
# New types for use in type annotation
# Transformation: either an sklearn BaseEstimator or an sklearn TransformerMixin
Transformation = Union[BaseEstimator, TransformerMixin]
# ColumnExpression: either a ColumnSelector or an explicit list of column names
ColumnExpression = Union[ColumnSelector, List[str]]
def columns(
    *args,
    include: Optional[ColumnExpression] = None,
    exclude: Optional[ColumnExpression] = None,
    **kwargs
) -> ColumnSelector:
    """Helper function for creating a ColumnSelector

    Flexible arguments:
    - include or exclude lists: speak for themselves
    - dtype_include, dtype_exclude, pattern: dispatched to sklearn's make_column_selector
    - otherwise: a list to include, or an existing ColumnSelector

    Precedence: a single positional argument overrides `include`; `include`
    takes precedence over `exclude`; `**kwargs` are only used when neither
    `include` nor `exclude` is given.
    NOTE: when more than one positional argument is passed, the positional
    arguments are ignored.

    Args:
        *args: at most one positional argument is used, as the `include` expression.
        include (Optional[ColumnExpression], optional): Column names to select (returned
            as given, without checking them against the dataframe), a single column
            name, or an existing ColumnSelector (returned unchanged). Defaults to None.
        exclude (Optional[ColumnExpression], optional): Column names to leave out (all
            other dataframe columns are selected), a single column name, or a
            ColumnSelector whose selection is left out. Defaults to None.
        **kwargs: passed to sklearn's make_column_selector.

    Returns:
        ColumnSelector: A callable that returns columns names, when called on a df
    """
    if len(args) == 1:
        include = args[0]
    if isinstance(include, ColumnSelector):
        return include

    def _all_but(df, unwanted):
        return [c for c in df.columns if c not in unwanted]

    if include is not None:
        # A bare string is one column name, not an iterable of characters.
        # Snapshot the names now so later mutation of the caller's object
        # (or a one-shot iterable) cannot change what the selector returns.
        wanted = [include] if isinstance(include, str) else list(include)
        selector = lambda _: list(wanted)
    elif isinstance(exclude, ColumnSelector):
        # Evaluate the excluded selector on the dataframe at call time.
        selector = lambda df: _all_but(df, exclude(df))
    elif exclude is not None:
        # A bare string is one column name; `in` on a str would be a substring test.
        unwanted = [exclude] if isinstance(exclude, str) else exclude
        selector = lambda df: _all_but(df, unwanted)
    else:
        selector = make_column_selector(**kwargs)
    return ColumnSelector(selector)
class SkippaMixin:
    """Utility class providing additional methods for custom Skippa transformers."""

    def _get_param_names(self) -> List[str]:
        """Get parameter names for the estimator.

        This overrides sklearn's BaseEstimator._get_param_names()
        These things are changed:
        - the first line doesn't look at cls.__init__, but super().__init__
        - it's no longer a class method but an instance method, because the
          super().__init__ depends on which class was instantiated
        - we manually add the 'cols' parameter, because it's always added to the skippa extension

        This is a fix for a problem using GridSearchCV (or any other sklearn hyperparam search)
        - When calling .fit on the search, it calls .get_params() for each pipeline step
        - this in turn calls the BaseEstimator._get_param_names() class method
        - since our steps are not the sklearn transformers, but Skippa extensions,
          they have a different init signature
        - this fix makes sure we look at the param signature of the sklearn class
          (and we manually add the 'cols' parameter)

        Returns:
            List[str]: sorted parameter names of the parent constructor plus 'cols',
                or an empty list when the parent has no explicit constructor

        Raises:
            RuntimeError: if the parent constructor uses positional varargs (*args)
        """
        # fetch the constructor or the original constructor before
        # deprecation wrapping if any
        parent_init = super().__init__
        init = getattr(parent_init, "deprecated_original", parent_init)
        # super().__init__ is a *bound* object, so it is never identical to
        # object.__init__ itself. Compare the underlying function, or the class
        # owning the slot wrapper, to detect "no explicit constructor".
        unbound = getattr(init, "__func__", init)
        if unbound is object.__init__ or getattr(init, "__objclass__", None) is object:
            # No explicit constructor to introspect
            return []

        # introspect the constructor arguments to find the model parameters
        # to represent
        init_signature = inspect.signature(init)
        # Consider the constructor parameters excluding 'self'
        parameters = [
            p
            for p in init_signature.parameters.values()
            if p.name != "self" and p.kind != p.VAR_KEYWORD
        ]
        for p in parameters:
            if p.kind == p.VAR_POSITIONAL:
                raise RuntimeError(
                    "scikit-learn estimators should always "
                    "specify their parameters in the signature"
                    " of their __init__ (no varargs)."
                    " %s with constructor %s doesn't "
                    " follow this convention." % (self, init_signature)
                )
        # Extract and sort argument names excluding 'self'
        return sorted([p.name for p in parameters] + ['cols'])

    def _set_columns(self, cols: "ColumnSelector") -> None:
        """Store the column selector used by this transformer.

        Args:
            cols (ColumnSelector): callable that returns column names for a dataframe
        """
        self.cols = cols

    def _evaluate_columns(self, X: pd.DataFrame, check_dtypes: Optional[str] = None) -> List[str]:
        """Evaluate columns expression on given dataframe

        The result is also stored in `self._column_names`.

        Args:
            X (pd.DataFrame): dataframe on which `self.cols` is evaluated
            check_dtypes (str, optional): When filled, check if the datatypes are as expected.
                Recognized values: 'number' / 'numeric', 'float', 'string'.
                Any other value skips the check. Defaults to None.

        Raises:
            TypeError: if dtypes found in the columns are not as expected by the transformation

        Returns:
            List[str]: A list of column names
        """
        self._column_names = self.cols(X)
        if len(self._column_names) == 0:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning('No columns found for column selector %s', self.cols)

        if check_dtypes in ['number', 'numeric']:
            if not all(is_numeric_dtype(t) for t in X[self._column_names].dtypes):
                raise TypeError('Transformation can only be applied to numeric columns')
        elif check_dtypes == 'float':
            if not all(is_float_dtype(t) for t in X[self._column_names].dtypes):
                raise TypeError('Transformation can only be applied to float columns')
        elif check_dtypes == 'string':
            if not all(is_string_dtype(t) for t in X[self._column_names].dtypes):
                raise TypeError('Transformation can only be applied to string columns')
        return self._column_names

    def _get_result(self, X, res) -> pd.DataFrame:
        """Write a transformation result back into the selected columns of X.

        N.B. X is modified in place and the same object is returned.

        Args:
            X: dataframe whose selected columns are overwritten
            res: values assigned to the selected columns

        Returns:
            pd.DataFrame: X, with the selected columns replaced by `res`
        """
        column_names = self._evaluate_columns(X)
        X.loc[:, column_names] = res
        return X

    def _set_names(self, X) -> None:
        """Remember the column labels of X.

        Args:
            X: object with a `columns` attribute
        """
        self.names = X.columns

    def _get_names(self):
        """Return the column labels stored by `_set_names`."""
        return self.names