"""
This defines custom transformers implementing anything other than
existing sklearn transformers.
"""
from typing import Any, Optional, Union, List, Dict, Tuple, Callable
import logging
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from skippa.transformers import ColumnSelector, SkippaMixin
class SkippaApplier(BaseEstimator, TransformerMixin, SkippaMixin):
    """Transformer for applying an arbitrary function (wraps around pandas apply).

    The constructor only records its arguments; fitting learns nothing.
    """

    def __init__(self, cols: ColumnSelector, *args, **kwargs):
        """Store the columns specifier together with the apply arguments.

        Args:
            cols (ColumnSelector): columns specifier
            *args: positional arguments accepted by pandas.DataFrame.apply()
            **kwargs: keyword arguments accepted by pandas.DataFrame.apply()
        """
        self.cols, self.args, self.kwargs = cols, args, kwargs

    def fit(self, X, y=None, **fit_params):
        """No-op: nothing is derived from ``X``; returns ``self``."""
        return self
class SkippaCaster(BaseEstimator, TransformerMixin, SkippaMixin):
    """Transformer for casting columns to another data type."""

    def __init__(self, cols: ColumnSelector, dtype: Any) -> None:
        """Store the columns specifier and the target dtype.

        Args:
            cols (ColumnSelector): columns specifier
            dtype (Any): Either a single dtype, or a dict mapping column to dtype
        """
        self.cols, self.dtype = cols, dtype

    def fit(self, X, y=None, **kwargs):
        """No-op: casting needs no fitted state; returns ``self``."""
        return self
class SkippaRenamer(BaseEstimator, TransformerMixin):
    """Transformer for renaming columns."""

    def __init__(self, mapping: Union[Dict, Tuple[ColumnSelector, Callable]]) -> None:
        """Store the renaming specification.

        The mapping can be given in one of two forms:
        - a dict of ``old: new`` column names
        - a ``(column selector, renaming function)`` tuple

        Args:
            mapping (Union[Dict, Tuple[ColumnSelector, Callable]]): the dict,
                or the selector/function pair, describing the renaming
        """
        self.mapping = mapping

    def fit(self, X, y=None, **kwargs):
        """Resolve the renaming specification into ``self.mapping_dict``.

        A non-tuple mapping is used as-is. For a (selector, function) tuple the
        selector is called on ``X`` to obtain the column names, and the
        function is applied to each name to produce its new name.
        """
        if not isinstance(self.mapping, tuple):
            # Already an explicit old -> new mapping.
            self.mapping_dict = self.mapping
            return self

        select_columns, rename = self.mapping
        self.mapping_dict = {name: rename(name) for name in select_columns(X)}
        return self
class SkippaSelector(BaseEstimator, TransformerMixin, SkippaMixin):
    """Transformer for selecting a subset of columns in a df."""

    def __init__(self, cols: ColumnSelector) -> None:
        """Store the columns specifier.

        Args:
            cols (ColumnSelector): columns specifier
        """
        self.cols = cols

    def fit(self, X, y=None, **kwargs):
        """No-op: nothing is learned from ``X``; returns ``self``."""
        return self
class SkippaAssigner(BaseEstimator, TransformerMixin, SkippaMixin):
    """Transformer that stores keyword arguments describing column assignments.

    NOTE(review): judging by the class name, the keyword arguments are
    presumably forwarded to ``pandas.DataFrame.assign`` by the transform
    step -- confirm against the transform implementation.
    """

    def __init__(self, **kwargs) -> None:
        """Store the keyword arguments; this one doesn't have a column selector!

        Args:
            **kwargs: arbitrary keyword arguments, kept verbatim on the instance
        """
        # No column selection applies here, so ``cols`` is explicitly None.
        self.cols = None
        self.kwargs = kwargs

    def fit(self, X, y=None, **kwargs):
        """No-op: nothing is learned from ``X``; returns ``self``."""
        return self
class SkippaReplacer(BaseEstimator, TransformerMixin, SkippaMixin):
    """Transformer that stores keyword arguments describing a replacement.

    NOTE(review): judging by the class name, the keyword arguments are
    presumably forwarded to a pandas ``replace`` call by the transform
    step -- confirm against the transform implementation.
    """

    def __init__(self, **kwargs) -> None:
        """Store the keyword arguments verbatim on the instance.

        Args:
            **kwargs: arbitrary keyword arguments
        """
        self.kwargs = kwargs

    def fit(self, X, y=None, **kwargs):
        """No-op: nothing is learned from ``X``; returns ``self``."""
        return self
class SkippaConcat(BaseEstimator, SkippaMixin):
    """Concatenate two pipelines."""

    def __init__(self, left, right) -> None:
        """Store both named pipelines.

        Args:
            left: a ``(name, pipeline)`` pair
            right: a ``(name, pipeline)`` pair
        """
        # scikit-learn's get_params()/clone()/repr() look up every __init__
        # parameter as an instance attribute of the same name, so the raw
        # arguments are kept in addition to their unpacked parts.
        self.left = left
        self.right = right
        self.name1, self.pipe1 = left
        self.name2, self.pipe2 = right

    def fit(self, X, y=None, **kwargs):
        """Fit both pipelines on the same data.

        Args:
            X: input data, passed unchanged to both pipelines
            y: optional target, passed unchanged to both pipelines
            **kwargs: extra fit parameters, forwarded to both pipelines

        Returns:
            self
        """
        for pipe in (self.pipe1, self.pipe2):
            pipe.fit(X=X, y=y, **kwargs)
        return self
class SkippaDateEncoder(BaseEstimator, TransformerMixin, SkippaMixin):
    """Derive date features using pandas datetime's .dt property."""

    def __init__(self, cols: ColumnSelector, **kwargs) -> None:
        """Record which date parts are enabled and register the columns.

        ``year``, ``month`` and ``day`` are enabled by default; any keyword
        argument overrides or extends those defaults.

        Args:
            cols (ColumnSelector): columns specifier
            **kwargs: date-part flags, merged over the defaults
        """
        # Defaults first, caller-supplied flags win on conflict.
        self._parts = {'year': True, 'month': True, 'day': True, **kwargs}
        self._set_columns(cols)
        # NOTE(review): kwargs are also passed up the MRO -- this relies on a
        # parent __init__ that accepts them; confirm against SkippaMixin.
        super().__init__(**kwargs)

    def fit(self, X, y=None):
        """No-op: nothing is learned from ``X``; returns ``self``."""
        return self
class SkippaOutlierRemover(BaseEstimator, TransformerMixin, SkippaMixin):
    """Detect and remove outliers, based on simple IQR."""

    def __init__(self, cols: ColumnSelector, factor: float = 1.5):
        """Store the columns specifier and the IQR multiplier.

        Args:
            cols (ColumnSelector): columns specifier
            factor (float): multiple of the IQR added beyond Q1/Q3 to form
                the accepted range. Defaults to 1.5.
        """
        self.cols = cols
        self.factor = factor
        # column name -> (lower_bound, upper_bound); populated by fit()
        self.statistics = {}

    @staticmethod
    def _get_iqr_range(x: pd.Series, factor: float = 1.5) -> Tuple[float, float]:
        """Return the ``(lower, upper)`` bounds ``Q1 - f*IQR`` and ``Q3 + f*IQR``.

        Args:
            x (pd.Series): values to compute the quartiles from
            factor (float): multiple ``f`` of the inter-quartile range

        Returns:
            Tuple[float, float]: lower and upper bound
        """
        q1 = x.quantile(0.25)
        q3 = x.quantile(0.75)
        iqr = q3 - q1
        return q1 - (factor * iqr), q3 + (factor * iqr)

    @staticmethod
    def _limit(x: pd.Series, bounds: Tuple[float, float]) -> pd.Series:
        """Return a copy of ``x`` with values outside ``bounds`` set to missing.

        Values exactly equal to a bound are kept.

        Args:
            x (pd.Series): values to filter; left unmodified
            bounds (Tuple[float, float]): ``(lower, upper)`` accepted range

        Returns:
            pd.Series: new series with out-of-range values replaced by NaN
        """
        lower, upper = bounds
        # Series.mask builds a new series and upcasts when needed, instead of
        # writing NaN in place into a possibly integer-typed copy.
        return x.mask((x < lower) | (x > upper))

    def fit(self, X, y=None):
        """Compute the IQR bounds of every selected column.

        The bounds are rebuilt from scratch on each call, so refitting on
        data with different columns leaves no stale entries behind.

        Args:
            X: data frame holding the selected columns
            y: ignored

        Returns:
            self
        """
        self.statistics = {
            column_name: self._get_iqr_range(X[column_name], factor=self.factor)
            for column_name in self._evaluate_columns(X)
        }
        return self