Source code for skippa.profile

"""
DataProfile is used for storing and retrieving metadata of data that is used in the pipeline.
Typically the DataProfile is created during fitting of a pipeline.
The profile is used by the Gradio app that can be created.
"""
from typing import Optional, Any, Dict, Generator

import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype, is_string_dtype, is_float_dtype

class DataProfile:
    """Metadata about the features (and optionally the labels) of a dataset.

    The profile is computed once, at construction time, from a DataFrame of
    features and an optional collection of labels.

    Attributes:
        column_names: List of the column names of the profiled DataFrame.
        dtypes: The ``dtypes`` of the profiled DataFrame.
        info: Mapping of column name to a dict with that column's metadata
            (see :meth:`_profile_features` for the keys).
        info_labels: Dict with a single key ``'type'`` whose value is
            ``None``, ``'binary'``, ``'multi-class'`` or ``'regression'``.
    """

    # String columns with more distinct values than this get an empty
    # 'values' list instead of an enumeration of every distinct value.
    MAX_NUM_DISTINCT_VALUES = 100000

    def __init__(self, df: pd.DataFrame, y: Optional[Any] = None) -> None:
        """Profile the features in ``df`` and, if given, the labels in ``y``.

        Args:
            df: DataFrame holding the features.
            y: Optional labels; anything ``numpy.asarray`` can convert.

        Raises:
            ValueError: If a column is neither numeric nor string typed, or
                if the labels cannot be interpreted as numbers.
        """
        self.column_names = df.columns.tolist()
        self.dtypes = df.dtypes
        self.info: Dict[str, Dict[str, Any]] = {}
        self.info_labels: Dict[str, Any] = {}
        self._profile_features(df)
        self._profile_labels(y)

    def _profile_features(self, df: pd.DataFrame) -> None:
        """Create a profile of the features.

        Numeric columns get ``min_value``, ``max_value`` and ``median_value``.
        String columns get ``values`` (the distinct values, or an empty list
        when there are more than ``MAX_NUM_DISTINCT_VALUES`` of them) and
        ``mode`` (the most frequent value, or ``None`` when the column has no
        non-null values). Every column also gets ``dtype``, ``is_numeric``
        and ``is_string``.

        Raises:
            ValueError: If a column is neither numeric nor string typed.
        """
        for column_name, dtype in zip(self.column_names, self.dtypes):
            column = df[column_name]
            column_info: Dict[str, Any] = {'dtype': dtype}

            if is_numeric_dtype(dtype):
                column_info['is_numeric'] = True
                column_info['is_string'] = False
                column_info['min_value'] = column.min()
                column_info['max_value'] = column.max()
                column_info['median_value'] = column.median()
            elif is_string_dtype(dtype):
                column_info['is_numeric'] = False
                column_info['is_string'] = True
                # value_counts() ignores nulls, so its length equals
                # nunique(); computing it once serves both the size check
                # and the mode.
                counts = column.value_counts()
                if len(counts) <= self.MAX_NUM_DISTINCT_VALUES:
                    column_info['values'] = column.unique().tolist()
                else:
                    column_info['values'] = []
                # idxmax() raises on an empty Series, which happens when the
                # column is empty or holds only nulls.
                column_info['mode'] = counts.idxmax() if len(counts) else None
            else:
                raise ValueError(f'No profile for column {column_name}')

            self.info[column_name] = column_info

    def _profile_labels(self, y) -> None:
        """Create a profile of the labels (if present).

        Sets ``info_labels['type']`` to:

        * ``None`` when no labels are given;
        * ``'multi-class'`` when the labels have more than one column;
        * ``'binary'`` when the distinct label values are exactly 0 and 1;
        * ``'regression'`` otherwise.

        Raises:
            ValueError: If single-column labels cannot be converted to
                numbers (e.g. arbitrary strings).
        """
        if y is None:
            self.info_labels['type'] = None
            # Nothing to inspect; without this return the None would be
            # wrapped in an array and profiled as if it were a label.
            return

        labels = np.asarray(y)
        if labels.ndim > 1 and labels.shape[1] > 1:
            # assume multi-class classification
            self.info_labels['type'] = 'multi-class'
            return

        # Flatten so a single-column 2-D array is handled like a 1-D one.
        distinct = pd.Series(labels.ravel()).unique()
        # Compare the actual values: casting to int would truncate floats and
        # make e.g. [0.3, 1.7] look like the binary labels [0, 1].
        if set(distinct.astype('float').tolist()) == {0.0, 1.0}:
            self.info_labels['type'] = 'binary'
        else:
            self.info_labels['type'] = 'regression'

    def __iter__(self) -> Generator[Dict, None, None]:
        """Yield the metadata dict of every column, in insertion order.

        Each yielded dict is the one stored in ``self.info``; a ``'name'``
        key holding the column name is added to it before it is yielded.
        """
        for column_name, info in self.info.items():
            info['name'] = column_name
            yield info

    def is_classification(self) -> bool:
        """Return True if the labels were profiled as binary or multi-class."""
        return self.info_labels['type'] in ['binary', 'multi-class']

    def is_regression(self) -> bool:
        """Return True if the labels were profiled as a regression target."""
        return self.info_labels['type'] == 'regression'