Source code for skippa.profile

"""
DataProfile is used for storing and retrieving metadata of data that is used in the pipeline.
Typically the DataProfile is created during fitting of a pipeline.
The profile is used by the Gradio app that can be created.
"""
from typing import Optional, Any, Dict, Generator

import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype, is_string_dtype, is_float_dtype

class DataProfile:
    """Metadata describing the features (and optionally the labels) of a dataset.

    For every column of the supplied DataFrame a small dict of summary
    information is stored in ``self.info`` (keyed by column name). If labels
    are supplied, the detected problem type is stored in
    ``self.info_labels['type']`` as one of ``'binary'``, ``'multi-class'``,
    ``'regression'`` or ``None`` (no labels given).

    Iterating over a profile yields the per-column info dicts, each extended
    with a ``'name'`` key.
    """

    # String columns with more distinct values than this get an empty
    # 'values' list instead of the full list of distinct values.
    MAX_NUM_DISTINCT_VALUES = 100000

    def __init__(self, df: pd.DataFrame, y: Optional[Any] = None) -> None:
        """Profile the features in ``df`` and, when given, the labels ``y``.

        Args:
            df: The feature data to profile.
            y: Optional labels (anything ``np.array`` accepts).

        Raises:
            ValueError: If a column is neither numeric nor string typed.
        """
        self.column_names = df.columns.tolist()
        self.dtypes = df.dtypes
        self.info: Dict[str, Dict[str, Any]] = {}
        self.info_labels: Dict[str, Any] = {}
        self._profile_features(df)
        self._profile_labels(y)

    def _profile_features(self, df: pd.DataFrame) -> None:
        """Create a profile of the features.

        Numeric columns get min/max/median; string columns get their distinct
        values (capped by ``MAX_NUM_DISTINCT_VALUES``) and their mode.

        Raises:
            ValueError: If a column is neither numeric nor string typed.
        """
        for column_name, dtype in zip(self.column_names, self.dtypes):
            column = df[column_name]
            column_info: Dict[str, Any] = {'dtype': dtype}
            if is_numeric_dtype(dtype):
                column_info['is_numeric'] = True
                column_info['is_string'] = False
                column_info['min_value'] = column.min()
                column_info['max_value'] = column.max()
                column_info['median_value'] = column.median()
            elif is_string_dtype(dtype):
                column_info['is_numeric'] = False
                column_info['is_string'] = True
                if column.nunique() <= self.MAX_NUM_DISTINCT_VALUES:
                    column_info['values'] = column.unique().tolist()
                else:
                    # Too many distinct values to list them all.
                    column_info['values'] = []
                column_info['mode'] = column.value_counts().idxmax()
            else:
                raise ValueError(f'No profile for column {column_name}')
            self.info[column_name] = column_info

    def _profile_labels(self, y) -> None:
        """Create a profile of the labels (if present).

        Sets ``self.info_labels['type']`` to:
            * ``None`` when no labels are given,
            * ``'multi-class'`` when ``y`` has more than one column,
            * ``'binary'`` when the distinct label values are exactly 0 and 1,
            * ``'regression'`` otherwise.
        """
        if y is None:
            self.info_labels['type'] = None
            # Without labels there is nothing left to inspect; falling through
            # would try to analyse np.array(None).
            return

        y = np.array(y)
        try:
            n_cols = y.shape[1]
        except IndexError:
            # 1-dimensional labels have no second axis.
            n_cols = 1

        if n_cols > 1:
            # assume multi-class classification
            self.info_labels['type'] = 'multi-class'
            return

        if y.ndim == 2:
            # Column vector (e.g. a one-column DataFrame): pd.Series only
            # accepts 1-dimensional data, so flatten it first.
            y = y.reshape(-1)

        distinct = pd.Series(y).unique()
        distinct_as_int = distinct.astype('int')
        is_binary = sorted(distinct_as_int) == [0, 1]
        if is_binary and is_float_dtype(distinct):
            # The integer cast truncates, so floats such as 0.5 and 1.5 would
            # look like 0 and 1; only accept floats that are exactly 0.0/1.0.
            is_binary = bool(np.array_equal(distinct, distinct_as_int))

        self.info_labels['type'] = 'binary' if is_binary else 'regression'

    def __iter__(self) -> Generator[Dict, None, None]:
        """Yield the info dict of each column, adding the column name under ``'name'``."""
        for column_name, info in self.info.items():
            # Note: this stores the name in the dict kept in self.info as well.
            info['name'] = column_name
            yield info

    def is_classification(self) -> bool:
        """Return True if the labels were profiled as binary or multi-class."""
        return self.info_labels['type'] in ['binary', 'multi-class']

    def is_regression(self) -> bool:
        """Return True if the labels were profiled as a regression target."""
        return self.info_labels['type'] == 'regression'