python/FeatureLabs/henchman/henchman/diagnostics.py

diagnostics.py
# -*- coding: utf-8 -*-

'''The diagnostics module. Describe a particular dataset.
'''
import numpy as np
import pandas as pd


def title(string):
    '''Print a section heading enclosed in an ASCII box.
    The heading is preceded by a blank line and padded with two
    spaces on either side inside the box.

    Args:
        string (str): The heading text to display.

    '''
    boxed = '|  {}  |'.format(string)
    # The rule spans the boxed text minus the two corner characters.
    rule = '+{}+'.format('-' * (len(boxed) - 2))
    print('\n'.join(['', rule, boxed, rule]))


def subtitle(string):
    '''Print a sub-heading wrapped in ``##`` markers.
    A blank line is emitted before the sub-heading.

    Args:
        string (str): The sub-heading text to display.

    '''
    heading = '## {} ##'.format(string)
    print('\n' + heading)


def overview(data):
    '''Give a brief data overview.
    Contains information about data shape, missing values,
    memory usage and data types of columns. Everything is
    printed to stdout; nothing is returned.

    Args:
        data (pd.DataFrame): The dataframe for which to give an overview.

    Example:
        >>> from henchman.diagnostics import overview
        >>> overview(df)

    '''
    title('Data Shape')
    print('Number of columns: {}'.format(data.shape[1]))
    print('Number of rows: {}'.format(data.shape[0]))

    title('Missing Values')
    missing_values = data.isnull().sum()
    # The per-column counts are indexed by column label, so ``[-1]`` on the
    # sorted counts is a label lookup: it raises KeyError for integer column
    # labels and otherwise relies on a deprecated positional fallback.
    # ``max()`` yields the same number (the largest per-column count).
    print('Most values missing from column: {}'.format(missing_values.max()))
    print('Average missing values by column: {:.2f}'.format(
        missing_values.mean()))

    title('Memory Usage')
    # Bytes -> megabytes (decimal, 10**6).
    memory_used = data.memory_usage(deep=True)/1000000
    print('Total memory used: {:.2f} MB'.format(memory_used.sum()))
    print('Average memory by column: {:.2f} MB'.format(memory_used.mean()))

    title('Data Types')
    # One row per column holding its dtype, then count rows per dtype.
    print(pd.DataFrame([data[col].dtype for col in data]
                       ).reset_index().groupby(0).count())


def _find_duplicates(data):
    duplicates = data[data.duplicated()]
    if duplicates.shape[0] > 0:
        print('DataFrame has {} duplicates'.format(duplicates.shape[0]))


def _find_correlations(data, corr_thresh):
    correlations = data.corr()
    warningfm = correlations[(np.abs(correlations) > corr_thresh) & (
        np.abs(correlations)  <  1.)]
    listed = []
    unicorr = []
    for col in warningfm:
        warningcol = warningfm[col][~warningfm[col].isnull()]
        if not warningcol.empty:
            for index, value in warningcol.iteritems():
                if (index, col) not in listed:
                    print('{} and {} are linearly correlated: {:.3f}'.format(
                        col, index, value))
                    listed.append((col, index))
                    listed.append((index, col))
                    unicorr += [col, index]


def _find_missing(data, missing_thresh):
    for index, value in data.isnull().sum().iteritems():
        if value > (data.shape[0] * missing_thresh):
            print('{} has {} missing values: ({}% of total)'.format(
                index, value, 100 * value / data.shape[0]))


def _find_high_card(data, card_thresh):
    objects = [col for col in data if data[col].dtype == 'O']
    for index, value in data[objects].nunique().iteritems():
        if value > card_thresh:
            print('{} has many unique values: {}'.format(index, value))


def warnings(data, corr_thresh=.9, missing_thresh=.1, card_thresh=50):
    '''Print warnings about common dataset problems.
    Runs four checks in order under a 'Warnings' heading: duplicated
    rows, highly linearly correlated columns, columns with many
    missing values and object columns with many unique values.

    Args:
        data (pd.DataFrame): The dataframe to warn about.
        corr_thresh (float): Correlation above which to warn (Default .9).
        missing_thresh (float): Fraction of missing values above which
            to warn (Default .1).
        card_thresh (int): Number of unique values above which to
            warn (Default 50).

    Example:
        >>> from henchman.diagnostics import warnings
        >>> warnings(df, corr_thresh=.5)
    '''
    title('Warnings')
    _find_duplicates(data)
    _find_correlations(data, corr_thresh=corr_thresh)
    _find_missing(data, missing_thresh=missing_thresh)
    _find_high_card(data, card_thresh=card_thresh)


def _object_column_summary(data, objects):
    '''Print a summary for each object column.
    Reports the number of unique values, the mode (when there is
    exactly one) with the share of rows it matches, and the number
    of missing values when there are any.

    Args:
        data (pd.DataFrame): The dataframe holding the columns.
        objects (list): Labels of the object columns to summarize.

    '''
    title('Object Column Summary')
    for col in objects:
        subtitle(col)
        datacol = data[col]
        print('Unique: {}'.format(len(datacol.unique())))

        mode = datacol.mode().values
        # ``mode()`` returns several values on ties and none at all for an
        # empty or all-missing column; only a single value is a usable
        # mode (indexing ``mode[0]`` on an empty result would raise).
        if len(mode) != 1:
            print('Mode: No Mode')

        else:
            mode = mode[0]
            nummode = 100 * datacol[datacol == mode].shape[0]/datacol.shape[0]
            print('Mode: {}, (matches {:.1f}% of rows)'.format(mode, nummode))
        missing = datacol.isnull().sum()
        if missing > 0:
            print('Missing: {}'.format(missing))


def _time_column_summary(data, times):
    '''Print the latest and earliest value of each time column.

    Args:
        data (pd.DataFrame): The dataframe holding the columns.
        times (list): Labels of the time columns to summarize.

    '''
    title('Time Column Summary')
    for name in times:
        subtitle(name)
        column = data[name]
        latest = column.max()
        earliest = column.min()
        print('Last Time: {}'.format(latest))
        print('First Time: {}'.format(earliest))


def _boolean_column_summary(data, bools):
    '''Print a summary for each boolean column.
    Reports the counts and percentages of true and false values,
    the column mean, and the number of missing values when there
    are any.

    Args:
        data (pd.DataFrame): The dataframe holding the columns.
        bools (list): Labels of the boolean columns to summarize.

    '''
    title('Boolean Column Summary')
    for name in bools:
        subtitle(name)
        column = data[name]
        num_rows = column.shape[0]
        # Kept as a float so the counts print with a decimal point.
        num_true = float(column.sum())
        num_false = num_rows - num_true
        pct_true = 100 * num_true / num_rows
        pct_false = 100 - pct_true

        print('Number True: {}, Number False: {}, Mean: {:.2f}'.format(
            num_true, num_false, column.mean()))
        print('Percent True: {:.1f}% | Percent False: {:.1f}%'.format(
            pct_true, pct_false))

        num_missing = column.isnull().sum()
        if num_missing > 0:
            print('Missing: {}'.format(num_missing))


def _numeric_column_summary(data, numbers):
    '''Print a summary for each numeric column.
    Reports the maximum, minimum, mean and the three quartiles,
    plus the number of missing values when there are any.

    Args:
        data (pd.DataFrame): The dataframe holding the columns.
        numbers (list): Labels of the numeric columns to summarize.

    '''
    title('Numeric Column Summary')
    for col in numbers:
        subtitle(col)
        datacol = data[col]
        print('Maximum: {}, Minimum: {}, Mean: {:.2f}'.format(
            datacol.max(), datacol.min(), datacol.mean()))
        # The separator before 'Quartile 1' needs its leading space;
        # without it the line printed as 'Median: x| Quartile 1: y'.
        print('Quartile 3: {:.2f} | Median: {:.2f} '
              '| Quartile 1: {:.2f}'.format(datacol.quantile(.75),
                                            datacol.quantile(.5),
                                            datacol.quantile(.25)))
        missing = datacol.isnull().sum()
        if missing > 0:
            print('Missing: {}'.format(missing))


def column_report(data):
    '''Give column summaries according to pandas dtype.
    Has functionality for objects, times, booleans and numeric
    columns. Finds maximums, minimums, means, missing and other
    datatype appropriate attributes. A section is printed only
    when the dataframe has at least one column of that kind.

    Args:
        data (pd.DataFrame): The dataframe on which to report.

    Example:
        >>> from henchman.diagnostics import column_report
        >>> column_report(df)

    '''
    numeric_dtypes = ['int16', 'int32', 'int64',
                      'float16', 'float32', 'float64']

    objects = [col for col in data if data[col].dtype == 'O']
    # Use the pandas dtype predicate rather than comparing against a dtype
    # string: the previous literal ' < M8[ns]' (with embedded spaces) is
    # not a valid dtype name, so time columns were never picked up.
    times = [col for col in data
             if pd.api.types.is_datetime64_any_dtype(data[col].dtype)]
    bools = [col for col in data if data[col].dtype == 'bool']
    numbers = [col for col in data if data[col].dtype in numeric_dtypes]

    if objects:
        _object_column_summary(data, objects)
    if times:
        _time_column_summary(data, times)
    if bools:
        _boolean_column_summary(data, bools)
    if numbers:
        _numeric_column_summary(data, numbers)


def profile(data, corr_thresh=.9, missing_thresh=.1, card_thresh=50):
    '''Profile a dataset.
    Prints, in order, the dataset overview, the warnings and the
    per-column report.

    Args:
        data (pd.DataFrame): The dataframe to profile.
        corr_thresh (float): Correlation above which to warn (Default .9).
        missing_thresh (float): Fraction of missing values above which
            to warn (Default .1).
        card_thresh (int): Number of unique values above which to
            warn (Default 50).

    Example:
        >>> from henchman.diagnostics import profile
        >>> profile(df, missing_thresh=.3, card_thresh=10)

    '''
    overview(data)
    warnings(data,
             corr_thresh=corr_thresh,
             missing_thresh=missing_thresh,
             card_thresh=card_thresh)
    column_report(data)