Source code for predeval.categorical

"""Library of classes for evaluating categorical model outputs."""
from numbers import Real
from functools import partial
import numpy as np
from scipy import stats
from .parent import ParentPredEval

__author__ = 'Dan Vatterott'
__license__ = 'MIT'


def _chi2_test(reference, test_data):
    """Change chi2_contingency inputs for partial evaluation.

    Uses `chi2_contingency test from scipy
    <https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html>`_.

    Parameters
    ----------
    reference : list or np.array
        This the reference data that will be used for the comparison.
    test_data : list or np.array
        This the data compared to the reference data.

    Returns
    -------
    chi2 : float
        The test statistic.
    p : float
        The p-value of the test
    dof : int
        Degrees of freedom
    expected : ndarray, same shape as `observed`
        The expected frequencies, based on the marginal sums of the table.

    """
    obs = np.append([reference], [test_data], axis=0)
    return stats.chi2_contingency(obs)


[docs]class CategoricalEvaluator(ParentPredEval): """ Evaluator for categorical model outputs (e.g., classification models). By default, this will run the tests listed in the assertions attribute (['chi2_test', 'exist']). You can change the tests that will run by listing the desired tests in the assertions parameter. The available tests are chi2_test and exist. ... Parameters ---------- ref_data : list of int or float or np.array This the reference data for all tests. All future data will be compared to this data. assertions : list of str, optional These are the assertion tests that will be created. Defaults is ['chi2_test', 'exist']. verbose : bool, optional Whether tests should print their output. Default is true Attributes ---------- assertion_params : dict dictionary of test names and values defining these tests. * chi2_stat : float Chi2-test-statistic. When this value is exceeded. The test 'failed'. * chi2_test : func Partially evaluated chi2 test. * cat_exists : list of int or str This is a list of the expected model outputs assertions : list of str This list of strings describes the tests that will be run on comparison data. Defaults to ['chi2_test', 'exist'] """ def __init__( self, ref_data, assertions=None, verbose=True, **kwargs): super(CategoricalEvaluator, self).__init__(ref_data, verbose=verbose) # ---- Fill in Assertion Parameters ---- # self._assertion_params_ = { 'cat_exists': None, 'chi2_test': None, } assert isinstance(kwargs.get('chi2_stat', 2), Real), 'expected number, input chi2_test_stat is not a number' self._assertion_params_['chi2_stat'] = kwargs.get('chi2_stat', 2) # ---- create list of assertions to test ---- # self._possible_assertions_ = { 'exist': (self.update_exist, self.check_exist), 'chi2_test': (self.update_chi2_test, self.check_chi2), } # ---- create list of assertions to test ---- # assertions = ['exist', 'chi2_test'] if assertions is None else assertions self._assertions_ = self._check_assertion_types(assertions) # ---- populate assertion tests with reference data ---- # for i in self._assertions_: self._possible_assertions[i][0](self.ref_data) # ---- populate list of tests to run and run tests ---- # self._tests_ = [self._possible_assertions_[i][1] for i in self._assertions_] @property def assertion_params(self): return self._assertion_params_ @property def _possible_assertions(self): return self._possible_assertions_ @property def assertions(self): return self._assertions_ @property def _tests(self): return self._tests_
[docs] def update_chi2_test(self, input_data): """Create partially evaluated chi2 contingency test. Uses `chi2_contingency test from scipy <https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html>`_. Parameters ---------- input_data : list or np.array This the reference data for the ks-test. All future data will be compared to this data. Returns ------- None """ input_data = np.array(input_data) if isinstance(input_data, list) else input_data assert len(input_data.shape) == 1, 'Input data not a single vector' _, counts = np.unique(input_data, return_counts=True) assert all([x >= 5 for x in counts]), \ 'Not enough data of each type for reliable Chi2 Contingency test. Need at least 5.' self.assertion_params['chi2_test'] = partial(_chi2_test, np.array(counts))
[docs] def update_exist(self, input_data): """Create input data for test checking whether all categorical outputs exist. Parameters ---------- input_data : list or np.array This the reference data for the check_exist. All future data will be compared to it. Returns ------- None """ input_data = np.array(input_data) if isinstance(input_data, list) else input_data assert len(input_data.shape) == 1, 'Input data not a single vector' self.assertion_params['cat_exists'] = np.unique(input_data)
[docs] def check_chi2(self, test_data): """Test whether test_data is similar to reference data. If the returned chi2-test-statistic is greater than the threshold (default 2), the test failed. The threshold is set by assertion_params['chi2_test']. Uses `chi2_contingency test from scipy <https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html>`_. Parameters ---------- test_data : list or np.array This the data that will be compared to the reference data. Returns ------- (string, bool) 2 item tuple with test name and boolean expressing whether passed test. """ assert self.assertion_params['chi2_test'], 'Must input or load reference data chi2-test' test_data = np.array(test_data) if isinstance(test_data, list) else test_data assert len(test_data.shape) == 1, 'Input data not a single vector' _, counts = np.unique(test_data, return_counts=True) assert all([x >= 5 for x in counts]), \ 'Not enough data of each type for reliable Chi2 Contingency test. '\ 'Need at least 5 values in each cell.' try: test_stat, p_value, _, _ = self.assertion_params['chi2_test'](counts) # pylint: disable=E1102 except ValueError: test_stat = 1000.0 p_value = 0.00 print('WARNING: NOT ALL CATEGORIES PRESENT') passed = True if test_stat <= self.assertion_params['chi2_stat'] else False pass_fail = 'Passed' if passed else 'Failed' if self.verbose: print('{0} chi2 check; test statistic={1:.4f}, p={2:.4f}'.format( pass_fail, float(test_stat), float(p_value))) return ('chi2', passed)
[docs] def check_exist(self, test_data): """Check that all distinct values present in test_data. If any values missing, then the function will return a False (rather than true). The expected values is controlled by assertion_params['cat_exists']. Parameters ---------- test_data : list or np.array This the data that will be compared to the reference data. Returns ------- (string, bool) 2 item tuple with test name and boolean expressing whether passed test. """ assert self.assertion_params['cat_exists'] is not None,\ 'Must input or load reference categories' test_data = np.array(test_data) if isinstance(test_data, list) else test_data assert len(test_data.shape) == 1, 'Input data not a single vector' obs = np.unique(np.array(test_data)) exp = list(self.assertion_params['cat_exists']) passed = True if all([x in exp for x in obs]) and all([x in obs for x in exp]) else False pass_fail = 'Passed' if passed else 'Failed' if self.verbose: print('{0} exist check; observed={1} (Expected {2})'.format(pass_fail, obs, exp)) return ('exist', passed)