#!/usr/bin/env python
"""Contains class PerfData and its subclasses, which are objects for collecting and computing model performance metrics
and predictions
"""
from textwrap import wrap
import sklearn.metrics
import deepchem as dc
import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_auc_score, confusion_matrix, average_precision_score, precision_score, recall_score
from sklearn.metrics import accuracy_score, matthews_corrcoef, cohen_kappa_score, log_loss, balanced_accuracy_score
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from atomsci.ddm.pipeline import transformations as trans
import pdb
# ******************************************************************************************************************************
[docs]
def rms_error(y_real, y_pred):
    """Calculates the root mean squared error. Score function used for model selection.

    Args:
        y_real (np.array): Array of ground truth values

        y_pred (np.array): Array of predicted values

    Returns:
        (np.float64): root mean squared error of the input
    """
    # RMSE is just the square root of sklearn's MSE
    mse = mean_squared_error(y_real, y_pred)
    return np.sqrt(mse)
# ---------------------------------------------
[docs]
def negative_predictive_value(y_real, y_pred):
    """Computes negative predictive value of a binary classification model: NPV = TN/(TN+FN).

    Args:
        y_real (np.array): Array of ground truth values

        y_pred (np.array): Array of predicted values

    Returns:
        (float): The negative predictive value
    """
    pred_negative = (y_pred == 0)
    true_neg = int(np.sum(pred_negative & (y_real == 0)))
    false_neg = int(np.sum(pred_negative & (y_real == 1)))
    total_pred_neg = true_neg + false_neg
    # Avoid division by zero when the model never predicts the negative class
    if total_pred_neg == 0:
        return 0.0
    return float(true_neg) / float(total_pred_neg)
# ******************************************************************************************************************************
# params.model_choice_score_type must be a key in one of the dictionaries below:
# Note that 'ppv' (positive predictive value) is an alias for 'precision'; both map to
# sklearn's precision_score.
regr_score_func = dict(r2 = r2_score, mae = mean_absolute_error, rmse = rms_error)
classif_score_func = dict(roc_auc = roc_auc_score, precision = precision_score, ppv = precision_score, recall = recall_score,
                          npv = negative_predictive_value, cross_entropy = log_loss, accuracy = accuracy_score, bal_accuracy = balanced_accuracy_score,
                          avg_precision = average_precision_score, mcc = matthews_corrcoef, kappa = cohen_kappa_score)

# The following score types are loss functions, meaning the result must be sign flipped so we can maximize it in model selection
loss_funcs = {'mae', 'rmse', 'cross_entropy'}

# The following score types for classification require predicted class probabilities rather than class labels as input
uses_class_probs = {'roc_auc', 'avg_precision', 'cross_entropy'}

# The following classification score types have an 'average' parameter to control how multilabel scores are combined
has_average_param = {'roc_auc', 'avg_precision', 'precision', 'recall'}

# The following classification score types allow the value 'binary' for the 'average' parameter to make them report scores
# for class 1 only
binary_average_param = {'precision', 'recall'}

# The following classification score types only support binary classifiers
binary_class_only = {'npv'}
# ******************************************************************************************************************************
[docs]
def create_perf_data(prediction_type, model_dataset, transformers, subset, **kwargs):
    """Factory function that creates the right kind of PerfData object for the given subset,
    prediction_type (classification, regression or hybrid) and split strategy (k-fold or
    train/valid/test).

    Args:
        prediction_type (str): classification, regression or hybrid.

        model_dataset (ModelDataset): Object representing the full dataset.

        transformers (list): A list of transformer objects.

        subset (str): Label in ['train', 'valid', 'test', 'full'], indicating the type of subset
        of dataset for tracking predictions

        **kwargs: Additional PerfData subclass arguments

    Returns:
        PerfData object

    Raises:
        ValueError: if split_strategy not in ['train_valid_test','k_fold_cv']

        ValueError: prediction_type not in ['regression','classification','hybrid']
    """
    # The 'full' subset is always scored in a single pass, regardless of how the
    # model itself was trained.
    if subset == 'full':
        split_strategy = 'train_valid_test'
    else:
        split_strategy = model_dataset.params.split_strategy

    if prediction_type == 'regression':
        if subset == 'full' or split_strategy == 'train_valid_test':
            # Called simple because no need to track compound IDs across multiple training folds
            return SimpleRegressionPerfData(model_dataset, transformers, subset, **kwargs)
        if split_strategy == 'k_fold_cv':
            return KFoldRegressionPerfData(model_dataset, transformers, subset, **kwargs)
        raise ValueError('Unknown split_strategy %s' % split_strategy)
    if prediction_type == 'classification':
        if subset == 'full' or split_strategy == 'train_valid_test':
            return SimpleClassificationPerfData(model_dataset, transformers, subset, **kwargs)
        if split_strategy == 'k_fold_cv':
            return KFoldClassificationPerfData(model_dataset, transformers, subset, **kwargs)
        raise ValueError('Unknown split_strategy %s' % split_strategy)
    if prediction_type == "hybrid":
        # Hybrid models use a single PerfData flavor for every split strategy
        return SimpleHybridPerfData(model_dataset, transformers, subset, **kwargs)
    raise ValueError('Unknown prediction type %s' % prediction_type)
# ****************************************************************************************
[docs]
class PerfData(object):
    """Abstract interface for accumulating prediction data over multiple cross-validation
    folds and computing performance metrics after all folds have been run. Concrete
    subclasses exist for classification and regression models.
    """
    # ****************************************************************************************
    def __init__(self, model_dataset, subset):
        """Initialize any attributes that are common to all PerfData subclasses"""

    # ****************************************************************************************
    def accumulate_preds(self, predicted_vals, ids, pred_stds=None):
        """Add a batch of predictions to the accumulated data.

        Raises:
            NotImplementedError: The method is implemented by subclasses
        """
        raise NotImplementedError

    # ****************************************************************************************
    def get_pred_values(self):
        """Return the accumulated predicted values.

        Raises:
            NotImplementedError: The method is implemented by subclasses
        """
        raise NotImplementedError

    # ****************************************************************************************
    def get_real_values(self, ids=None):
        """Return the ground-truth response values.

        Raises:
            NotImplementedError: The method is implemented by subclasses
        """
        raise NotImplementedError

    # ****************************************************************************************
    def get_weights(self, ids=None):
        """Returns the dataset response weights as an (ncmpds, ntasks) array

        Raises:
            NotImplementedError: The method is implemented by subclasses
        """
        raise NotImplementedError

    # ****************************************************************************************
    def compute_perf_metrics(self, per_task=False):
        """Compute performance metrics from the accumulated predictions.

        Raises:
            NotImplementedError: The method is implemented by subclasses
        """
        raise NotImplementedError

    # ****************************************************************************************
    def get_prediction_results(self):
        """Return a dictionary of performance metric values.

        Raises:
            NotImplementedError: The method is implemented by subclasses
        """
        raise NotImplementedError

    # ****************************************************************************************
    def _reshape_preds(self, predicted_vals):
        """Reshape raw model output into this class's standard prediction format.

        Raises:
            NotImplementedError: The method is implemented by subclasses
        """
        raise NotImplementedError
# ****************************************************************************************
[docs]
class RegressionPerfData(PerfData):
    """Accumulates regression model prediction data over multiple cross-validation
    folds and computes performance metrics once all folds have been run. Abstract
    class with concrete subclasses for different split strategies.

    Attributes:
        set in __init__
            num_tasks (int): Set to None, the number of tasks

            num_cmpds (int): Set to None, the number of compounds
    """
    # ****************************************************************************************
    # class RegressionPerfData
    def __init__(self, model_dataset, subset):
        """Declare the attributes that methods of this class expect its subclasses
        to define. Subclasses don't actually call this superclass method.

        Side effects:
            num_tasks (int) is set as a RegressionPerfData attribute

            num_cmpds (int) is set as a RegressionPerfData attribute
        """
        self.num_tasks = None
        self.num_cmpds = None
        self.perf_metrics = []
        self.model_score = None
        self.weights = None

    # ****************************************************************************************
    def accumulate_preds(self, predicted_vals, ids, pred_stds=None):
        """Raises:
            NotImplementedError: The method is implemented by subclasses
        """
        raise NotImplementedError

    # ****************************************************************************************
    def get_pred_values(self):
        """Raises:
            NotImplementedError: The method is implemented by subclasses
        """
        raise NotImplementedError

    # ****************************************************************************************
    def compute_perf_metrics(self, per_task=False):
        """Raises:
            NotImplementedError: The method is implemented by subclasses
        """
        raise NotImplementedError

    # ****************************************************************************************
    # class RegressionPerfData
    def model_choice_score(self, score_type='r2'):
        """Computes a score function based on the accumulated predicted values, to be used
        for selecting the best training epoch and other hyperparameters.

        Args:
            score_type (str): The name of the scoring metric to be used; must be a key of
                regr_score_func ('r2', 'mae' or 'rmse'). Loss-type metrics are sign flipped
                so that larger values of the returned score always indicate better models.

        Returns:
            score (float): A score function value. For multitask models, this will be
            averaged over tasks.
        """
        ids, pred_vals, _stds = self.get_pred_values()
        real_vals = self.get_real_values(ids)
        weights = self.get_weights(ids)
        task_scores = []
        for task in range(self.num_tasks):
            # Restrict scoring to responses with nonzero weight (non-missing values)
            rows = np.where(weights[:, task] != 0)[0]
            truth = np.squeeze(real_vals[rows, task])
            guess = np.squeeze(pred_vals[rows, task])
            task_scores.append(regr_score_func[score_type](truth, guess))
        score = float(np.mean(task_scores))
        # Flip the sign of loss functions so model selection can always maximize the score
        self.model_score = -score if score_type in loss_funcs else score
        return self.model_score

    # ****************************************************************************************
    # class RegressionPerfData
    def get_prediction_results(self):
        """Assembles a JSON-friendly dictionary of performance metrics for a regression model.

        Returns:
            pred_results (dict): dictionary of performance metrics for a regression model,
            containing only primitive Python types.
        """
        pred_results = {}

        # R^2 is averaged over folds; its SD is None when only a single fold was trained.
        r2_means, r2_stds = self.compute_perf_metrics(per_task=True)
        pred_results['r2_score'] = float(np.mean(r2_means))
        if r2_stds is not None:
            pred_results['r2_std'] = float(np.sqrt(np.mean(r2_stds ** 2)))
        if self.num_tasks > 1:
            pred_results['task_r2_scores'] = r2_means.tolist()
            if r2_stds is not None:
                pred_results['task_r2_stds'] = r2_stds.tolist()

        # Unlike R^2, the MAE and RMSE metrics are computed once from the fold-averaged
        # predictions rather than per fold and then averaged; change this if per-fold
        # SDs of these metrics are ever needed.
        ids, pred_vals, _pred_stds = self.get_pred_values()
        real_vals = self.get_real_values(ids)
        weights = self.get_weights(ids)
        mae_scores = []
        rms_scores = []
        response_means = []
        response_stds = []
        # Iterate over tasks, calling the score functions on weight-masked values
        for task in range(self.num_tasks):
            rows = np.where(weights[:, task] != 0)[0]
            truth = np.squeeze(real_vals[rows, task])
            guess = np.squeeze(pred_vals[rows, task])
            mae_scores.append(regr_score_func['mae'](truth, guess))
            rms_scores.append(regr_score_func['rmse'](truth, guess))
            response_means.append(truth.mean().tolist())
            response_stds.append(truth.std().tolist())
        pred_results['mae_score'] = float(np.mean(mae_scores))
        if self.num_tasks > 1:
            pred_results['task_mae_scores'] = mae_scores
        pred_results['rms_score'] = float(np.mean(rms_scores))
        if self.num_tasks > 1:
            pred_results['task_rms_scores'] = rms_scores
        # Add model choice score if one was computed
        if self.model_score is not None:
            pred_results['model_choice_score'] = self.model_score
        pred_results['num_compounds'] = self.num_cmpds
        pred_results['mean_response_vals'] = response_means
        pred_results['std_response_vals'] = response_stds
        return pred_results

    # ****************************************************************************************
    # class RegressionPerfData
    def _reshape_preds(self, predicted_vals):
        """Reshape an array of regression model predictions to a standard (ncmpds, ntasks)
        format. Checks that the task dimension matches what we expect for the dataset.

        Args:
            predicted_vals (np.array): array of regression model predictions.

        Returns:
            predicted_vals (np.array): reshaped array

        Raises:
            ValueError: if the dimensions of the predicted values do not match num_tasks for
                RegressionPerfData
        """
        # Depending on the underlying DeepChem model, predictions may arrive as a
        # 1D, 2D or 3D array.
        ncmpds = predicted_vals.shape[0]
        ndim = predicted_vals.ndim
        if ndim == 1:
            # Single task model
            ntasks = 1
            predicted_vals = predicted_vals.reshape((ncmpds, 1))
        else:
            ntasks = predicted_vals.shape[1]
        if ntasks != self.num_tasks:
            raise ValueError("Predicted value dimensions don't match num_tasks for RegressionPerfData")
        if ndim == 3:
            # FCNet models generate predictions with an extra dimension, possibly for the
            # number of classes, which is always 1 for regression models.
            predicted_vals = predicted_vals.reshape((ncmpds, ntasks))
        return predicted_vals
# ****************************************************************************************
# ****************************************************************************************
[docs]
class HybridPerfData(PerfData):
    """Class with methods for accumulating hybrid regression model prediction data over
    multiple cross-validation folds and computing performance metrics after all folds have
    been run. The two tasks are a dose-response activity (pKi/IC50) and a single-concentration
    binding activity. Abstract class with concrete subclasses for different split strategies.

    Attributes:
        set in __init__
            num_tasks (int): Set to 2, the number of tasks

            num_cmpds (int): Set to None, the number of compounds
    """
    # ****************************************************************************************
    # class HybridPerfData
    def __init__(self, model_dataset, subset):
        """Initialize any attributes that are common to all HybridPerfData subclasses.

        Side effects:
            num_tasks (int) is set as a HybridPerfData attribute

            num_cmpds (int) is set as a HybridPerfData attribute
        """
        # The code below is to document the attributes that methods in this class expect the
        # subclasses to define. Subclasses don't actually call this superclass method.
        self.num_tasks = 2
        self.num_cmpds = None
        self.perf_metrics = []
        self.model_score = None
        self.weights = None

    # ****************************************************************************************
    def accumulate_preds(self, predicted_vals, ids, pred_stds=None):
        """Raises:
            NotImplementedError: The method is implemented by subclasses
        """
        raise NotImplementedError

    # ****************************************************************************************
    def get_pred_values(self):
        """Raises:
            NotImplementedError: The method is implemented by subclasses
        """
        raise NotImplementedError

    # ****************************************************************************************
    def compute_perf_metrics(self, per_task=False):
        """Raises:
            NotImplementedError: The method is implemented by subclasses
        """
        raise NotImplementedError

    # ****************************************************************************************
    # class HybridPerfData
    def model_choice_score(self, score_type='r2'):
        """Computes a score function based on the accumulated predicted values, to be used for selecting
        the best training epoch and other hyperparameters.

        Args:
            score_type (str): The name of the scoring metric to be used: 'r2', 'mae' or 'rmse'
                (a key of regr_score_func). Loss-type metrics are sign flipped so larger
                returned values always indicate better models.

        Returns:
            score (float): A score function value, averaged over the dose-response and
            single-concentration subsets of the data.
        """
        ids, pred_vals, stds = self.get_pred_values()
        real_vals = self.get_real_values(ids)
        weights = self.get_weights(ids)
        scores = []
        # Rows with NaN in column 1 are treated as dose-response (pKi/IC50) measurements;
        # rows with a value in column 1 as single-concentration binding measurements.
        # NOTE(review): inferred from the branch comments below -- confirm against the
        # hybrid dataset format.
        pos_ki = np.where(np.isnan(real_vals[:, 1]))[0]
        pos_bind = np.where(~np.isnan(real_vals[:, 1]))[0]
        # score for pKi/IC50
        nzrows = np.where(weights[:, 0] != 0)[0]
        rowki = np.intersect1d(nzrows, pos_ki)
        rowbind = np.intersect1d(nzrows, pos_bind)
        ki_real_vals = np.squeeze(real_vals[rowki,0])
        ki_pred_vals = np.squeeze(pred_vals[rowki,0])
        bind_real_vals = np.squeeze(real_vals[rowbind,0])
        bind_pred_vals = np.squeeze(pred_vals[rowbind,0])
        if len(rowki) > 0:
            scores.append(regr_score_func[score_type](ki_real_vals, ki_pred_vals))
            if len(rowbind) > 0:
                scores.append(regr_score_func[score_type](bind_real_vals, bind_pred_vals))
            else:
                # if all values are dose response activities, use the r2_score above.
                scores.append(scores[0])
        elif len(rowbind) > 0:
            # all values are single concentration activities.
            scores.append(regr_score_func[score_type](bind_real_vals, bind_pred_vals))
            scores.append(scores[0])
        # NOTE(review): if both subsets are empty, scores is empty and np.mean([]) yields
        # NaN with a RuntimeWarning -- presumably the dataset always has at least one row.
        self.model_score = float(np.mean(scores))
        if score_type in loss_funcs:
            # Flip sign for loss functions so model selection can maximize the score
            self.model_score = -self.model_score
        return self.model_score

    # ****************************************************************************************
    # class HybridPerfData
    def get_prediction_results(self):
        """Returns a dictionary of performance metrics for a hybrid regression model.
        The dictionary values should contain only primitive Python types, so that it can
        be easily JSONified.

        Returns:
            pred_results (dict): dictionary of performance metrics for a hybrid regression model.
        """
        pred_results = {}

        # Get the mean and SD of R^2 scores over folds. If only single fold training was done, the SD will be None.
        r2_means, r2_stds = self.compute_perf_metrics(per_task=True)
        pred_results['r2_score'] = float(np.mean(r2_means))
        if r2_stds is not None:
            pred_results['r2_std'] = float(np.sqrt(np.mean(r2_stds ** 2)))
        if self.num_tasks > 1:
            pred_results['task_r2_scores'] = r2_means.tolist()
            if r2_stds is not None:
                pred_results['task_r2_stds'] = r2_stds.tolist()

        # Compute some other performance metrics. We do these differently than R^2, in that we compute the
        # metrics from the average predicted values, rather than computing them separately for each fold
        # and then averaging the metrics. If people start asking for SDs of MAE and RMSE scores over folds,
        # we'll change the code to compute all metrics the same way.
        (ids, pred_vals, pred_stds) = self.get_pred_values()
        real_vals = self.get_real_values(ids)
        weights = self.get_weights(ids)
        mae_scores = []
        rms_scores = []
        response_means = []
        response_stds = []
        # Same dose-response / single-concentration row split as in model_choice_score()
        pos_ki = np.where(np.isnan(real_vals[:, 1]))[0]
        pos_bind = np.where(~np.isnan(real_vals[:, 1]))[0]
        # score for pKi/IC50
        nzrows = np.where(weights[:, 0] != 0)[0]
        rowki = np.intersect1d(nzrows, pos_ki)
        rowbind = np.intersect1d(nzrows, pos_bind)
        ki_real_vals = np.squeeze(real_vals[rowki,0])
        ki_pred_vals = np.squeeze(pred_vals[rowki,0])
        bind_real_vals = np.squeeze(real_vals[rowbind,0])
        bind_pred_vals = np.squeeze(pred_vals[rowbind,0])
        if len(rowki) > 0:
            mae_scores.append(regr_score_func['mae'](ki_real_vals, ki_pred_vals))
            rms_scores.append(regr_score_func['rmse'](ki_real_vals, ki_pred_vals))
            if len(rowbind) > 0:
                mae_scores.append(regr_score_func['mae'](bind_real_vals, bind_pred_vals))
                rms_scores.append(regr_score_func['rmse'](bind_real_vals, bind_pred_vals))
            else:
                # if all values are dose response activities, use the r2_score above.
                mae_scores.append(mae_scores[0])
                rms_scores.append(rms_scores[0])
        elif len(rowbind) > 0:
            # all values are single concentration activities.
            mae_scores.append(regr_score_func['mae'](bind_real_vals, bind_pred_vals))
            rms_scores.append(regr_score_func['rmse'](bind_real_vals, bind_pred_vals))
            mae_scores.append(mae_scores[0])
            rms_scores.append(rms_scores[0])
        # NOTE(review): the response summaries below are computed even when one subset
        # is empty, in which case numpy's zero-length mean/std return NaN -- confirm this
        # is the intended behavior for purely single-concentration or purely dose-response
        # datasets.
        response_means.append(ki_real_vals.mean().tolist())
        response_stds.append(ki_real_vals.std().tolist())
        response_means.append(bind_real_vals.mean().tolist())
        response_stds.append(bind_real_vals.std().tolist())
        pred_results['mae_score'] = float(np.mean(mae_scores))
        if self.num_tasks > 1:
            pred_results['task_mae_scores'] = mae_scores
        pred_results['rms_score'] = float(np.mean(rms_scores))
        if self.num_tasks > 1:
            pred_results['task_rms_scores'] = rms_scores

        # Add model choice score if one was computed
        if self.model_score is not None:
            pred_results['model_choice_score'] = self.model_score
        pred_results['num_compounds'] = self.num_cmpds
        pred_results['mean_response_vals'] = response_means
        pred_results['std_response_vals'] = response_stds
        return pred_results

    # ****************************************************************************************
    # class HybridPerfData
    def _reshape_preds(self, predicted_vals):
        """Returns an array of hybrid model predictions in the standard (ncmpds, ntasks)
        format.

        Args:
            predicted_vals (np.array): array of hybrid model predictions.

        Returns:
            predicted_vals (np.array): the input array, unchanged.
        """
        # The hybrid model is highly specific; its predictions already arrive in the
        # expected shape, so there is no need to reshape.
        return predicted_vals
# ****************************************************************************************
# ****************************************************************************************
[docs]
class ClassificationPerfData(PerfData):
    """Class with methods for accumulating classification model prediction data over multiple
    cross-validation folds and computing performance metrics after all folds have been run.
    Abstract class with concrete subclasses for different split strategies.

    Attributes:
        set in __init__
            num_tasks (int): Set to None, the number of tasks

            num_cmpds (int): Set to None, the number of compounds

            num_classes (int): Set to None, the number of classes
    """
    # ****************************************************************************************
    # class ClassificationPerfData
    def __init__(self, model_dataset, subset):
        """Initialize any attributes that are common to all ClassificationPerfData subclasses

        Side effects:
            num_tasks (int) is set as a ClassificationPerfData attribute

            num_cmpds (int) is set as a ClassificationPerfData attribute

            num_classes (int) is set as a ClassificationPerfData attribute
        """
        # TODO: Allow num_classes to vary between tasks in a multitask, multilabel model.
        # This would require making self.num_classes a list or array. Also, the _reshape_preds method
        # would have to change to accept and generate lists of (ncmpds, nclasses) arrays, rather than
        # the 3D (ncmpds, ntasks, nclasses) arrays generated by DeepChem predict() methods; and downstream
        # code would have to be modified to deal with these lists. Recommend we hold off dealing with this
        # until we have some multitask/label datasets and models where it will be necessary.

        # The code below is to document the attributes that methods in this class expect the
        # subclasses to define. Subclasses don't actually call this superclass method.
        self.num_tasks = None
        self.num_cmpds = None
        self.num_classes = None
        self.perf_metrics = []
        self.model_score = None
        self.weights = None

    # ****************************************************************************************
    def accumulate_preds(self, predicted_vals, ids, pred_stds=None):
        """Raises:
            NotImplementedError: The method is implemented by subclasses
        """
        raise NotImplementedError

    # ****************************************************************************************
    def get_pred_values(self):
        """Raises:
            NotImplementedError: The method is implemented by subclasses
        """
        raise NotImplementedError

    # ****************************************************************************************
    # class ClassificationPerfData
    def model_choice_score(self, score_type='roc_auc'):
        """Computes a score function based on the accumulated predicted values, to be used for selecting
        the best training epoch and other hyperparameters.

        Args:
            score_type (str): The name of the scoring metric to be used, e.g. 'roc_auc',
                'precision', 'recall', 'npv'; must be a key of classif_score_func.
                Loss-type metrics are sign flipped so larger values always indicate
                better models.

        Returns:
            score (float): A score function value. For multitask models, this will be averaged
            over tasks.

        Raises:
            ValueError: if score_type only supports binary classifiers but the dataset
                has more than 2 classes.
        """
        ids, pred_classes, class_probs, prob_stds = self.get_pred_values()
        real_vals = self.get_real_values()
        weights = self.get_weights()
        scores = []
        for i in range(self.num_tasks):
            # Restrict scoring to responses with nonzero weight (non-missing values)
            nzrows = np.where(weights[:,i] != 0)[0]
            average_param = None
            if self.num_classes > 2:
                if score_type in binary_class_only:
                    raise ValueError("Model selection by %s score not allowed for multi-label classifiers." % score_type)
                if score_type in has_average_param:
                    # Combine per-class scores with an unweighted (macro) average
                    average_param = 'macro'
                # If more than 2 classes, task_real_vals is indicator matrix (one-hot encoded).
                task_real_vals = real_vals[nzrows,i,:]
                task_class_probs = class_probs[nzrows,i,:]
                task_real_classes = np.argmax(task_real_vals, axis=1)
                task_pred_classes = np.argmax(task_class_probs, axis=1)
            else:
                # sklearn metrics functions are expecting single array of 1s and 0s for task_real_vals
                # and task_class_probs for class 1 only
                task_real_vals = real_vals[nzrows,i]
                task_real_classes = task_real_vals
                task_class_probs = class_probs[nzrows,i,1]
                if score_type in binary_average_param:
                    # Report the score for class 1 only
                    average_param = 'binary'
            # Probability-based metrics get class probabilities; the rest get class labels
            if score_type in uses_class_probs:
                task_pred_vars = task_class_probs
                task_real_vars = task_real_vals
            else:
                task_pred_vars = pred_classes[nzrows,i]
                task_real_vars = task_real_classes
            if average_param is None:
                scores.append(classif_score_func[score_type](task_real_vars, task_pred_vars))
            else:
                scores.append(classif_score_func[score_type](task_real_vars, task_pred_vars, average=average_param))
        self.model_score = float(np.mean(scores))
        if score_type in loss_funcs:
            # Flip sign for loss functions so model selection can maximize the score
            self.model_score = -self.model_score
        return self.model_score

    # ****************************************************************************************
    # class ClassificationPerfData
    def get_prediction_results(self):
        """Returns a dictionary of performance metrics for a classification model.
        The dictionary values will contain only primitive Python types, so that it can
        be easily JSONified.

        Returns:
            pred_results (dict): dictionary of performance metrics for a classification model.
        """
        pred_results = {}
        (ids, pred_classes, class_probs, prob_stds) = self.get_pred_values()
        real_vals = self.get_real_values(ids)
        weights = self.get_weights(ids)
        # NOTE(review): real_val_list and class_prob_list are built in both branches but
        # never used below -- candidates for removal.
        if self.num_classes > 2:
            real_val_list = [real_vals[:,i,:] for i in range(self.num_tasks)]
            class_prob_list = [class_probs[:,i,:] for i in range(self.num_tasks)]
            real_classes = np.argmax(real_vals, axis=2)
        else:
            real_classes = real_vals
            real_val_list = [real_vals[:,i] for i in range(self.num_tasks)]
            class_prob_list = [class_probs[:,i,1] for i in range(self.num_tasks)]

        # Get the mean and SD of ROC AUC scores over folds. If only single fold training was done, the SD will be None.
        roc_auc_means, roc_auc_stds = self.compute_perf_metrics(per_task=True)
        pred_results['roc_auc_score'] = float(np.mean(roc_auc_means))
        if roc_auc_stds is not None:
            pred_results['roc_auc_std'] = float(np.sqrt(np.mean(roc_auc_stds ** 2)))
        if self.num_tasks > 1:
            pred_results['task_roc_auc_scores'] = roc_auc_means.tolist()
            if roc_auc_stds is not None:
                pred_results['task_roc_auc_stds'] = roc_auc_stds.tolist()

        # Compute some other performance metrics. We do these differently than ROC AUC, in that we compute the
        # metrics from the average predicted values, rather than computing them separately for each fold
        # and then averaging the metrics. If people start asking for SDs of the other metrics over folds,
        # we'll change the code to compute all metrics the same way.
        prc_aucs = []
        cross_entropies = []
        precisions = []
        recalls = []
        # NPV is only defined for binary classifiers
        if self.num_classes == 2:
            npvs = []
        accuracies = []
        bal_accs = []
        kappas = []
        matthews_ccs = []
        confusion_matrices = []
        for i in range(self.num_tasks):
            nzrows = np.where(weights[:,i] != 0)[0]
            task_pred_classes = pred_classes[nzrows,i]
            task_real_classes = real_classes[nzrows,i]
            if self.num_classes > 2:
                # If more than 2 classes, task_real_vals is indicator matrix (one-hot encoded).
                task_real_vals = real_vals[nzrows,i,:]
                task_class_probs = class_probs[nzrows,i,:]
                prc_aucs.append(average_precision_score(task_real_vals, task_class_probs, average='macro'))
                precisions.append(float(precision_score(task_real_classes, task_pred_classes, average='macro')))
                recalls.append(float(recall_score(task_real_classes, task_pred_classes, average='macro')))
                # NPV is not supported for multilabel classifiers, skip it
            else:
                # sklearn metrics functions are expecting single array of 1s and 0s for task_real_vals
                # and task_class_probs for class 1 only
                task_real_vals = real_vals[nzrows,i]
                task_class_probs = class_probs[nzrows,i,1]
                prc_aucs.append(average_precision_score(task_real_vals, task_class_probs))
                precisions.append(float(precision_score(task_real_vals, task_pred_classes, average='binary')))
                recalls.append(float(recall_score(task_real_vals, task_pred_classes, average='binary')))
                npvs.append(negative_predictive_value(task_real_vals, task_pred_classes))
            # Metrics below work for both binary and multiclass tasks
            cross_entropies.append(log_loss(task_real_vals, task_class_probs))
            accuracies.append(accuracy_score(task_real_classes, task_pred_classes))
            bal_accs.append(balanced_accuracy_score(task_real_classes, task_pred_classes))
            kappas.append(float(cohen_kappa_score(task_real_classes, task_pred_classes)))
            matthews_ccs.append(float(matthews_corrcoef(task_real_classes, task_pred_classes)))
            confusion_matrices.append(confusion_matrix(task_real_classes, task_pred_classes).tolist())
        pred_results['prc_auc_score'] = float(np.mean(prc_aucs))
        if self.num_tasks > 1:
            pred_results['task_prc_auc_scores'] = prc_aucs
        pred_results['cross_entropy'] = float(np.mean(cross_entropies))
        if self.num_tasks > 1:
            pred_results['task_cross_entropies'] = cross_entropies
        # Add model choice score if one was computed
        if self.model_score is not None:
            pred_results['model_choice_score'] = self.model_score
        pred_results['precision'] = float(np.mean(precisions))
        if self.num_tasks > 1:
            pred_results['task_precisions'] = precisions
        pred_results['recall_score'] = float(np.mean(recalls))
        if self.num_tasks > 1:
            pred_results['task_recalls'] = recalls
        if self.num_classes == 2:
            pred_results['npv'] = float(np.mean(npvs))
            if self.num_tasks > 1:
                pred_results['task_npvs'] = npvs
        pred_results['accuracy_score'] = float(np.mean(accuracies))
        if self.num_tasks > 1:
            pred_results['task_accuracies'] = accuracies
        pred_results['bal_accuracy'] = float(np.mean(bal_accs))
        if self.num_tasks > 1:
            pred_results['task_bal_accuracies'] = bal_accs
        pred_results['kappa'] = float(np.mean(kappas))
        if self.num_tasks > 1:
            pred_results['task_kappas'] = kappas
        pred_results['matthews_cc'] = float(np.mean(matthews_ccs))
        if self.num_tasks > 1:
            pred_results['task_matthews_ccs'] = matthews_ccs
        pred_results['confusion_matrix'] = confusion_matrices
        pred_results['num_compounds'] = self.num_cmpds
        return pred_results

    # ****************************************************************************************
    # class ClassificationPerfData
    def _reshape_preds(self, predicted_vals):
        """Reshape an array of classification model predictions to a standard (ncmpds, ntasks, nclasses)
        format. Checks that the task and class dimensions match what we expect for the dataset.

        Args:
            predicted_vals (np.array): array of classification model predictions

        Returns:
            predicted_vals (np.array): reshaped array of classification model predictions

        Raises:
            ValueError: if the class or task dimension does not match num_classes or num_tasks
                for ClassificationPerfData
        """
        # For classification models, predicted_vals can be 2D or 3D array depending on whether the
        # underlying DeepChem or sklearn model supports multitask datasets.
        dim = len(predicted_vals.shape)
        ncmpds = predicted_vals.shape[0]
        if dim == 2:
            # Single task model
            ntasks = 1
            nclasses = predicted_vals.shape[1]
            predicted_vals = predicted_vals.reshape((ncmpds, 1, nclasses))
        else:
            ntasks = predicted_vals.shape[1]
            nclasses = predicted_vals.shape[2]
        if nclasses != self.num_classes:
            raise ValueError("Predicted value dimensions doesn't match num_classes for ClassificationPerfData")
        if ntasks != self.num_tasks:
            raise ValueError("Predicted value dimensions doesn't match num_tasks for ClassificationPerfData")
        return predicted_vals
# ****************************************************************************************
class KFoldRegressionPerfData(RegressionPerfData):
    """Class with methods for accumulating regression model prediction data over multiple
    cross-validation folds and computing performance metrics after all folds have been run.

    Attributes:
        Set in __init__:
            subset (str): Label of the type of subset of dataset for tracking predictions
            num_cmpds (int): The number of compounds in the dataset
            num_tasks (int): The number of tasks in the dataset
            pred_vals (dict): Maps each compound ID to an (nfolds_so_far, ntasks) array of
                predictions accumulated over folds
            folds (int): Number of folds accumulated so far; incremented by accumulate_preds()
            transformers (list of Transformer objects): from input arguments
            real_vals (dict): The dictionary containing the original response column values
    """

    # ****************************************************************************************
    # class KFoldRegressionPerfData
    def __init__(self, model_dataset, transformers, subset, transformed=True):
        """Initialize attributes common to all KFoldRegressionPerfData objects.

        Args:
            model_dataset (ModelDataset object): contains the dataset and related methods
            transformers (list of transformer objects): the transformers used to transform the dataset
            subset (str): Label in ['train', 'valid', 'train_valid', 'test'], indicating the type of
                subset of dataset for tracking predictions
            transformed (bool): True if values to be passed to accumulate_preds() are transformed values

        Raises:
            ValueError: if subset is not one of the recognized labels

        Side effects:
            Sets the attributes listed in the class docstring, plus perf_metrics (list of
            per-fold score arrays), model_score and weights.
        """
        self.subset = subset
        # 'train', 'valid' and 'train_valid' all use the combined training data, since in k-fold
        # CV each compound appears in the training set of some folds and the validation set of others.
        if self.subset in ('train', 'valid', 'train_valid'):
            dataset = model_dataset.combined_training_data()
        elif self.subset == 'test':
            dataset = model_dataset.test_dset
        else:
            raise ValueError('Unknown dataset subset type "%s"' % self.subset)
        self.num_cmpds = dataset.y.shape[0]
        self.num_tasks = dataset.y.shape[1]
        # Each compound starts with an empty (0, ntasks) array; accumulate_preds() appends one
        # row per fold in which the compound receives a prediction.
        self.pred_vals = dict([(id, np.empty((0, self.num_tasks), dtype=np.float32)) for id in dataset.ids])
        self.folds = 0
        self.perf_metrics = []
        self.model_score = None
        # Want predictions and real values to be in the same space, either transformed or untransformed
        if transformed:
            # Predictions passed to accumulate_preds() will be transformed
            self.real_vals, self.weights = model_dataset.get_subset_responses_and_weights(self.subset, [])
            self.transformers = transformers
        else:
            # If these were never transformed, transformers will be [], which is fine with undo_transforms
            self.real_vals, self.weights = model_dataset.get_subset_responses_and_weights(self.subset, transformers)
            self.transformers = []

    # ****************************************************************************************
    # class KFoldRegressionPerfData
    def accumulate_preds(self, predicted_vals, ids, pred_stds=None):
        """Add training, validation or test set predictions from the current fold to the data
        structure where we keep track of them.

        Args:
            predicted_vals (np.array): Array of the predicted values for the current dataset;
                may be 1D, 2D or 3D depending on the underlying model type
            ids (np.array): An np.array of compound ids for the current dataset
            pred_stds (np.array): An array of the standard deviation in the predictions, not used
                in this method

        Returns:
            float: mean R^2 score over tasks for this fold's predictions

        Raises:
            ValueError: If predicted value dimensions don't match num_tasks for RegressionPerfData

        Side effects:
            Appends one row of predictions per compound to pred_vals;
            increments folds by 1; appends per-task R^2 scores to perf_metrics.
        """
        # For regression models, predicted_vals can be 1D, 2D or 3D array depending on the type of
        # underlying DeepChem model. Reshape the array to (ncmpds, ntasks) regardless.
        dim = len(predicted_vals.shape)
        ncmpds = predicted_vals.shape[0]
        if dim == 1:
            # Single task model
            predicted_vals = predicted_vals.reshape((ncmpds,1))
            ntasks = 1
        else:
            ntasks = predicted_vals.shape[1]
            if ntasks != self.num_tasks:
                raise ValueError("Predicted value dimensions don't match num_tasks for RegressionPerfData")
            if dim == 3:
                # FCNet models generate predictions with an extra dimension, possibly for the number of
                # classes, which is always 1 for regression models.
                predicted_vals = predicted_vals.reshape((ncmpds,ntasks))
        for i, id in enumerate(ids):
            self.pred_vals[id] = np.concatenate([self.pred_vals[id], predicted_vals[i,:].reshape((1,-1))], axis=0)
        self.folds += 1
        # Score this fold's predictions in untransformed space so the metric is comparable
        # regardless of whether the model was trained on transformed responses.
        pred_vals = dc.trans.undo_transforms(predicted_vals, self.transformers)
        real_vals = self.get_real_values(ids)
        weights = self.get_weights(ids)
        scores = []
        for i in range(self.num_tasks):
            # Mask out zero-weight (missing-response) compounds for this task.
            nzrows = np.where(weights[:,i] != 0)[0]
            task_real_vals = np.squeeze(real_vals[nzrows,i])
            task_pred_vals = np.squeeze(pred_vals[nzrows,i])
            scores.append(regr_score_func['r2'](task_real_vals, task_pred_vals))
        self.perf_metrics.append(np.array(scores))
        return float(np.mean(scores))

    # ****************************************************************************************
    # class KFoldRegressionPerfData
    def get_pred_values(self):
        """Returns the predicted values accumulated over training, with any transformations undone.

        If self.subset is 'train', 'train_valid' or 'test', the function returns averages over the
        training folds for each compound along with standard deviations when there are predictions
        from multiple folds. Otherwise, returns a single predicted value for each compound.

        Returns:
            ids (np.array): list of compound IDs, in sorted order
            vals (np.array): (ncmpds, ntasks) array of mean predicted values
            fold_stds (np.array): (ncmpds, ntasks) array of standard deviations over folds if
                applicable, and None otherwise.
        """
        ids = sorted(self.pred_vals.keys())
        if self.subset in ['train', 'test', 'train_valid']:
            # Average each compound's predictions over the folds it appeared in.
            rawvals = np.concatenate([self.pred_vals[id].mean(axis=0, keepdims=True).reshape((1,-1)) for id in ids])
            vals = dc.trans.undo_transforms(rawvals, self.transformers)
            if self.folds > 1:
                # NOTE(review): undo_transforms is applied directly to the per-fold std deviations;
                # for a NormalizationTransformer that also adds back the mean — confirm this is intended.
                stds = dc.trans.undo_transforms(np.concatenate([self.pred_vals[id].std(axis=0, keepdims=True).reshape((1,-1))
                                                                for id in ids]), self.transformers)
            else:
                stds = None
        else:
            rawvals = np.concatenate([self.pred_vals[id].reshape((1,-1)) for id in ids], axis=0)
            vals = dc.trans.undo_transforms(rawvals, self.transformers)
            stds = None
        return (ids, vals, stds)

    # ****************************************************************************************
    # class KFoldRegressionPerfData
    def get_real_values(self, ids=None):
        """Returns the real dataset response values, with any transformations undone, as an
        (ncmpds, ntasks) array in the same ID order as get_pred_values() (unless ids is specified).

        Args:
            ids (list of str): Optional list of compound IDs to return values for.

        Returns:
            np.array: (ncmpds, ntasks) array of the real dataset response values, with any
            transformations undone, in the same ID order as get_pred_values().
        """
        if ids is None:
            ids = sorted(self.pred_vals.keys())
        real_vals = np.concatenate([self.real_vals[id].reshape((1,-1)) for id in ids], axis=0)
        return dc.trans.undo_transforms(real_vals, self.transformers)

    # ****************************************************************************************
    # class KFoldRegressionPerfData
    def get_weights(self, ids=None):
        """Returns the dataset response weights, as an (ncmpds, ntasks) array in the same ID order
        as get_pred_values() (unless ids is specified).

        Args:
            ids (list of str): Optional list of compound IDs to return values for.

        Returns:
            np.array: (ncmpds, ntasks) array of the dataset response weights, in the same ID
            order as get_pred_values().
        """
        if ids is None:
            ids = sorted(self.pred_vals.keys())
        return np.concatenate([self.weights[id].reshape((1,-1)) for id in ids], axis=0)

    # ****************************************************************************************
    # class KFoldRegressionPerfData
    def compute_perf_metrics(self, per_task=False):
        """Computes the R-squared metrics for each task based on the accumulated values, averaged
        over training folds, along with standard deviations of the scores. If per_task is False,
        the scores are averaged over tasks and the overall standard deviation is reported instead.

        Args:
            per_task (bool): True if calculating per-task metrics, False otherwise.

        Returns:
            A tuple (r2_mean, r2_std):
                r2_mean: A numpy array of mean R^2 scores for each task, averaged over folds, if
                    per_task is True. Otherwise, a float giving the R^2 score averaged over both
                    folds and tasks.
                r2_std: A numpy array of standard deviations over folds of R^2 values, if per_task
                    is True. Otherwise, a float giving the overall standard deviation.
        """
        # perf_metrics holds one (ntasks,) score array per fold, appended by accumulate_preds().
        r2_scores = np.stack(self.perf_metrics)
        if per_task:
            r2_mean = np.mean(r2_scores, axis=0)
            r2_std = np.std(r2_scores, axis=0)
        else:
            r2_mean = np.mean(r2_scores.flatten())
            r2_std = np.std(r2_scores.flatten())
        return (r2_mean, r2_std)
# ****************************************************************************************
class KFoldClassificationPerfData(ClassificationPerfData):
    """Class with methods for accumulating classification model performance data over multiple
    cross-validation folds and computing performance metrics after all folds have been run.

    Attributes:
        Set in __init__:
            subset (str): Label of the type of subset of dataset for tracking predictions
            num_cmpds (int): The number of compounds in the dataset
            num_tasks (int): The number of tasks in the dataset
            pred_vals (dict): Maps each compound ID to an (nfolds_so_far, ntasks, nclasses)
                array of accumulated class probabilities
            folds (int): Number of folds accumulated so far; incremented by accumulate_preds()
            transformers (list of Transformer objects): from input arguments
            real_vals (dict): The original response column values, one-hot encoded
                when num_classes > 2
            num_classes (int): The number of classes to predict on
    """

    # ****************************************************************************************
    # class KFoldClassificationPerfData
    def __init__(self, model_dataset, transformers, subset, predict_probs=True, transformed=True):
        """Initialize attributes common to all KFoldClassificationPerfData objects.

        Args:
            model_dataset (ModelDataset object): contains the dataset and related methods
            transformers (list of transformer objects): the transformers used to transform the dataset
            subset (str): Label in ['train', 'valid', 'train_valid', 'test'], indicating the type
                of subset of dataset for tracking predictions
            predict_probs (bool): True if the classifier supports probabilistic predictions,
                False otherwise
            transformed (bool): True if values to be passed to accumulate_preds() are transformed values

        Raises:
            ValueError: if subset is not one of the recognized labels
            NotImplementedError: if predict_probs is not True; non-probabilistic classifiers are
                not supported yet

        Side effects:
            Sets the attributes listed in the class docstring, plus real_classes (raw class
            labels), weights, perf_metrics and model_score.
        """
        self.subset = subset
        # 'train', 'valid' and 'train_valid' all use the combined training data, since in k-fold
        # CV each compound appears in the training set of some folds and the validation set of others.
        if self.subset in ('train', 'valid', 'train_valid'):
            dataset = model_dataset.combined_training_data()
        elif self.subset == 'test':
            dataset = model_dataset.test_dset
        else:
            raise ValueError('Unknown dataset subset type "%s"' % self.subset)
        # All currently used classifiers generate class probabilities in their predict methods;
        # if in the future we implement a classification algorithm such as kNN that doesn't support
        # probabilistic predictions, the ModelWrapper for that classifier should pass predict_probs=False
        # when constructing this object. When that happens, modify the code in this class to support
        # this option.
        if not predict_probs:
            raise NotImplementedError("Need to add support for non-probabilistic classifiers")
        self.num_cmpds = dataset.y.shape[0]
        self.num_tasks = dataset.y.shape[1]
        # The class count is derived from the full dataset (not the subset), so every subset
        # agrees on num_classes even if a subset happens to lack some class.
        self.num_classes = len(set(model_dataset.dataset.y.flatten()))
        # Each compound starts with an empty (0, ntasks, nclasses) array; accumulate_preds()
        # appends one row per fold in which the compound receives a prediction.
        self.pred_vals = dict([(id, np.empty((0, self.num_tasks, self.num_classes), dtype=np.float32)) for id in dataset.ids])
        real_vals, self.weights = model_dataset.get_subset_responses_and_weights(self.subset, [])
        self.real_classes = real_vals
        # Change real_vals to one-hot encoding
        if self.num_classes > 2:
            self.real_vals = dict([(id,
                                    np.concatenate([dc.metrics.to_one_hot(np.array([class_labels[j]]), self.num_classes)
                                                    for j in range(self.num_tasks)], axis=0))
                                   for id, class_labels in real_vals.items()])
        else:
            self.real_vals = real_vals
        self.folds = 0
        self.perf_metrics = []
        self.model_score = None
        if transformed:
            # Predictions passed to accumulate_preds() will be transformed
            self.transformers = transformers
        else:
            self.transformers = []

    # ****************************************************************************************
    # class KFoldClassificationPerfData
    def accumulate_preds(self, predicted_vals, ids, pred_stds=None):
        """Add training, validation or test set predictions from the current fold to the data
        structure where we keep track of them.

        Args:
            predicted_vals (np.array): Array of predicted class probabilities for the current dataset
            ids (np.array): An np.array of compound ids for the current dataset
            pred_stds (np.array): An array of the standard deviation in the predictions, not used
                in this method

        Returns:
            float: mean ROC AUC score over tasks for this fold's predictions

        Side effects:
            Appends one row of class probabilities per compound to pred_vals;
            increments folds by 1; appends per-task ROC AUC scores to perf_metrics.
        """
        class_probs = self._reshape_preds(predicted_vals)
        for i, id in enumerate(ids):
            self.pred_vals[id] = np.concatenate([self.pred_vals[id], class_probs[i,:,:].reshape((1,self.num_tasks,-1))], axis=0)
        self.folds += 1
        real_vals = self.get_real_values(ids)
        weights = self.get_weights(ids)
        # Break out different predictions for each task, with zero-weight compounds masked out, and compute per-task metrics
        scores = []
        for i in range(self.num_tasks):
            nzrows = np.where(weights[:,i] != 0)[0]
            if self.num_classes > 2:
                # If more than 2 classes, real_vals is indicator matrix (one-hot encoded).
                task_real_vals = np.squeeze(real_vals[nzrows,i,:])
                task_class_probs = dc.trans.undo_transforms(
                    np.squeeze(class_probs[nzrows,i,:]),
                    self.transformers)
                scores.append(roc_auc_score(task_real_vals, task_class_probs, average='macro'))
            else:
                # For binary classifier, sklearn metrics functions are expecting single array of 1s and 0s for real_vals_list,
                # and class_probs for class 1 only.
                task_real_vals = np.squeeze(real_vals[nzrows,i])
                task_class_probs = dc.trans.undo_transforms(
                    np.squeeze(class_probs[nzrows,i,1]),
                    self.transformers)
                scores.append(roc_auc_score(task_real_vals, task_class_probs))
        self.perf_metrics.append(np.array(scores))
        return float(np.mean(scores))

    # ****************************************************************************************
    # class KFoldClassificationPerfData
    def get_pred_values(self):
        """Returns the predicted values accumulated over training, with any transformations undone.

        If self.subset is 'train', 'train_valid' or 'test', the function returns the means and
        standard deviations of the class probabilities over the training folds for each compound,
        for each task. Otherwise, returns a single set of predicted probabilities for each
        validation set compound. For all subsets, returns the compound IDs and the most probable
        classes for each task.

        Returns:
            ids (list): list of compound IDs.
            pred_classes (np.array): an (ncmpds, ntasks) array of predicted classes.
            class_probs (np.array): a (ncmpds, ntasks, nclasses) array of predicted probabilities
                for the classes, and
            prob_stds (np.array): a (ncmpds, ntasks, nclasses) array of standard errors over folds
                for the class probability estimates (only available for the 'train', 'train_valid'
                and 'test' subsets; None otherwise).
        """
        ids = sorted(self.pred_vals.keys())
        if self.subset in ['train', 'test', 'train_valid']:
            # Average (and spread) each compound's fold probabilities before undoing transforms.
            class_probs = dc.trans.undo_transforms(np.concatenate([self.pred_vals[id].mean(axis=0, keepdims=True)
                                                                   for id in ids], axis=0), self.transformers)
            # NOTE(review): undo_transforms on std deviations may not be a pure rescaling for all
            # transformer types — confirm intended.
            prob_stds = dc.trans.undo_transforms(np.concatenate([self.pred_vals[id].std(axis=0, keepdims=True)
                                                                 for id in ids], axis=0), self.transformers)
        else:
            class_probs = np.concatenate([dc.trans.undo_transforms(self.pred_vals[id], self.transformers) for id in ids], axis=0)
            prob_stds = None
        # Most probable class per compound and task.
        pred_classes = np.argmax(class_probs, axis=2)
        return (ids, pred_classes, class_probs, prob_stds)

    # ****************************************************************************************
    # class KFoldClassificationPerfData
    def get_real_values(self, ids=None):
        """Returns the real dataset response values as an (ncmpds, ntasks, nclasses) array of
        indicator bits (if nclasses > 2) or an (ncmpds, ntasks) array of binary classes
        (if nclasses == 2), with compound IDs in the same order as in the return from
        get_pred_values() (unless ids is specified).

        Args:
            ids (list of str): Optional list of compound IDs to return values for.

        Returns:
            np.array of shape (ncmpds, ntasks, nclasses): of either indicator bits or a 2D array
            of binary classes
        """
        if ids is None:
            ids = sorted(self.pred_vals.keys())
        if self.num_classes > 2:
            return np.concatenate([self.real_vals[id].reshape((1,-1,self.num_classes)) for id in ids], axis=0)
        else:
            return np.concatenate([self.real_vals[id].reshape((1,-1)) for id in ids], axis=0)

    # ****************************************************************************************
    # class KFoldClassificationPerfData
    def get_weights(self, ids=None):
        """Returns the dataset response weights, as an (ncmpds, ntasks) array in the same ID order
        as get_pred_values() (unless ids is specified).

        Args:
            ids (list of str): Optional list of compound IDs to return values for.

        Returns:
            np.array: (ncmpds, ntasks) array of the dataset response weights, in the same ID
            order as get_pred_values().
        """
        if ids is None:
            ids = sorted(self.pred_vals.keys())
        return np.concatenate([self.weights[id].reshape((1,-1)) for id in ids], axis=0)

    # ****************************************************************************************
    # class KFoldClassificationPerfData
    def compute_perf_metrics(self, per_task=False):
        """Computes the ROC AUC metrics for each task based on the accumulated values, averaged
        over training folds, along with standard deviations of the scores. If per_task is False,
        the scores are averaged over tasks and the overall standard deviation is reported instead.

        Args:
            per_task (bool): True if calculating per-task metrics, False otherwise.

        Returns:
            A tuple (roc_auc_mean, roc_auc_std):
                roc_auc_mean: A numpy array of mean ROC AUC scores for each task, averaged over
                    folds, if per_task is True. Otherwise, a float giving the ROC AUC score
                    averaged over both folds and tasks.
                roc_auc_std: A numpy array of standard deviations over folds of ROC AUC values,
                    if per_task is True. Otherwise, a float giving the overall standard deviation.
        """
        # perf_metrics holds one (ntasks,) score array per fold, appended by accumulate_preds().
        roc_auc_scores = np.stack(self.perf_metrics)
        if per_task:
            roc_auc_mean = np.mean(roc_auc_scores, axis=0)
            roc_auc_std = np.std(roc_auc_scores, axis=0)
        else:
            roc_auc_mean = np.mean(roc_auc_scores.flatten())
            roc_auc_std = np.std(roc_auc_scores.flatten())
        return (roc_auc_mean, roc_auc_std)
# ****************************************************************************************
class SimpleRegressionPerfData(RegressionPerfData):
    """Class with methods for accumulating regression model prediction data from training,
    validation or test sets and computing performance metrics.

    Attributes:
        Set in __init__:
            subset (str): Label of the type of subset of dataset for tracking predictions
            num_cmpds (int): The number of compounds in the dataset
            num_tasks (int): The number of tasks in the dataset
            pred_vals (np.array): Predicted values, set by accumulate_preds()
            pred_stds (np.array): Prediction standard deviations, if provided
            transformers (list of Transformer objects): from input arguments
            real_vals (np.array): The original response column values
    """

    # ****************************************************************************************
    # class SimpleRegressionPerfData
    def __init__(self, model_dataset, transformers, subset, transformed=True):
        """Initialize attributes common to all SimpleRegressionPerfData objects.

        Args:
            model_dataset (ModelDataset object): contains the dataset and related methods
            transformers (list of transformer objects): the transformers used to transform the dataset
            subset (str): Label in ['train', 'valid', 'test', 'full'], indicating the type of
                subset of dataset for tracking predictions
            transformed (bool): True if values to be passed to accumulate_preds() are transformed values

        Raises:
            ValueError: if subset not in ['train','valid','test','full']

        Side effects:
            Sets the attributes listed in the class docstring, plus weights, ids, perf_metrics
            and model_score.
        """
        self.subset = subset
        if subset == 'train':
            dataset = model_dataset.train_valid_dsets[0][0]
        elif subset == 'valid':
            dataset = model_dataset.train_valid_dsets[0][1]
        elif subset == 'test':
            dataset = model_dataset.test_dset
        elif subset == 'full':
            dataset = model_dataset.dataset
        else:
            raise ValueError('Unknown dataset subset type "%s"' % subset)
        self.num_cmpds = dataset.y.shape[0]
        self.num_tasks = dataset.y.shape[1]
        self.weights = dataset.w
        self.ids = dataset.ids
        self.pred_vals = None
        self.pred_stds = None
        self.perf_metrics = []
        self.model_score = None
        # Keep predictions and real values in the same space, either transformed or untransformed.
        if transformed:
            # Predictions passed to accumulate_preds() will be transformed
            self.transformers = transformers
            self.real_vals = dataset.y
        else:
            self.real_vals = dc.trans.undo_transforms(dataset.y, transformers)
            self.transformers = []

    # ****************************************************************************************
    # class SimpleRegressionPerfData
    def accumulate_preds(self, predicted_vals, ids, pred_stds=None):
        """Add training, validation or test set predictions to the data structure where we keep
        track of them.

        Args:
            predicted_vals (np.array): Array of predicted values
            ids (list): List of the compound ids of the dataset
            pred_stds (np.array): Optional np.array of the prediction standard deviations

        Returns:
            float: mean R^2 score over tasks

        Side effects:
            Sets pred_vals (and pred_stds, if given) to the reshaped predictions;
            appends per-task R^2 scores to perf_metrics.
        """
        self.pred_vals = self._reshape_preds(predicted_vals)
        if pred_stds is not None:
            self.pred_stds = self._reshape_preds(pred_stds)
        # Score predictions in untransformed space.
        pred_vals = dc.trans.undo_transforms(self.pred_vals, self.transformers)
        real_vals = self.get_real_values(ids)
        weights = self.get_weights(ids)
        scores = []
        for i in range(self.num_tasks):
            # Mask out zero-weight (missing-response) compounds for this task.
            nzrows = np.where(weights[:,i] != 0)[0]
            task_real_vals = np.squeeze(real_vals[nzrows,i])
            task_pred_vals = np.squeeze(pred_vals[nzrows,i])
            scores.append(r2_score(task_real_vals, task_pred_vals))
        self.perf_metrics.append(np.array(scores))
        return float(np.mean(scores))

    # ****************************************************************************************
    # class SimpleRegressionPerfData
    def get_pred_values(self):
        """Returns the predicted values accumulated over training, with any transformations undone.

        Returns:
            Tuple (ids, vals, stds):
                ids (list): Contains the dataset compound ids
                vals (np.array): Contains (ncmpds, ntasks) array of predictions
                stds (np.array): Contains (ncmpds, ntasks) array of prediction standard
                    deviations, or None if none were accumulated
        """
        vals = dc.trans.undo_transforms(self.pred_vals, self.transformers)
        stds = None
        if self.pred_stds is not None:
            stds = self.pred_stds
            if len(self.transformers) == 1 and (isinstance(self.transformers[0], dc.trans.NormalizationTransformer) or isinstance(self.transformers[0],trans.NormalizationTransformerMissingData)):
                # Untransform the standard deviations, if we can. This is a bit of a hack, but it works for
                # NormalizationTransformer, since the standard deviations used to scale the data are
                # stored in the transformer object.
                # NOTE(review): undoing a normalization on a std would normally multiply by y_stds
                # rather than divide — confirm the units of pred_stds before relying on this.
                y_stds = self.transformers[0].y_stds.reshape((1,-1,1))
                stds = stds / y_stds
        return (self.ids, vals, stds)

    # ****************************************************************************************
    # class SimpleRegressionPerfData
    def get_real_values(self, ids=None):
        """Returns the real dataset response values, with any transformations undone, as an
        (ncmpds, ntasks) array with compounds in the same ID order as in the return from
        get_pred_values().

        Args:
            ids: Ignored for this class

        Returns:
            np.array: Containing the real dataset response values with transformations undone.
        """
        return dc.trans.undo_transforms(self.real_vals, self.transformers)

    # ****************************************************************************************
    # class SimpleRegressionPerfData
    def get_weights(self, ids=None):
        """Returns the dataset response weights as an (ncmpds, ntasks) array.

        Args:
            ids: Ignored for this class

        Returns:
            np.array: Containing the dataset response weights
        """
        return self.weights

    # ****************************************************************************************
    # class SimpleRegressionPerfData
    def compute_perf_metrics(self, per_task=False):
        """Returns the R-squared metrics for each task or averaged over tasks based on the
        accumulated values.

        Args:
            per_task (bool): True if calculating per-task metrics, False otherwise.

        Returns:
            A tuple (r2_score, std):
                r2_score (np.array): An array of scores for each task, if per_task is True.
                    Otherwise, it is a float containing the average R^2 score over tasks.
                std: Always None for this class.
        """
        # Only the first accumulated score set is used; for this single-fold class,
        # accumulate_preds() is expected to have been called exactly once.
        r2_scores = self.perf_metrics[0]
        if per_task or self.num_tasks == 1:
            return (r2_scores, None)
        else:
            return (r2_scores.mean(), None)
# ****************************************************************************************
[docs]
class SimpleClassificationPerfData(ClassificationPerfData):
"""Class with methods for collecting classification model prediction and performance data from single-fold
training and prediction runs.
Attributes:
Set in __init__:
subset (str): Label of the type of subset of dataset for tracking predictions
num_cmps (int): The number of compounds in the dataset
num_tasks (int): The number of tasks in the dataset
pred-vals (dict): The dictionary of prediction results
folds (int): Initialized at zero, flag for determining which k-fold is being assessed
transformers (list of Transformer objects): from input arguments
real_vals (dict): The dictionary containing the origin response column values
class_names (np.array): Assumes the classes are of deepchem index type (e.g. 0,1,2,...)
num_classes (int): The number of classes to predict on
"""
# ****************************************************************************************
# class SimpleClassificationPerfData
def __init__(self, model_dataset, transformers, subset, predict_probs=True, transformed=True):
    """Initialize attributes common to all SimpleClassificationPerfData objects.

    Args:
        model_dataset (ModelDataset object): contains the dataset and related methods
        transformers (list of transformer objects): the transformers used to transform the dataset
        subset (str): Label in ['train', 'valid', 'test', 'full'], indicating the type of subset
            of dataset for tracking predictions
        predict_probs (bool): True if the classifier supports probabilistic predictions,
            False otherwise
        transformed (bool): True if values to be passed to accumulate_preds() are transformed values

    Raises:
        ValueError: if subset not in ['train','valid','test','full']
        NotImplementedError: if predict_probs is not True; non-probabilistic classifiers are
            not supported yet

    Side effects:
        Sets subset, num_cmpds, num_tasks, pred_vals, pred_stds, ids, perf_metrics,
        model_score, transformers, weights, class_indeces, num_classes, real_classes
        and real_vals.
    """
    self.subset = subset
    if subset == 'train':
        dataset = model_dataset.train_valid_dsets[0][0]
    elif subset == 'valid':
        dataset = model_dataset.train_valid_dsets[0][1]
    elif subset == 'test':
        dataset = model_dataset.test_dset
    elif subset == 'full':
        dataset = model_dataset.dataset
    else:
        raise ValueError('Unknown dataset subset type "%s"' % subset)
    # All currently used classifiers generate class probabilities in their predict methods;
    # if in the future we implement a classification algorithm such as kNN that doesn't support
    # probabilistic predictions, the ModelWrapper for that classifier should pass predict_probs=False
    # when constructing this object. When that happens, modify the code in this class to support
    # this option.
    if not predict_probs:
        raise NotImplementedError("Need to add support for non-probabilistic classifiers")
    self.num_cmpds = dataset.y.shape[0]
    # Some datasets provide a 1D response array; treat those as single-task.
    if len(dataset.y.shape) > 1:
        self.num_tasks = dataset.y.shape[1]
    else:
        self.num_tasks = 1
    self.pred_vals = None
    self.pred_stds = None
    self.ids = dataset.ids
    self.perf_metrics = []
    self.model_score = None
    if transformed:
        # Predictions passed to accumulate_preds() will be transformed
        self.transformers = transformers
    else:
        self.transformers = []
    self.weights = dataset.w
    # TODO: Everything down to here is same as in SimpleRegressionPerfData.__init__.
    # TODO: Consider defining a SimplePerfData class with the common stuff in its __init__
    # TODO: method, and doing multiple inheritance so we can call it from here.
    # DeepChem does not currently support arbitrary class names in classification datasets;
    # enforce class indices (0, 1, 2, ...) here. Class count is derived from the full
    # dataset, not the subset, so all subsets agree on num_classes.
    # NOTE: the misspelled attribute name 'class_indeces' is kept as-is, since renaming it
    # would break external code that reads it.
    self.class_indeces = list(set(model_dataset.dataset.y.flatten()))
    self.num_classes = len(self.class_indeces)
    self.real_classes = dataset.y
    # Convert true values to one-hot encoding
    if self.num_classes > 2:
        self.real_vals = np.concatenate([dc.metrics.to_one_hot(dataset.y[:,j], self.num_classes).reshape(-1,1,self.num_classes)
                                         for j in range(self.num_tasks)], axis=1)
    else:
        self.real_vals = dataset.y.reshape((-1,self.num_tasks))
# ****************************************************************************************
# class SimpleClassificationPerfData
[docs]
def accumulate_preds(self, predicted_vals, ids, pred_stds=None):
    """Add training, validation or test set predictions from the current dataset to the data structure
    where we keep track of them.

    Args:
        predicted_vals (np.array): Array of predicted values (class probabilities)

        ids (list): List of the compound ids of the dataset

        pred_stds (np.array): Optional np.array of the prediction standard deviations

    Returns:
        float: Mean over tasks of the per-task ROC AUC scores for these predictions.

    Side effects:
        Updates self.pred_vals, self.pred_stds (if pred_stds is given) and self.perf_metrics
    """
    # Store the reshaped probabilities and keep a local reference for scoring below.
    class_probs = self.pred_vals = self._reshape_preds(predicted_vals)
    if pred_stds is not None:
        self.pred_stds = self._reshape_preds(pred_stds)
    real_vals = self.get_real_values(ids)
    weights = self.get_weights(ids)
    # Break out different predictions for each task, with zero-weight compounds masked out, and compute per-task metrics
    scores = []
    for i in range(self.num_tasks):
        # Rows with nonzero weight are the compounds that actually have a response for task i
        nzrows = np.where(weights[:,i] != 0)[0]
        if self.num_classes > 2:
            # If more than 2 classes, real_vals is indicator matrix (one-hot encoded).
            task_real_vals = np.squeeze(real_vals[nzrows,i,:])
            # Undo any probability transformations before scoring
            task_class_probs = dc.trans.undo_transforms(
                np.squeeze(class_probs[nzrows,i,:]),
                self.transformers)
            # Macro average: unweighted mean of the one-vs-rest AUCs over classes
            scores.append(roc_auc_score(task_real_vals, task_class_probs, average='macro'))
        else:
            # For binary classifier, sklearn metrics functions are expecting single array of 1s and 0s for real_vals_list,
            # and class_probs for class 1 only.
            task_real_vals = np.squeeze(real_vals[nzrows,i])
            task_class_probs = dc.trans.undo_transforms(
                np.squeeze(class_probs[nzrows,i,1]),
                self.transformers)
            scores.append(roc_auc_score(task_real_vals, task_class_probs))
    self.perf_metrics.append(np.array(scores))
    return float(np.mean(scores))
# ****************************************************************************************
# class SimpleClassificationPerfData
[docs]
def get_pred_values(self):
    """Returns the predicted values accumulated over training, with any transformations undone.

    If self.subset is 'train', the accumulated probabilities are averages over the k-1 folds in
    which each compound was part of the training set; otherwise there is a single set of predicted
    probabilities for each validation or test set compound.

    Returns:
        Tuple (ids, pred_classes, class_probs, prob_stds)

        ids (list): Contains the dataset compound ids

        pred_classes (np.array): Contains (ncmpds, ntasks) array of prediction classes

        class_probs (np.array): Contains (ncmpds, ntasks, nclasses) array of predicted class probabilities

        prob_stds (np.array): Contains (ncmpds, ntasks, nclasses) array of standard errors for the class
            probability estimates
    """
    # Undo any transformations applied to the accumulated probabilities
    probs = dc.trans.undo_transforms(self.pred_vals, self.transformers)
    # The predicted class for each compound/task is the one with highest probability
    best_classes = np.argmax(probs, axis=2)
    return (self.ids, best_classes, probs, self.pred_stds)
# ****************************************************************************************
# class SimpleClassificationPerfData
[docs]
def get_real_values(self, ids=None):
    """Returns the real dataset response values as stored at construction time.

    For multiclass (nclasses > 2) data this is an (ncmpds, ntasks, nclasses) array of
    one-hot indicator bits; for binary data it is an (ncmpds, ntasks) array.

    Args:
        ids: Ignored for this class

    Returns:
        np.array: The real response values
    """
    return self.real_vals
# ****************************************************************************************
# class SimpleClassificationPerfData
[docs]
def get_weights(self, ids=None):
    """Returns the dataset response weights.

    Args:
        ids: Ignored for this class; weights are returned in the original dataset order.

    Returns:
        np.array: Containing the dataset response weights
    """
    return self.weights
# ****************************************************************************************
# class SimpleClassificationPerfData
[docs]
def compute_perf_metrics(self, per_task=False):
    """Returns the ROC AUC metrics for each task based on the accumulated predictions.

    If per_task is False and there are multiple tasks, returns the average ROC AUC over tasks.

    Args:
        per_task (bool): Whether to return individual ROC AUC scores for each task

    Returns:
        A tuple (roc_auc, std):

            roc_auc: A numpy array of per-task ROC AUC scores, if per_task is True or there
                is a single task. Otherwise, a float giving the mean ROC AUC score over tasks.

            std: Placeholder for an array of standard deviations. Always None for this class.
    """
    scores = self.perf_metrics[0]
    # Collapse to a single mean value only for the multitask, non-per-task case
    if not per_task and self.num_tasks != 1:
        return (scores.mean(), None)
    return (scores, None)
# ****************************************************************************************
[docs]
class SimpleHybridPerfData(HybridPerfData):
    """Class with methods for accumulating hybrid model prediction data from training,
    validation or test sets and computing performance metrics.

    Attributes:
        Set in __init__:
            subset (str): Label of the type of subset of dataset for tracking predictions

            num_cmpds (int): The number of compounds in the dataset

            num_tasks (int): The number of tasks in the dataset

            pred_vals (np.array): Accumulated predictions; None until accumulate_preds() is called

            transformers (list of Transformer objects): from input arguments

            real_vals (np.array): The original response column values

            is_ki (bool): Whether the dose-response activity is Ki rather than IC50

            ki_convert_ratio (float): Ratio used to convert Ki values into IC50s
    """
    # ****************************************************************************************
    # class SimpleHybridPerfData
    def __init__(self, model_dataset, transformers, subset, is_ki, ki_convert_ratio=None, transformed=True):
        """Initialize the attributes of a SimpleHybridPerfData object

        Args:
            model_dataset (ModelDataset object): contains the dataset and related methods

            transformers (list of transformer objects): contains the list of transformers used to
                transform the dataset

            subset (str): Label in ['train', 'valid', 'test', 'full'], indicating the type of subset
                of dataset for tracking predictions

            is_ki (bool): Whether the dose-response activity is Ki or IC50; it determines how the
                activities are converted into single concentration activities.

            ki_convert_ratio (float): If the given activity is pKi, a ratio to convert Ki into IC50 is
                needed. It can be the ratio of concentration and Kd of the radioligand in a competitive
                binding assay, or the concentration of the substrate and Michaelis constant (Km) of an
                enzymatic inhibition assay.

            transformed (bool): True if values to be passed to the accumulate_preds function are
                transformed values

        Raises:
            ValueError: if subset not in ['train','valid','test','full'], subset not supported

        Side effects:
            Sets the attributes listed in the class docstring.
        """
        self.subset = subset
        if subset == 'train':
            dataset = model_dataset.train_valid_dsets[0][0]
        elif subset == 'valid':
            dataset = model_dataset.train_valid_dsets[0][1]
        elif subset == 'test':
            dataset = model_dataset.test_dset
        elif subset == 'full':
            dataset = model_dataset.dataset
        else:
            raise ValueError('Unknown dataset subset type "%s"' % subset)
        self.num_cmpds = dataset.y.shape[0]
        # Hybrid datasets carry two response columns (activity and concentration);
        # column 1 is NaN for rows holding dose-response (pKi/pIC50) activities.
        self.num_tasks = dataset.y.shape[1]
        self.weights = dataset.w
        self.ids = dataset.ids
        self.pred_vals = None
        self.pred_stds = None
        self.perf_metrics = []
        self.model_score = None
        self.is_ki = is_ki
        self.ki_convert_ratio = ki_convert_ratio
        if transformed:
            # Predictions passed to accumulate_preds() will be transformed
            self.transformers = transformers
            self.real_vals = dataset.y
        else:
            # Store real values already untransformed and keep no transformers, so
            # get_real_values() returns original-unit values directly.
            self.real_vals = transformers[0].untransform(dataset.y)
            self.transformers = []

    # ****************************************************************************************
    # class SimpleHybridPerfData
    def accumulate_preds(self, predicted_vals, ids, pred_stds=None):
        """Add training, validation or test set predictions to the data structure
        where we keep track of them.

        Args:
            predicted_vals (np.array): Array of predicted values

            ids (list): List of the compound ids of the dataset

            pred_stds (np.array): Optional np.array of the prediction standard deviations

        Returns:
            float: Mean of the R^2 scores computed for the dose-response and single-concentration
                rows. NaN if no rows have nonzero weight.

        Side effects:
            Updates self.pred_vals, self.pred_stds (if given) and self.perf_metrics
        """
        self.pred_vals = self._reshape_preds(predicted_vals)
        if pred_stds is not None:
            self.pred_stds = self._reshape_preds(pred_stds)
        # NOTE(review): predictions are scored as accumulated while real values are
        # untransformed by get_real_values(); the untransform of predictions below was
        # deliberately disabled — confirm the transformers are no-ops for this column.
        # pred_vals = self.transformers[0].untransform(self.pred_vals, isreal=False)
        pred_vals = self.pred_vals
        real_vals = self.get_real_values(ids)
        weights = self.get_weights(ids)
        scores = []
        # Rows with NaN in column 1 hold dose-response (Ki/IC50) activities; the rest
        # hold single-concentration binding/inhibition measurements.
        pos_ki = np.where(np.isnan(real_vals[:, 1]))[0]
        pos_bind = np.where(~np.isnan(real_vals[:, 1]))[0]
        # score for pKi/IC50: restrict to rows with nonzero weight in the activity column
        nzrows = np.where(weights[:, 0] != 0)[0]
        rowki = np.intersect1d(nzrows, pos_ki)
        rowbind = np.intersect1d(nzrows, pos_bind)
        ki_real_vals = np.squeeze(real_vals[rowki,0])
        ki_pred_vals = np.squeeze(pred_vals[rowki,0])
        bind_real_vals = np.squeeze(real_vals[rowbind,0])
        bind_pred_vals = np.squeeze(pred_vals[rowbind,0])
        if len(rowki) > 0:
            scores.append(r2_score(ki_real_vals, ki_pred_vals))
            if len(rowbind) > 0:
                scores.append(r2_score(bind_real_vals, bind_pred_vals))
            else:
                # If all values are dose response activities, duplicate the r2_score above.
                scores.append(scores[0])
        elif len(rowbind) > 0:
            # All values are single concentration activities.
            scores.append(r2_score(bind_real_vals, bind_pred_vals))
            scores.append(scores[0])
        self.perf_metrics.append(np.array(scores))
        return float(np.mean(scores))

    # ****************************************************************************************
    # class SimpleHybridPerfData
    def _predict_binding(self, activity, conc):
        """Predict measurements of fractional binding/inhibition of target receptors by a compound with the
        given activity, in -Log scale, at the specified concentration in nM.

        If the given activity is pKi, a ratio to convert Ki into IC50 is needed. It can be the ratio of
        concentration and Kd of the radioligand in a competitive binding assay, or the concentration of
        the substrate and Michaelis constant (Km) of an enzymatic inhibition assay.

        Args:
            activity (float or np.array): Activity value(s) in -Log molar scale (pKi or pIC50)

            conc (float or np.array): Concentration(s) in nM

        Returns:
            float or np.array: Predicted fraction bound/inhibited, 1/(1 + IC50/conc)

        Raises:
            ValueError: if is_ki is True but no ki_convert_ratio was provided
        """
        if self.is_ki:
            if self.ki_convert_ratio is None:
                # ValueError (a subclass of Exception) keeps existing generic handlers working
                raise ValueError("Ki converting ratio is missing. Cannot convert Ki into IC50")
            # Convert from -Log molar scale to nM, then apply the Cheng-Prusoff-style ratio
            Ki = 10**(9-activity)
            IC50 = Ki * (1 + self.ki_convert_ratio)
        else:
            IC50 = 10**(9-activity)
        pred_frac = 1.0/(1.0 + IC50/conc)
        return pred_frac

    # ****************************************************************************************
    # class SimpleHybridPerfData
    def get_pred_values(self):
        """Returns the predicted values accumulated over training, with any transformations undone. Returns
        a tuple (ids, values, stds), where ids is the list of compound IDs, values is a (ncmpds, ntasks) array
        of predictions, and stds is always None for this class.

        Returns:
            Tuple (ids, vals, stds)

            ids (list): Contains the dataset compound ids

            vals (np.array): Contains (ncmpds, ntasks) array of predictions

            stds: Always None for this class
        """
        vals = self.pred_vals
        # NOTE: conversion of single-concentration rows to fractional binding is
        # intentionally not applied here:
        # pos_bind = np.where(~np.isnan(self.real_vals[:,1]))[0]
        # vals[pos_bind, 0] = self._predict_binding(vals[pos_bind, 0], self.real_vals[pos_bind, 1])
        stds = None
        return (self.ids, vals, stds)

    # ****************************************************************************************
    # class SimpleHybridPerfData
    def get_real_values(self, ids=None):
        """Returns the real dataset response values, with any transformations undone, as an (ncmpds, ntasks)
        array with compounds in the same ID order as in the return from get_pred_values().

        Args:
            ids: Ignored for this class

        Returns:
            np.array: Containing the real dataset response values with transformations undone.
        """
        # When the object was constructed with transformed=False, self.transformers is
        # empty and real_vals were already untransformed in __init__; indexing [0]
        # unconditionally would raise IndexError in that case.
        if self.transformers:
            return self.transformers[0].untransform(self.real_vals)
        return self.real_vals

    # ****************************************************************************************
    # class SimpleHybridPerfData
    def get_weights(self, ids=None):
        """Returns the dataset response weights as an (ncmpds, ntasks) array

        Args:
            ids: Ignored for this class

        Returns:
            np.array: Containing the dataset response weights
        """
        return self.weights

    # ****************************************************************************************
    # class SimpleHybridPerfData
    def compute_perf_metrics(self, per_task=False):
        """Returns the R-squared metrics for each task or averaged over tasks based on the accumulated values

        Args:
            per_task (bool): True if calculating per-task metrics, False otherwise.

        Returns:
            A tuple (r2_score, std):

                r2_score (np.array): An array of scores for each task, if per_task is True.
                    Otherwise, it is a float containing the average R^2 score over tasks.

                std: Always None for this class.
        """
        r2_scores = self.perf_metrics[0]
        if per_task or self.num_tasks == 1:
            return (r2_scores, None)
        else:
            return (r2_scores.mean(), None)
# ****************************************************************************************
[docs]
class EpochManager:
    """Manages lists of PerfData objects

    This class manages lists of PerfData objects as well as variables related to iteratively
    training a model over several epochs. For the sake of backwards compatibility, this class
    sets several variables directly on the given ModelWrapper.

    Attributes:
        Set in __init__:
            _subsets (dict): Must contain the keys 'train', 'valid', 'test'. The values
                are used as subsets when calling create_perf_data.

            _model_choice_score_type (str): Passed into PerfData.model_choice_score

            _log (logger): This is from wrapper.log

            _should_stop (bool): True when training has satisfied the stopping conditions.
                Either it has reached the max number of epochs or has exceeded
                early_stopping_patience

            wrapper (ModelWrapper): The model wrapper where this object is being used.

            _new_best_valid_score (function): This function takes no arguments and is called
                whenever a new best validation score is achieved.
    """
    # ****************************************************************************************
    # class EpochManager
    def __init__(self, wrapper, subsets=None, production=False, **kwargs):
        """Initialize EpochManager

        Args:
            wrapper (ModelWrapper): The ModelWrapper that's doing the training

            subsets (dict): Must contain the keys 'train', 'valid', 'test'. The values
                are used as subsets when calling create_perf_data. Defaults to
                {'train': 'train', 'valid': 'valid', 'test': 'test'}.

            production (bool): True if this is running in production mode.

            kwargs (dict): Additional keyword args are passed to create_perf_data. The
                subset argument should not be passed.

        Side effects:
            Creates the following attributes in wrapper:
                best_epoch

                best_valid_score

                train_epoch_perfs

                valid_epoch_perfs

                test_epoch_perfs

                train_epoch_perf_stds

                valid_epoch_perf_stds

                test_epoch_perf_stds

                model_choice_scores

                early_stopping_min_improvement

                early_stopping_patience

                train_perf_data

                valid_perf_data

                test_perf_data
        """
        # Build the default subset mapping per call; a mutable dict default argument
        # would be a single object shared across all EpochManager instances.
        if subsets is None:
            subsets = {'train': 'train', 'valid': 'valid', 'test': 'test'}
        params = wrapper.params
        self.production = production
        self._subsets = subsets
        self._model_choice_score_type = params.model_choice_score_type
        self._log = wrapper.log
        self._should_stop = False
        self.wrapper = wrapper
        # Default no-op callback; replaced via on_new_best_valid()
        self._new_best_valid_score = lambda: False

        self.wrapper.best_epoch = 0
        self.wrapper.best_valid_score = None
        self.wrapper.train_epoch_perfs = np.zeros(params.max_epochs)
        self.wrapper.valid_epoch_perfs = np.zeros(params.max_epochs)
        self.wrapper.test_epoch_perfs = np.zeros(params.max_epochs)
        self.wrapper.train_epoch_perf_stds = np.zeros(params.max_epochs)
        self.wrapper.valid_epoch_perf_stds = np.zeros(params.max_epochs)
        self.wrapper.test_epoch_perf_stds = np.zeros(params.max_epochs)
        self.wrapper.model_choice_scores = np.zeros(params.max_epochs)
        self.wrapper.early_stopping_min_improvement = params.early_stopping_min_improvement
        self.wrapper.early_stopping_patience = params.early_stopping_patience

        # One PerfData object per epoch for each subset
        self.wrapper.train_perf_data = []
        self.wrapper.valid_perf_data = []
        self.wrapper.test_perf_data = []
        for _ in range(params.max_epochs):
            self.wrapper.train_perf_data.append(
                create_perf_data(subset=self._subsets['train'], **kwargs))
            self.wrapper.valid_perf_data.append(
                create_perf_data(subset=self._subsets['valid'], **kwargs))
            self.wrapper.test_perf_data.append(
                create_perf_data(subset=self._subsets['test'], **kwargs))

    # ****************************************************************************************
    # class EpochManager
    def should_stop(self):
        """Returns True when the training loop should stop

        Returns:
            bool: True when the training loop should stop
        """
        return self._should_stop

    # ****************************************************************************************
    # class EpochManager
    def update_epoch(self, ei, train_dset=None, valid_dset=None, test_dset=None):
        """Update training state after an epoch

        This function updates train/valid/test_perf_data. Call this function once
        per epoch. Call self.should_stop() after calling this function to see if you should
        exit the training loop.

        Subsets with None arguments will be ignored

        Args:
            ei (int): The epoch index

            train_dset (dc.data.Dataset): The train dataset

            valid_dset (dc.data.Dataset): The valid dataset. Providing this argument updates
                best_valid_score and _should_stop

            test_dset (dc.data.Dataset): The test dataset

        Returns:
            list: A list of performance values for the provided datasets.

        Side effects:
            This function updates self._should_stop
        """
        train_perf = self.update(ei, 'train', train_dset)
        valid_perf = self.update(ei, 'valid', valid_dset)
        test_perf = self.update(ei, 'test', test_dset)
        # Subsets that were not provided return None and are dropped from the result
        return [p for p in [train_perf, valid_perf, test_perf] if p is not None]

    # ****************************************************************************************
    # class EpochManager
    def accumulate(self, ei, subset, dset):
        """Accumulate predictions

        Makes predictions, accumulates predictions and calculates the performance metric. Calls
        PerfData.accumulate_preds belonging to the epoch, subset, and given dataset.

        Args:
            ei (int): Epoch index

            subset (str): Which subset, should be train, valid, or test.

            dset (dc.data.Dataset): Calculates the performance for the given dset

        Returns:
            float: Performance metric for the given dset.
        """
        # self._make_pred must have been installed via set_make_pred() before this call
        pred = self._make_pred(dset)
        perf = getattr(self.wrapper, f'{subset}_perf_data')[ei].accumulate_preds(pred, dset.ids)
        return perf

    # ****************************************************************************************
    # class EpochManager
    def compute(self, ei, subset):
        """Computes performance metrics

        This calls PerfData.compute_perf_metrics and saves the result in
        f'{subset}_epoch_perfs'

        Args:
            ei (int): Epoch index

            subset (str): Which subset to compute_perf_metrics. Should be train, valid, or test

        Returns:
            None
        """
        getattr(self.wrapper, f'{subset}_epoch_perfs')[ei], _ = \
            getattr(self.wrapper, f'{subset}_perf_data')[ei].compute_perf_metrics()

    # ****************************************************************************************
    # class EpochManager
    def update_valid(self, ei):
        """Checks validation score

        Checks validation performance of the given epoch index. Updates self._should_stop, checks
        on early stopping conditions, calls self._new_best_valid_score() when necessary.

        Args:
            ei (int): Epoch index

        Returns:
            None

        Side effects:
            Updates self._should_stop when it's time to exit the training loop.
        """
        valid_score = self.wrapper.valid_perf_data[ei].model_choice_score(self._model_choice_score_type)
        self.wrapper.model_choice_scores[ei] = valid_score
        if self.wrapper.best_valid_score is None or self.production:
            # If we're in production mode, every epoch is the new best epoch
            self._new_best_valid_score()
            self.wrapper.best_valid_score = valid_score
            self.wrapper.best_epoch = ei
            self._log.info(f"Total score for epoch {ei} is {valid_score:.3}")
        elif valid_score - self.wrapper.best_valid_score > self.wrapper.early_stopping_min_improvement:
            # New maximum validation score that beats the old one by at least the required margin
            self._new_best_valid_score()
            self.wrapper.best_valid_score = valid_score
            self.wrapper.best_epoch = ei
            self._log.info(f"*** Total score for epoch {ei} is {valid_score:.3}, is new maximum")
        elif ei - self.wrapper.best_epoch > self.wrapper.early_stopping_patience:
            # Too many epochs without improvement; signal the training loop to stop
            self._log.info(f"No improvement after {self.wrapper.early_stopping_patience} epochs, stopping training")
            self._should_stop = True

    # ****************************************************************************************
    # class EpochManager
    def update(self, ei, subset, dset=None):
        """Update training state

        Updates the training state for a given subset and epoch index with the given dataset.

        Args:
            ei (int): Epoch index.

            subset (str): Should be train, valid, test

            dset (dc.data.Dataset): Updates using this dset

        Returns:
            perf (float): the performance of the given dset, or None if dset is None.
        """
        if dset is None:
            return None
        perf = self.accumulate(ei, subset, dset)
        self.compute(ei, subset)
        # Only the validation subset drives best-score tracking and early stopping
        if subset == 'valid':
            self.update_valid(ei)
        return perf

    # ****************************************************************************************
    # class EpochManager
    def set_make_pred(self, functional):
        """Sets the function used to make predictions

        Sets the function used to make predictions. This must be called before invoking
        self.update and self.accumulate

        Args:
            functional (function): This function takes one argument, a dc.data.Dataset, and
                returns an array of predictions for that dset. This function is called
                when updating the training state after a given epoch.

        Returns:
            None

        Side effects:
            Saves the functional as self._make_pred
        """
        self._make_pred = functional

    # ****************************************************************************************
    # class EpochManager
    def on_new_best_valid(self, functional):
        """Sets the function called when a new best validation score is achieved

        Saves the function called when there's a new best validation score.

        Args:
            functional (function): This function takes no arguments and returns nothing. This
                function is called when there's a new best validation score. This can be used
                to tell the ModelWrapper to save the model.

        Returns:
            None

        Side effects:
            Saves the _new_best_valid_score function.
        """
        self._new_best_valid_score = functional
# ****************************************************************************************
[docs]
class EpochManagerKFold(EpochManager):
    """Epoch manager variant for k-fold cross-validation training.

    This is necessary because, unlike EpochManager, this manager also records the metric
    standard deviations in the wrapper's f'{subset}_epoch_perf_stds' arrays.
    """
    # ****************************************************************************************
    # class EpochManagerKFold
    def compute(self, ei, subset):
        """Calls PerfData.compute_perf_metrics()

        This differs from EpochManager.compute in that it also saves the standard deviations
        into f'{subset}_epoch_perf_stds'

        Args:
            ei (int): Epoch index

            subset (str): Should be train, valid, test.

        Returns:
            None
        """
        perf_data = getattr(self.wrapper, f'{subset}_perf_data')[ei]
        mean_metric, metric_std = perf_data.compute_perf_metrics()
        getattr(self.wrapper, f'{subset}_epoch_perfs')[ei] = mean_metric
        getattr(self.wrapper, f'{subset}_epoch_perf_stds')[ei] = metric_std