Source code for pipeline.hyper_perf_plots

"""Functions for visualizing hyperparameter performance. These functions work with
a dataframe of model performance metrics and hyperparameter specifications from
compare_models.py. For models on the tracker, use get_multitask_perf_from_tracker().
For models in the file system, use get_filesystem_perf_results().
By Amanda P. 7/19/2022
"""
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Create an array with the colors you want to use
colors = ["#7682A4","#A7DDD8","#373C50","#694691","#BE2369","#EB1E23","#6EC8BE","#FFC30F",]
# Set your custom color palette
pal=sns.color_palette(colors)
sns.set_palette(pal)

regselmets = [
    'r2_score',
    'mae_score',
    'rms_score',
]
classselmets = [
    'roc_auc_score',
    'prc_auc_score',
    'precision',
    'recall_score',
    'npv',
    'accuracy_score',
    'kappa',
    'matthews_cc',
    'bal_accuracy',
]

def get_score_types():
    """Helper function to show score type choices."""
    print(classselmets)
    print(regselmets)
def _prep_perf_df(df):
    """This function splits columns that contain lists into individual columns to use for plotting later.

    Args:
        df (pd.DataFrame): A dataframe containing model performances from a hyperparameter
            search. Best practice is to use get_multitask_perf_from_tracker() or
            get_filesystem_perf_results().

    Returns:
        perf_track_df (pd.DataFrame): a new df with modified and extra columns.
    """
    perf_track_df = df.copy()
    if 'NN' in perf_track_df.model_type.unique():
        # Use the first dropout value as a representative scalar for plotting.
        perf_track_df['plot_dropout'] = perf_track_df.dropouts.astype(str).str.strip('[]').str.split(pat=',', n=1, expand=True)[0]
        perf_track_df['plot_dropout'] = perf_track_df.plot_dropout.astype(float)
        # Expand the layer_sizes list into one temporary column per layer.
        perf_track_df['layer_sizes'] = perf_track_df.layer_sizes.astype(str).str.strip('[]')
        tmp = perf_track_df.layer_sizes.str.split(pat=',', expand=True).astype(float)
        n = len(tmp.columns)
        # Build one dummy column name per layer, so networks with any number of layers work.
        cols = [f'dummy_nodes_{i + 1}' for i in range(n)]
        perf_track_df[cols] = tmp
        # num_layers = width of the widest network minus the layers missing in this row.
        perf_track_df['num_layers'] = n - perf_track_df[cols].isna().sum(axis=1)
        # Fill missing layers with 1 so they do not affect the row-wise product of
        # layer sizes, which is used as the single size measure num_nodes.
        perf_track_df[cols] = perf_track_df[cols].fillna(value=1).astype(int)
        perf_track_df['num_nodes'] = perf_track_df[cols].product(axis=1)
        perf_track_df = perf_track_df.drop(columns=cols)
        # NN-specific columns are meaningless for other model types.
        perf_track_df.loc[perf_track_df.model_type != "NN", 'layer_sizes'] = np.nan
        perf_track_df.loc[perf_track_df.model_type != "NN", 'num_layers'] = np.nan
        perf_track_df.loc[perf_track_df.model_type != "NN", 'num_nodes'] = np.nan
        perf_track_df.loc[perf_track_df.model_type != "NN", 'plot_dropout'] = np.nan
    return perf_track_df
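# A minimal sketch of what _prep_perf_df derives from the NN list-valued columns;
# the toy values below are hypothetical, not from a real search. A layer_sizes
# entry of "[64, 16]" yields num_layers=2 and num_nodes=64*16=1024, and the first
# dropout value becomes plot_dropout. Call _example_prep_perf_df() to try it.
def _example_prep_perf_df():
    toy = pd.DataFrame({
        'model_type': ['NN', 'RF'],
        'dropouts': ['[0.1, 0.1]', np.nan],
        'layer_sizes': ['[64, 16]', np.nan],
    })
    # NN row -> plot_dropout=0.1, num_layers=2, num_nodes=1024; RF row -> NaN.
    return _prep_perf_df(toy)[['model_type', 'plot_dropout', 'num_layers', 'num_nodes']]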
def plot_train_valid_test_scores(df, scoretype='r2_score'):
    """This function plots kde and line plots of performance scores by partition.

    Args:
        df (pd.DataFrame): A dataframe containing model performances from a hyperparameter
            search. Best practice is to use get_multitask_perf_from_tracker() or
            get_filesystem_perf_results().

        scoretype (str): the score type you want to use. Valid options can be found in
            hpp.classselmets or hpp.regselmets.
    """
    sns.set_context('poster')
    perf_track_df = df.copy().reset_index(drop=True)
    plot_df = perf_track_df[[f"best_train_{scoretype}", f"best_valid_{scoretype}", f"best_test_{scoretype}"]]
    # Turn off this sorting if you have a large number of models; it can be slow.
    plot_df = plot_df.sort_values(f"best_valid_{scoretype}")
    fig, ax = plt.subplots(1, 2, figsize=(26, 8))
    sns.kdeplot(perf_track_df[f'best_train_{scoretype}'], label="train", ax=ax[0])
    sns.kdeplot(perf_track_df[f'best_valid_{scoretype}'], label="valid", ax=ax[0])
    sns.kdeplot(perf_track_df[f'best_test_{scoretype}'], label="test", ax=ax[0])
    ax[0].set_xlabel(f'{scoretype}s')
    ax[0].legend(loc="upper left")
    # One line per model across train/valid/test, ordered by validation score.
    ax[1].plot(plot_df.T)
    ax[1].set_ylim(plot_df.min().min() - .1, 1)
    fig.suptitle("Model performance by partition")
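# Hypothetical usage sketch: results_df stands in for the dataframe returned by
# get_filesystem_perf_results() or get_multitask_perf_from_tracker(); the
# best_{train,valid,test}_<scoretype> columns are assumed to be present.
def _example_partition_plot(results_df):
    get_score_types()  # print the valid regression and classification score types
    plot_train_valid_test_scores(results_df, scoretype='r2_score')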
### The following 3 plot functions were originally written by Amanda M.
### A combined usage sketch follows plot_xg_perf below.
def plot_rf_perf(df, scoretype='r2_score', subset='valid'):
    """This function plots scatterplots of performance scores based on their RF hyperparameters.

    Args:
        df (pd.DataFrame): A dataframe containing model performances from a hyperparameter
            search. Best practice is to use get_multitask_perf_from_tracker() or
            get_filesystem_perf_results().

        scoretype (str): the score type you want to use. Valid options can be found in
            hpp.classselmets or hpp.regselmets.

        subset (str): the subset of scores you'd like to plot: 'train', 'valid' or 'test'.
    """
    sns.set_context('poster')
    perf_track_df = df.copy().reset_index(drop=True)
    plot_df = perf_track_df[perf_track_df.model_type == 'RF']
    winnertype = f'best_{subset}_{scoretype}'
    if len(plot_df) > 0:
        feat1 = 'rf_max_features'
        feat2 = 'rf_max_depth'
        feat3 = 'rf_estimators'
        hue = feat3
        plot_df = plot_df.sort_values([feat3, feat1, feat2])
        # Combine the two x-axis hyperparameters into a single categorical label.
        plot_df[f'{feat1}/{feat2}'] = ['%s / %s' % (mf, md) for mf, md in zip(plot_df[feat1], plot_df[feat2])]
        with sns.axes_style("whitegrid"):
            fig = plt.figure(figsize=(40, 15))
            ax1 = fig.add_subplot(111)
            sns.scatterplot(x=f'{feat1}/{feat2}', y=winnertype, hue=hue,
                            palette=sns.cubehelix_palette(len(plot_df[hue].unique())),
                            data=plot_df, ax=ax1)
            plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
            plt.xticks(rotation=30, ha='right')
            plt.title('RF model performance')
    else:
        print("There are no RF models in this set.")
def plot_nn_perf(df, scoretype='r2_score', subset='valid'):
    """This function plots scatterplots of performance scores based on their NN hyperparameters.

    Args:
        df (pd.DataFrame): A dataframe containing model performances from a hyperparameter
            search. Best practice is to use get_multitask_perf_from_tracker() or
            get_filesystem_perf_results().

        scoretype (str): the score type you want to use. Valid options can be found in
            hpp.classselmets or hpp.regselmets.

        subset (str): the subset of scores you'd like to plot: 'train', 'valid' or 'test'.
    """
    sns.set_context('poster')
    perf_track_df = _prep_perf_df(df).reset_index(drop=True)
    plot_df = perf_track_df[perf_track_df.model_type == 'NN']
    winnertype = f'best_{subset}_{scoretype}'
    if len(plot_df) > 0:
        feat1 = 'learning_rate'
        feat2 = 'plot_dropout'
        feat3 = 'layer_sizes'
        hue = feat3
        plot_df = plot_df.sort_values([feat3, feat1, feat2])
        # Combine the two x-axis hyperparameters into a single categorical label.
        plot_df[f'{feat1}/{feat2}'] = ['%s / %s' % (lr, dr) for lr, dr in zip(plot_df[feat1], plot_df[feat2])]
        with sns.axes_style("whitegrid"):
            fig = plt.figure(figsize=(40, 15))
            ax1 = fig.add_subplot(111)
            sns.scatterplot(x=f'{feat1}/{feat2}', y=winnertype, hue=hue,
                            palette=sns.cubehelix_palette(len(plot_df[hue].unique())),
                            data=plot_df, ax=ax1)
            plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
            plt.xticks(rotation=30, ha='right')
            plt.title('NN model performance')
    else:
        print("There are no NN models in this set.")
def plot_xg_perf(df, scoretype='r2_score', subset='valid'):
    """This function plots scatterplots of performance scores based on their XGBoost hyperparameters.

    Args:
        df (pd.DataFrame): A dataframe containing model performances from a hyperparameter
            search. Best practice is to use get_multitask_perf_from_tracker() or
            get_filesystem_perf_results().

        scoretype (str): the score type you want to use. Valid options can be found in
            hpp.classselmets or hpp.regselmets.

        subset (str): the subset of scores you'd like to plot: 'train', 'valid' or 'test'.
    """
    sns.set_context('poster')
    perf_track_df = df.copy().reset_index(drop=True)
    plot_df = perf_track_df[perf_track_df.model_type == 'xgboost']
    winnertype = f'best_{subset}_{scoretype}'
    if len(plot_df) > 0:
        feat1 = 'xgb_learning_rate'
        feat2 = 'xgb_gamma'
        hue = feat2
        plot_df = plot_df.sort_values([feat1, feat2])
        with sns.axes_style("whitegrid"):
            fig = plt.figure(figsize=(40, 15))
            ax1 = fig.add_subplot(111)
            sns.scatterplot(x=feat1, y=winnertype, hue=hue,
                            palette=sns.cubehelix_palette(len(plot_df[hue].unique())),
                            data=plot_df, ax=ax1)
            plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
            plt.xticks(rotation=30, ha='right')
            plt.title('XGBoost model performance')
    else:
        print('There are no XGBoost models in this set.')
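# Hypothetical usage sketch for the three per-algorithm scatterplots above.
# Each function prints a message and skips plotting when its model type is
# absent, so it is safe to call all three on a mixed hyperparameter search.
def _example_model_type_plots(results_df, scoretype='r2_score'):
    plot_rf_perf(results_df, scoretype=scoretype, subset='valid')
    plot_nn_perf(results_df, scoretype=scoretype, subset='valid')
    plot_xg_perf(results_df, scoretype=scoretype, subset='valid')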
def plot_rf_nn_xg_perf(df, scoretype='r2_score', subset='valid'):
    """This function plots boxplots of performance scores based on their hyperparameters,
    including RF, NN and XGBoost parameters as well as feature types, model types and
    ECFP radius.

    Args:
        df (pd.DataFrame): A dataframe containing model performances from a hyperparameter
            search. Best practice is to use get_multitask_perf_from_tracker() or
            get_filesystem_perf_results().

        scoretype (str): the score type you want to use. Valid options can be found in
            hpp.classselmets or hpp.regselmets.

        subset (str): the subset of scores you'd like to plot: 'train', 'valid' or 'test'.
    """
    sns.set_context('paper')
    perf_track_df = _prep_perf_df(df).reset_index(drop=True)
    winnertype = f'best_{subset}_{scoretype}'
    # NN hyperparameters
    feat1 = 'plot_dropout'; feat2 = 'learning_rate'; feat3 = 'num_nodes'; feat4 = 'num_layers'
    # RF hyperparameters
    feat5 = 'rf_max_depth'; feat6 = 'rf_max_features'; feat7 = 'rf_estimators'
    # XGBoost hyperparameters
    feat8 = 'xgb_gamma'; feat9 = 'xgb_learning_rate'
    # General model settings
    feat10 = 'features'; feat11 = 'model_type'; feat13 = 'ecfp_radius'
    plotdf2 = perf_track_df
    fig, ax = plt.subplots(4, 3, figsize=(16, 12))
    if 'NN' in perf_track_df.model_type.unique():
        sns.boxplot(x=feat1, y=winnertype, palette=sns.cubehelix_palette(len(plotdf2[feat1].unique()), rot=0, start=0.40), data=plotdf2, ax=ax[0, 0])
        ax[0, 0].tick_params(rotation=0)
        ax[0, 0].set_xlabel('NN dropouts')
        sns.boxplot(x=feat2, y=winnertype, palette=sns.cubehelix_palette(len(plotdf2[feat2].unique()), rot=0, start=0.40), data=plotdf2, ax=ax[0, 1])
        ax[0, 1].tick_params(rotation=30)
        ax[0, 1].set_xlabel('NN learning rate')
        # num_nodes is NaN for non-NN rows; keep only rows with a real node count.
        plotdf = perf_track_df[perf_track_df[feat3] > 0]
        sns.boxplot(x=feat3, y=winnertype, palette=sns.cubehelix_palette(len(plotdf[feat3].unique()), rot=0, start=0.40), data=plotdf, ax=ax[0, 2])
        ax[0, 2].tick_params(rotation=30)
        ax[0, 2].set_xlabel('NN number of parameters in hidden layers')
        sns.boxplot(x=feat4, y=winnertype, palette=sns.cubehelix_palette(len(plotdf2[feat4].unique()), rot=0, start=0.40), data=plotdf2, ax=ax[1, 0])
        ax[1, 0].tick_params(rotation=0)
        ax[1, 0].set_xlabel('NN number of layers')
    if 'xgboost' in perf_track_df.model_type.unique():
        sns.boxplot(x=feat8, y=winnertype, palette=sns.cubehelix_palette(len(plotdf2[feat8].unique()), rot=0, start=2.75), data=plotdf2, ax=ax[1, 1])
        ax[1, 1].tick_params(rotation=0)
        ax[1, 1].set_xlabel('XGBoost gamma')
        sns.boxplot(x=feat9, y=winnertype, palette=sns.cubehelix_palette(len(plotdf2[feat9].unique()), rot=0, start=2.75), data=plotdf2, ax=ax[1, 2])
        ax[1, 2].tick_params(rotation=0)
        ax[1, 2].set_xlabel('XGBoost learning rate')
    if 'RF' in perf_track_df.model_type.unique():
        sns.boxplot(x=plotdf2.loc[~plotdf2[feat7].isna(), feat7].astype(int), y=winnertype, palette=sns.cubehelix_palette(len(plotdf2[feat7].unique()), rot=0, start=2.00), data=plotdf2, ax=ax[2, 0])
        ax[2, 0].tick_params(rotation=0)
        ax[2, 0].set_xlabel('RF number of trees')
        try:
            sns.boxplot(x=plotdf2.loc[~plotdf2[feat5].isna(), feat5].astype(int), y=winnertype, palette=sns.cubehelix_palette(len(plotdf2[feat5].unique()), rot=0, start=2.00), data=plotdf2, ax=ax[2, 1])
            ax[2, 1].tick_params(rotation=0)
        except Exception:
            # rf_max_depth is not recorded for every search; leave the panel empty.
            pass
        ax[2, 1].set_xlabel('RF max depth')
        sns.boxplot(x=plotdf2.loc[~plotdf2[feat6].isna(), feat6].astype(int), y=winnertype, palette=sns.cubehelix_palette(len(plotdf2[feat6].unique()), rot=0, start=2.00), data=plotdf2, ax=ax[2, 2])
        ax[2, 2].tick_params(rotation=0)
        ax[2, 2].set_xlabel('RF max features per node')
    # General settings
    plotdf2 = plotdf2.sort_values(feat10)
    sns.boxplot(x=feat10, y=winnertype, palette=sns.cubehelix_palette(len(plotdf2[feat10].unique()), rot=60, start=0.20), data=plotdf2, ax=ax[3, 0])
    ax[3, 0].tick_params(rotation=0)
    ax[3, 0].set_xlabel('Featurization type')
    ax[3, 0].set_xticklabels(ax[3, 0].get_xticklabels(), rotation=30, ha='right', rotation_mode='anchor')
    sns.boxplot(x=feat11, y=winnertype, palette=sns.cubehelix_palette(len(plotdf2[feat11].unique()), rot=60, start=0.20), data=plotdf2, ax=ax[3, 1])
    ax[3, 1].tick_params(rotation=0)
    ax[3, 1].set_xlabel('Model type')
    if 'ecfp_radius' in perf_track_df.columns:
        sns.boxplot(x=feat13, y=winnertype, palette=sns.cubehelix_palette(len(plotdf2[feat13].unique()), rot=60, start=0.20), data=plotdf2, ax=ax[3, 2])
        ax[3, 2].tick_params(rotation=0)
        ax[3, 2].set_xlabel('ECFP radius')
    plt.tight_layout()
    fig.suptitle("Effect of hyperparameter tuning on model performance", y=1.01)
def plot_split_perf(df, scoretype='r2_score', subset='valid'):
    """This function plots boxplots of performance scores based on the splitter type.

    Args:
        df (pd.DataFrame): A dataframe containing model performances from a hyperparameter
            search. Best practice is to use get_multitask_perf_from_tracker() or
            get_filesystem_perf_results().

        scoretype (str): the score type you want to use. Valid options can be found in
            hpp.classselmets or hpp.regselmets.

        subset (str): the subset of scores you'd like to plot: 'train', 'valid' or 'test'.
    """
    sns.set_style("ticks")
    sns.set_context("paper")
    perf_track_df = _prep_perf_df(df).reset_index(drop=True)
    # Plot every metric in the family (regression or classification) the requested score belongs to.
    if scoretype in regselmets:
        selmets = regselmets
    elif scoretype in classselmets:
        selmets = classselmets
    else:
        raise ValueError(f"Unrecognized scoretype '{scoretype}'; see get_score_types() for valid options.")
    plot_df = perf_track_df.sort_values('features')
    fig, axes = plt.subplots(1, len(selmets), figsize=(5 * len(selmets), 5))
    for i, ax in enumerate(axes.flat):
        selection_metric = f'best_{subset}_{selmets[i]}'
        g = sns.boxplot(x="features",  # alternatives: x="txptr_features", x="model_type"
                        y=selection_metric,
                        hue='splitter',
                        palette=sns.color_palette(colors),
                        # showfliers=False,
                        data=plot_df, ax=ax)
        g.set_xlabel('')
        g.set_ylabel(selection_metric.replace(f'best_{subset}_', ''))
        g.set_xticklabels(g.get_xticklabels(), rotation=30, ha='right', rotation_mode='anchor')
    plt.tight_layout()
    fig.suptitle('Effect of splitter on model performance', y=1.01)
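# End-to-end sketch of the workflow described in the module docstring. The
# import path and the get_filesystem_perf_results() signature are assumptions
# based on that docstring, not verified here; adjust them to your installation.
def _example_workflow(result_dir):
    from atomsci.ddm.pipeline.compare_models import get_filesystem_perf_results  # assumed location
    results_df = get_filesystem_perf_results(result_dir, pred_type='regression')  # assumed signature
    plot_rf_nn_xg_perf(results_df, scoretype='r2_score', subset='valid')
    plot_split_perf(results_df, scoretype='r2_score', subset='valid')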