Source code for utils.hyperparam_search_wrapper

#!/usr/bin/env python

# noinspection SpellCheckingInspection
"""Script to generate hyperparameter combinations based on input params and send off jobs to a slurm system.
Author: Amanda Minnich
"""

# from __future__ import unicode_literals

import argparse
import collections
import os, os.path
import sys
import numpy as np
import logging
import itertools
from numpy.core.numeric import NaN
from collections.abc import Iterable, Iterator
import pandas as pd
import uuid

import subprocess
import shutil
import time

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

from atomsci.ddm.pipeline import featurization as feat
from atomsci.ddm.pipeline import model_pipeline as mp
from atomsci.ddm.pipeline import parameter_parser as parse
from atomsci.ddm.pipeline import model_datasets as model_datasets
from atomsci.ddm.utils import datastore_functions as dsf
from atomsci.ddm.pipeline import model_tracker as trkr
logging.basicConfig(format='%(asctime)-15s %(message)s')

import socket
import traceback
import copy
import pickle
import pdb


def run_command(shell_script, python_path, script_dir, params):
    """Function to submit jobs on a slurm system

    Args:
        shell_script: Name of shell script to run

        python_path: Path to python version

        script_dir: Directory where script lives

        params: parameters in dictionary format

    Returns:
        None
    """
    # dataset_hash sneaks into params.
    new_params = argparse.Namespace(**parse.remove_unrecognized_arguments(params))

    # It's necessary to make this call here because it makes sense for
    # relative paths to be calculated relative to the .json file, not to
    # wherever maestro will eventually run the model_pipeline script
    parse.make_dataset_key_absolute(new_params)

    params_str = parse.to_str(new_params)
    slurm_command = 'sbatch {0} {1} {2} "{3}"'.format(shell_script, python_path, script_dir, params_str)
    print(slurm_command)
    os.system(slurm_command)
def gen_maestro_command(python_path, script_dir, params):
    """Generates a string that can be fed into a command line.

    Side Effects:
        The dataset key will be converted to an absolute path before the command is returned. It's
        difficult to predict the working directory used when maestro runs the script.

    Args:
        python_path: Path to python version

        script_dir: Directory where script lives

        params: parameters in dictionary format

    Returns:
        str: Formatted command in the form of a string
    """
    # Converts dataset_key to an absolute path
    new_params = argparse.Namespace(**parse.remove_unrecognized_arguments(params))

    # It's necessary to make this call here because it makes sense for
    # relative paths to be calculated relative to the .json file, not to
    # wherever maestro will eventually run the model_pipeline script
    parse.make_dataset_key_absolute(new_params)

    params_str = parse.to_str(new_params)
    slurm_command = '{0} {1}/pipeline/model_pipeline.py {2}'.format(python_path, script_dir, params_str)
    return slurm_command
def run_cmd(cmd):
    """Function to submit a job using subprocess

    Args:
        cmd: Command to run

    Returns:
        output: Output of command
    """
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    (output, err) = p.communicate()
    p.wait()
    return output
def reformat_filter_dict(filter_dict):
    """Function to reformat a filter dictionary to match the Model Tracker metadata structure.
    Updated 9/2020 by A. Paulson for new LC model tracker.

    Args:
        filter_dict: Dictionary containing metadata for model of interest

    Returns:
        new_filter_dict: Reformatted filter dict
    """
    rename_dict = {
        'model_parameters': {'dependencies', 'featurizer', 'git_hash_code', 'model_bucket',
                             'model_choice_score_type', 'model_dataset_oid', 'model_type',
                             'num_model_tasks', 'prediction_type', 'save_results', 'system',
                             'task_type', 'time_generated', 'transformer_bucket', 'transformer_key',
                             'transformer_oid', 'transformers', 'uncertainty'},
        'splitting_parameters': {'base_splitter', 'butina_cutoff', 'cutoff_date', 'date_col',
                                 'num_folds', 'split_strategy', 'split_test_frac', 'split_uuid',
                                 'split_valid_frac', 'splitter'},
        'training_dataset': {'bucket', 'dataset_key', 'dataset_oid', 'num_classes',
                             'feature_transform_type', 'response_transform_type', 'id_col',
                             'smiles_col', 'response_cols'},
        'umap_specific': {'umap_dim', 'umap_metric', 'umap_min_dist', 'umap_neighbors', 'umap_targ_wt'}
    }
    if filter_dict['model_type'] == 'NN':
        rename_dict['nn_specific'] = {'baseline_epoch', 'batch_size', 'best_epoch', 'bias_init_consts',
                                      'dropouts', 'layer_sizes', 'learning_rate', 'max_epochs',
                                      'optimizer_type', 'weight_decay_penalty', 'weight_decay_penalty_type',
                                      'weight_init_stddevs'}
    elif filter_dict['model_type'] == 'RF':
        rename_dict['rf_specific'] = {'rf_estimators', 'rf_max_depth', 'rf_max_features'}
    elif filter_dict['model_type'] == 'xgboost':
        rename_dict['xgb_specific'] = {'xgb_colsample_bytree', 'xgb_gamma', 'xgb_learning_rate',
                                       'xgb_max_depth', 'xgb_min_child_weight', 'xgb_n_estimators',
                                       'xgb_subsample'}
    if filter_dict['featurizer'] == 'ecfp':
        rename_dict['ecfp_specific'] = {'ecfp_radius', 'ecfp_size'}
    elif (filter_dict['featurizer'] == 'descriptor') | (filter_dict['featurizer'] == 'computed_descriptors'):
        rename_dict['descriptor_specific'] = {'descriptor_key', 'descriptor_bucket', 'descriptor_oid', 'descriptor_type'}
    elif filter_dict['featurizer'] == 'molvae':
        rename_dict['autoencoder_specific'] = {'autoencoder_model_key', 'autoencoder_model_bucket',
                                               'autoencoder_model_oid', 'autoencoder_type'}
    new_filter_dict = {}
    for key, values in rename_dict.items():
        for value in values:
            if value in filter_dict:
                filter_val = filter_dict[value]
                if type(filter_val) == np.int64:
                    filter_dict[value] = int(filter_val)
                elif type(filter_val) == np.float64:
                    filter_dict[value] = float(filter_val)
                elif type(filter_val) == list:
                    for i, item in enumerate(filter_val):
                        if type(item) == np.int64:
                            filter_dict[value][i] = int(item)
                        elif type(item) == np.float64:
                            filter_dict[value][i] = float(item)
                new_filter_dict['%s.%s' % (key, value)] = filter_dict[value]
    return new_filter_dict
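# Illustrative sketch (not part of the original module): reformat_filter_dict maps flat metadata keys to the
# nested "<section>.<key>" names used by the model tracker. Assuming a minimal RF filter dict, the expected
# mapping would look roughly like this:
#
#     reformat_filter_dict({'model_type': 'RF', 'featurizer': 'ecfp', 'rf_estimators': 500, 'splitter': 'scaffold'})
#     # -> {'model_parameters.model_type': 'RF', 'model_parameters.featurizer': 'ecfp',
#     #     'rf_specific.rf_estimators': 500, 'splitting_parameters.splitter': 'scaffold'}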
def permutate_NNlayer_combo_params(layer_nums, node_nums, dropout_list, max_final_layer_size):
    """Generate combos of layer_sizes (str) and dropouts (str) params from the layer_nums (list),
    node_nums (list) and dropout_list (list).

    The permutation will make the NN funnel shaped, so that each layer can only be smaller than or
    the same size as the previous layer.

    Example: permutate_NNlayer_combo_params([2], [4,8,16], [0], 16)
    returns [[16, 4], [16, 8], [8, 4]], [[0, 0], [0, 0], [0, 0]]

    If there are duplicates of the same size, it will create consecutive layers of the same size.

    Example: permutate_NNlayer_combo_params([2], [4,8,8], [0], 16)
    returns [[8, 8], [8, 4]], [[0, 0], [0, 0]]

    Args:
        layer_nums: specify numbers of layers.

        node_nums: specify numbers of nodes per layer.

        dropout_list: specify the dropouts.

        max_final_layer_size: sets the max size of the last layer. It will be raised to the smallest
        node_num if needed.

    Returns:
        layer_sizes, dropouts: the layer sizes and dropouts generated from the input parameters
    """
    layer_sizes = []
    dropouts = []
    node_nums = np.sort(np.array(node_nums))[::-1]
    max_final_layer_size = int(max_final_layer_size)

    # set to the smallest node_num in the provided list, if necessary.
    if node_nums[-1] > max_final_layer_size:
        max_final_layer_size = node_nums[-1]

    for dropout in dropout_list:
        _repeated_layers = []
        for layer_num in layer_nums:
            for layer in itertools.combinations(node_nums, layer_num):
                layer = [i for i in layer]
                if (layer[-1] <= max_final_layer_size) and (layer not in _repeated_layers):
                    _repeated_layers.append(layer)
                    layer_sizes.append(layer)
                    dropouts.append([dropout for i in layer])
    return layer_sizes, dropouts
def get_num_params(combo):
    """Calculates the number of parameters in a fully-connected neural network

    Args:
        combo: Model parameters

    Returns:
        tmp_sum: Calculated number of parameters
    """
    layers = combo['layer_sizes']
    # All layers multiplied by adjacent layers, summed, plus the final layer times the number of samples.
    # The extra addition is for bias terms.
    tmp_sum = layers[0] + sum(layers[i] * layers[i + 1] + layers[i+1] for i in range(len(layers) - 1))
    # Add in first layer times the feature vector size. Estimate 300 for descriptors.
    #TODO: Update for moe vs mordred
    if combo['featurizer'] == 'ecfp':
        return tmp_sum + layers[0]*1024
    if combo['featurizer'] == 'descriptors':
        if combo['descriptor_type'] == 'moe':
            return tmp_sum + layers[0]*306
        if combo['descriptor_type'] == 'mordred_filtered':
            return tmp_sum + layers[0]*1555
    else:
        return tmp_sum
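# Worked example (added for illustration, not in the original source): for a hypothetical combo with
# layer_sizes [64, 16] and featurizer 'ecfp', the estimate is
#     tmp_sum = 64 + (64*16 + 16) = 1104        # weights between layers plus bias terms
#     total   = 1104 + 64*1024    = 66640       # plus the first layer times the ECFP feature vector size
# i.e. get_num_params({'layer_sizes': [64, 16], 'featurizer': 'ecfp'}) would return 66640.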
# Global variable with keys that should not be used to generate hyperparameters
excluded_keys = {'shortlist_key', 'use_shortlist', 'dataset_key', 'object_oid', 'script_dir',
                 'python_path', 'config_file', 'hyperparam', 'search_type', 'split_only',
                 'layer_nums', 'node_nums', 'dropout_list', 'max_final_layer_size', 'splitter',
                 'nn_size_scale_factor', 'rerun', 'max_jobs'}
class HyperparameterSearch(object):
    """The class for generating and running all hyperparameter combinations based on the input params given
    """

    def __init__(self, params):
        """
        Args:
            params: The input hyperparameter parameters

            hyperparam_uuid: Optional, UUID for the hyperparameter run if you want to group this run with a
            previous run. We ended up mainly doing this via collections, so it is not really used.
        """
        self.hyperparam_layers = {'layer_sizes', 'dropouts', 'weight_init_stddevs', 'bias_init_consts'}
        self.hyperparam_keys = {'model_type', 'featurizer', 'splitter', 'learning_rate', 'weight_decay_penalty',
                                'rf_estimators', 'rf_max_features', 'rf_max_depth', 'umap_dim', 'umap_targ_wt',
                                'umap_metric', 'umap_neighbors', 'umap_min_dist', 'xgb_learning_rate', 'xgb_gamma'}
        self.nn_specific_keys = {'learning_rate', 'layers', 'weight_decay_penalty'}
        self.rf_specific_keys = {'rf_estimators', 'rf_max_features', 'rf_max_depth'}
        self.xgboost_specific_keys = {'xgb_learning_rate', 'xgb_gamma'}
        self.hyperparam_keys |= self.hyperparam_layers
        self.excluded_keys = excluded_keys
        self.convert_to_float = parse.convert_to_float_list
        self.convert_to_int = parse.convert_to_int_list
        self.params = params

        # simplify NN layer construction
        if (params.layer_nums is not None) and (params.node_nums is not None) and (params.dropout_list is not None):
            self.params.layer_sizes, self.params.dropouts = permutate_NNlayer_combo_params(
                params.layer_nums, params.node_nums, params.dropout_list, params.max_final_layer_size)

        if params.hyperparam_uuid is None:
            self.hyperparam_uuid = str(uuid.uuid4())
        else:
            self.hyperparam_uuid = params.hyperparam_uuid
        self.hyperparams = {}
        self.new_params = {}
        self.layers = {}
        self.param_combos = []
        self.num_rows = {}
        self.log = logging.getLogger("hyperparam_search")
        # Create handlers
        c_handler = logging.StreamHandler()
        log_path = os.path.join(self.params.result_dir, 'logs')
        if not os.path.exists(log_path):
            os.makedirs(log_path)
        f_handler = logging.FileHandler(os.path.join(log_path, '{0}.log'.format(self.hyperparam_uuid)))
        self.out_file = open(os.path.join(log_path, '{0}.json'.format(self.hyperparam_uuid)), 'a')
        c_handler.setLevel(logging.WARNING)
        f_handler.setLevel(logging.INFO)
        # Create formatters and add them to the handlers
        c_format = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
        f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        c_handler.setFormatter(c_format)
        f_handler.setFormatter(f_format)
        # Add handlers to the logger
        self.log.addHandler(c_handler)
        self.log.addHandler(f_handler)

        slurm_path = os.path.join(self.params.result_dir, 'slurm_files')
        if not os.path.exists(slurm_path):
            os.makedirs(slurm_path)
        self.shell_script = os.path.join(self.params.result_dir, 'run.sh')
        with open(self.shell_script, 'w') as f:
            f.write("#!/bin/bash\n")
            f.write("#SBATCH -D {0}\n".format(slurm_path))
            # If any of these properties is None, that property is not set
            if self.params.slurm_account:
                f.write("#SBATCH -A {0}\n".format(self.params.slurm_account))
            elif self.params.lc_account:
                f.write("#SBATCH -A {0}\n".format(self.params.lc_account))
            if self.params.slurm_export:
                f.write("#SBATCH --export={0}\n".format(self.params.slurm_export))
            if self.params.slurm_nodes:
                f.write("#SBATCH -N {0}\n".format(self.params.slurm_nodes))
            if self.params.slurm_partition:
                f.write("#SBATCH -p {0}\n".format(self.params.slurm_partition))
            if self.params.slurm_time_limit:
                f.write("#SBATCH -t {0}\n".format(self.params.slurm_time_limit))
            if self.params.slurm_options:
                f.write('{0}\n'.format(self.params.slurm_options))
            f.write('start=`date +%s`\necho $3\n$1 $2/pipeline/model_pipeline.py $3\nend=`date +%s`\n'
                    'runtime=$((end-start))\necho "runtime: " $runtime')
    def generate_param_combos(self):
        """Performs additional parsing of parameters and generates all combinations

        Returns:
            None
        """
        for key, value in vars(self.params).items():
            if (value is None) or (key in self.excluded_keys):
                continue
            elif key == 'result_dir' or key == 'output_dir':
                self.new_params[key] = os.path.join(value, self.hyperparam_uuid)
            # Need to zip together layers in a special way
            elif key in self.hyperparam_layers and type(value[0]) == list:
                self.layers[key] = value
            # Parses the hyperparameter keys depending on the size of the key list
            elif key in self.hyperparam_keys:
                if type(value) != list:
                    self.new_params[key] = value
                    self.hyperparam_keys.remove(key)
                elif len(value) == 1:
                    self.new_params[key] = value[0]
                    self.hyperparam_keys.remove(key)
                else:
                    self.hyperparams[key] = value
            else:
                self.new_params[key] = value
        # Adds layers to the parameter combos
        if self.layers:
            self.assemble_layers()
        # Sets up the various hyperparameter combos for each model type.
        if type(self.params.model_type) == str:
            self.params.model_type = [self.params.model_type]
        if type(self.params.featurizer) == str:
            self.params.featurizer = [self.params.featurizer]
        if type(self.params.descriptor_type) == str:
            self.params.descriptor_type = [self.params.descriptor_type]
        for model_type in self.params.model_type:
            if model_type == 'NN':
                # if the model type is NN, loops through the featurizers to check for GraphConv.
                for featurizer in self.params.featurizer:
                    if featurizer == 'computed_descriptors':
                        for desc in self.params.descriptor_type:
                            subcombo = {k: val for k, val in self.hyperparams.items()
                                        if k in self.hyperparam_keys - self.rf_specific_keys - self.xgboost_specific_keys}
                            # could put in list
                            subcombo['model_type'] = [model_type]
                            subcombo['featurizer'] = [featurizer]
                            subcombo['descriptor_type'] = [desc]
                            self.param_combos.extend(self.generate_combos(subcombo))
                    else:
                        subcombo = {k: val for k, val in self.hyperparams.items()
                                    if k in self.hyperparam_keys - self.rf_specific_keys - self.xgboost_specific_keys}
                        # could put in list
                        subcombo['model_type'] = [model_type]
                        subcombo['featurizer'] = [featurizer]
                        subcombo['descriptor_type'] = ['moe']
                        if (featurizer == 'graphconv') & (self.params.prediction_type == 'classification'):
                            subcombo['uncertainty'] = [False]
                        self.param_combos.extend(self.generate_combos(subcombo))
            elif model_type == 'RF':
                for featurizer in self.params.featurizer:
                    if featurizer == 'graphconv':
                        continue
                    elif featurizer == 'computed_descriptors':
                        for desc in self.params.descriptor_type:
                            # Adds the subcombo for RF
                            subcombo = {k: val for k, val in self.hyperparams.items()
                                        if k in self.hyperparam_keys - self.nn_specific_keys - self.xgboost_specific_keys}
                            subcombo['model_type'] = [model_type]
                            subcombo['featurizer'] = [featurizer]
                            subcombo['descriptor_type'] = [desc]
                            self.param_combos.extend(self.generate_combos(subcombo))
                    else:
                        # Adds the subcombo for RF
                        subcombo = {k: val for k, val in self.hyperparams.items()
                                    if k in self.hyperparam_keys - self.nn_specific_keys - self.xgboost_specific_keys}
                        subcombo['model_type'] = [model_type]
                        subcombo['featurizer'] = [featurizer]
                        subcombo['descriptor_type'] = ['moe']
                        self.param_combos.extend(self.generate_combos(subcombo))
            elif model_type == 'xgboost':
                for featurizer in self.params.featurizer:
                    if featurizer == 'graphconv':
                        continue
                    elif featurizer == 'computed_descriptors':
                        for desc in self.params.descriptor_type:
                            # Adds the subcombo for xgboost
                            subcombo = {k: val for k, val in self.hyperparams.items()
                                        if k in self.hyperparam_keys - self.nn_specific_keys - self.rf_specific_keys}
                            subcombo['model_type'] = [model_type]
                            subcombo['featurizer'] = [featurizer]
                            subcombo['descriptor_type'] = [desc]
                            self.param_combos.extend(self.generate_combos(subcombo))
                    else:
                        # Adds the subcombo for xgboost
                        subcombo = {k: val for k, val in self.hyperparams.items()
                                    if k in self.hyperparam_keys - self.nn_specific_keys - self.rf_specific_keys}
                        subcombo['model_type'] = [model_type]
                        subcombo['featurizer'] = [featurizer]
                        subcombo['descriptor_type'] = ['moe']
                        self.param_combos.extend(self.generate_combos(subcombo))
    def generate_combos(self, params_dict):
        """Calls the sub-function generate_combo and then uses itertools.product to generate all desired combinations

        Args:
            params_dict: dictionary mapping each hyperparameter name to its list of candidate values

        Returns:
            hyperparam_combos: list of parameter combination dictionaries
        """
        new_dict = self.generate_combo(params_dict)
        hyperparam_combos = []
        hyperparams = new_dict.keys()
        hyperparam_vals = new_dict.values()
        for ind, hyperparameter_tuple in enumerate(itertools.product(*hyperparam_vals)):
            model_params = {}
            for hyperparam, hyperparam_val in zip(hyperparams, hyperparameter_tuple):
                model_params[hyperparam] = hyperparam_val
            hyperparam_combos.append(model_params)
        return hyperparam_combos
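    # Example (illustrative only): once the subclass-specific generate_combo has expanded each value list,
    # itertools.product yields every combination, e.g. the expanded dict
    #     {'learning_rate': [0.001, 0.0001], 'rf_estimators': [100, 500]}
    # produces four model parameter dicts:
    #     {'learning_rate': 0.001, 'rf_estimators': 100}, {'learning_rate': 0.001, 'rf_estimators': 500},
    #     {'learning_rate': 0.0001, 'rf_estimators': 100}, {'learning_rate': 0.0001, 'rf_estimators': 500}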
    def assemble_layers(self):
        """Reformats layer parameters

        Returns:
            None
        """
        tmp_list = []
        for i in range(min([len(x) for x in list(self.layers.values())])):
            tmp_dict = {}
            for key, value in self.layers.items():
                tmp_dict[key] = value[i]
            x = [len(y) for y in tmp_dict.values()]
            try:
                assert x.count(x[0]) == len(x)
            except:
                continue
            tmp_list.append(tmp_dict)
        self.hyperparams['layers'] = tmp_list
        self.hyperparam_keys.add('layers')
    def generate_assay_list(self):
        """Generates the list of datasets to build models for, with their key, bucket, split, and split uuid

        Returns:
            None
        """
        # Creates the assay list with additional options for use_shortlist
        if not self.params.use_shortlist:
            if type(self.params.splitter) == str:
                splitters = [self.params.splitter]
            else:
                splitters = self.params.splitter
            self.assays = []
            for splitter in splitters:
                if 'previously_split' in self.params.__dict__.keys() and 'split_uuid' in self.params.__dict__.keys() \
                        and self.params.previously_split and self.params.split_uuid is not None:
                    self.assays.append((self.params.dataset_key, self.params.bucket, self.params.response_cols,
                                        self.params.collection_name, self.params.splitter, self.params.split_uuid))
                else:
                    try:
                        split_uuid = self.return_split_uuid(self.params.dataset_key, splitter=splitter)
                        self.assays.append((self.params.dataset_key, self.params.bucket, self.params.response_cols,
                                            self.params.collection_name, splitter, split_uuid))
                    except Exception as e:
                        print(e)
                        print(traceback.print_exc())
                        sys.exit(1)
        else:
            self.assays = self.get_shortlist_df(split_uuids=True)
        self.assays = [(t[0].strip(), t[1].strip(), t[2], t[3].strip(), t[4].strip(), t[5].strip()) for t in self.assays]
    def get_dataset_metadata(self, assay_params, retry_time=60):
        """Gather the required metadata for a dataset

        Args:
            assay_params: dataset metadata

        Returns:
            None
        """
        if not self.params.datastore:
            return
        print(assay_params['dataset_key'])
        retry = True
        i = 0
        #TODO: need to catch if dataset doesn't exist versus 500 failure
        while retry:
            try:
                metadata = dsf.get_keyval(dataset_key=assay_params['dataset_key'], bucket=assay_params['bucket'])
                retry = False
            except Exception as e:
                if i < 5:
                    print("Could not get metadata from datastore for dataset %s because of exception %s, sleeping..."
                          % (assay_params['dataset_key'], e))
                    time.sleep(retry_time)
                    i += 1
                else:
                    print("Could not get metadata from datastore for dataset %s because of exception %s, exiting"
                          % (assay_params['dataset_key'], e))
                    return None
        if 'id_col' in metadata.keys():
            assay_params['id_col'] = metadata['id_col']
        if 'response_cols' not in assay_params or assay_params['response_cols'] is None:
            if 'param' in metadata.keys():
                assay_params['response_cols'] = [metadata['param']]
            if 'response_col' in metadata.keys():
                assay_params['response_cols'] = [metadata['response_col']]
            if 'response_cols' in metadata.keys():
                assay_params['response_cols'] = metadata['response_cols']
        if 'smiles_col' in metadata.keys():
            assay_params['smiles_col'] = metadata['smiles_col']
        if 'class_name' in metadata.keys():
            assay_params['class_name'] = metadata['class_name']
        if 'class_number' in metadata.keys():
            assay_params['class_number'] = metadata['class_number']
        if 'num_row' in metadata.keys():
            self.num_rows[assay_params['dataset_key']] = metadata['num_row']
        assay_params['dataset_name'] = assay_params['dataset_key'].split('/')[-1].rstrip('.csv')
        assay_params['hyperparam_uuid'] = self.hyperparam_uuid
    def split_and_save_dataset(self, assay_params):
        """Splits a given dataset, saves it, and sets the split_uuid in the metadata

        Args:
            assay_params: Dataset metadata

        Returns:
            None
        """
        self.get_dataset_metadata(assay_params)
        # TODO: check usage with defaults
        namespace_params = parse.wrapper(assay_params)
        # TODO: Don't want to recreate each time
        featurization = feat.create_featurization(namespace_params)
        data = model_datasets.create_model_dataset(namespace_params, featurization)
        data.get_featurized_data()
        data.split_dataset()
        data.save_split_dataset()
        assay_params['previously_split'] = True
        assay_params['split_uuid'] = data.split_uuid
    def return_split_uuid(self, dataset_key, bucket=None, splitter=None, split_combo=None, retry_time=60):
        """Loads a dataset, splits it, saves it, and returns the split_uuid

        Args:
            dataset_key: key for the dataset to split

            bucket: datastore-specific user group bucket

            splitter: Type of splitter to use to split the dataset

            split_combo: tuple of form (split_valid_frac, split_test_frac)

        Returns:
            split_uuid of the newly created split, or None on failure
        """
        if bucket is None:
            bucket = self.params.bucket
        if splitter is None:
            splitter = self.params.splitter
        if split_combo is None:
            split_valid_frac = self.params.split_valid_frac
            split_test_frac = self.params.split_test_frac
        else:
            split_valid_frac = split_combo[0]
            split_test_frac = split_combo[1]
        retry = True
        i = 0
        #TODO: need to catch if dataset doesn't exist versus 500 failure
        while retry:
            try:
                metadata = dsf.get_keyval(dataset_key=dataset_key, bucket=bucket)
                retry = False
            except Exception as e:
                if i < 5:
                    print("Could not get metadata from datastore for dataset %s because of exception %s, sleeping..."
                          % (dataset_key, e))
                    time.sleep(retry_time)
                    i += 1
                else:
                    print("Could not get metadata from datastore for dataset %s because of exception %s, exiting"
                          % (dataset_key, e))
                    return None
        assay_params = {'dataset_key': dataset_key, 'bucket': bucket, 'splitter': splitter,
                        'split_valid_frac': split_valid_frac, 'split_test_frac': split_test_frac}
        # Need a featurizer type to split the dataset, but since we only care about getting the split_uuid,
        # it does not matter which featurizer is used
        if type(self.params.featurizer) == list:
            assay_params['featurizer'] = self.params.featurizer[0]
        else:
            assay_params['featurizer'] = self.params.featurizer
        if 'id_col' in metadata.keys():
            assay_params['id_col'] = metadata['id_col']
        if 'response_cols' not in assay_params or assay_params['response_cols'] is None:
            if 'param' in metadata.keys():
                assay_params['response_cols'] = [metadata['param']]
            if 'response_col' in metadata.keys():
                assay_params['response_cols'] = [metadata['response_col']]
            if 'response_cols' in metadata.keys():
                assay_params['response_cols'] = metadata['response_cols']
        if 'smiles_col' in metadata.keys():
            assay_params['smiles_col'] = metadata['smiles_col']
        if 'class_name' in metadata.keys():
            assay_params['class_name'] = metadata['class_name']
        if 'class_number' in metadata.keys():
            assay_params['class_number'] = metadata['class_number']
        assay_params['dataset_name'] = assay_params['dataset_key'].split('/')[-1].rstrip('.csv')
        assay_params['datastore'] = True
        assay_params['previously_featurized'] = self.params.previously_featurized
        try:
            assay_params['descriptor_key'] = self.params.descriptor_key
            assay_params['descriptor_bucket'] = self.params.descriptor_bucket
        except:
            print("")
        #TODO: check usage with defaults
        namespace_params = parse.wrapper(assay_params)
        # TODO: Don't want to recreate each time
        featurization = feat.create_featurization(namespace_params)
        data = model_datasets.create_model_dataset(namespace_params, featurization)
        retry = True
        i = 0
        while retry:
            try:
                data.get_featurized_data()
                data.split_dataset()
                data.save_split_dataset()
                return data.split_uuid
            except Exception as e:
                if i < 5:
                    print("Could not featurize or split dataset %s because of exception %s, sleeping"
                          % (dataset_key, e))
                    time.sleep(retry_time)
                    i += 1
                else:
                    print("Could not save split dataset for dataset %s because of exception %s" % (dataset_key, e))
                    return None
    def return_split_uuid_file(self, dataset_key, response_cols, bucket=None, splitter=None, split_combo=None, retry_time=60):
        """Loads a dataset, splits it, saves it, and returns the split_uuid.

        Args:
            dataset_key: key for the dataset to split

            response_cols: response column(s) for the dataset

            bucket: datastore-specific user group bucket

            splitter: Type of splitter to use to split the dataset

            split_combo: tuple of form (split_valid_frac, split_test_frac)

        Returns:
            split_uuid of the newly created split
        """
        if bucket is None:
            bucket = self.params.bucket
        if splitter is None:
            splitter = self.params.splitter
        if split_combo is None:
            split_valid_frac = self.params.split_valid_frac
            split_test_frac = self.params.split_test_frac
        else:
            split_valid_frac = split_combo[0]
            split_test_frac = split_combo[1]
        assay_params = {'dataset_key': dataset_key, 'bucket': bucket, 'splitter': splitter,
                        'split_valid_frac': split_valid_frac, 'split_test_frac': split_test_frac}
        if 'id_col' in self.params.__dict__.keys():
            assay_params['id_col'] = self.params.id_col
        if 'smiles_col' in self.params.__dict__.keys():
            assay_params['smiles_col'] = self.params.smiles_col
        if isinstance(response_cols, list):
            assay_params['response_cols'] = ",".join(response_cols)
        elif isinstance(response_cols, str):
            assay_params['response_cols'] = response_cols
        assay_params['dataset_name'] = assay_params['dataset_key'].split('/')[-1].replace('.csv', '')
        # rdkit_raw because it's the fastest and won't have to be redone for every split
        assay_params['featurizer'] = 'computed_descriptors'
        assay_params['descriptor_type'] = 'rdkit_raw'
        assay_params['previously_featurized'] = True
        assay_params['datastore'] = False
        namespace_params = parse.wrapper(assay_params)
        # TODO: Don't want to recreate each time
        featurization = feat.create_featurization(namespace_params)
        data = model_datasets.create_model_dataset(namespace_params, featurization)
        data.get_featurized_data()
        data.split_dataset()
        data.save_split_dataset()
        return data.split_uuid
    def generate_split_shortlist(self, retry_time=60):
        """Processes a shortlist, generates splits for each dataset on the list, and uploads a new shortlist file
        with the split_uuids included. Generates splits for the split_combos
        [[0.1,0.1], [0.15,0.15], [0.1,0.2], [0.2,0.2]] and the splitters [random, scaffold, fingerprint].

        Returns:
            None
        """
        retry = True
        i = 0
        while retry:
            try:
                shortlist_metadata = dsf.retrieve_dataset_by_datasetkey(
                    bucket=self.params.bucket, dataset_key=self.params.shortlist_key, return_metadata=True)
                retry = False
            except Exception as e:
                if i < 5:
                    print("Could not retrieve shortlist %s from datastore because of exception %s, sleeping..."
                          % (self.params.shortlist_key, e))
                    time.sleep(retry_time)
                    i += 1
                else:
                    print("Could not retrieve shortlist %s from datastore because of exception %s, exiting"
                          % (self.params.shortlist_key, e))
                    return None
        datasets = self.get_shortlist_df()
        rows = []
        for assay, bucket, response_cols, collection in datasets:
            split_uuids = {'dataset_key': assay, 'bucket': bucket, 'response_cols': response_cols,
                           'collection': collection}
            for splitter in ['random', 'scaffold', 'fingerprint']:
                for split_combo in [[0.1, 0.1], [0.15, 0.15], [0.1, 0.2], [0.2, 0.2]]:
                    split_name = "%s_%d_%d" % (splitter, split_combo[0]*100, split_combo[1]*100)
                    try:
                        split_uuids[split_name] = self.return_split_uuid(assay, bucket, splitter, split_combo)
                    except Exception as e:
                        print(e)
                        print("Splitting failed for dataset %s" % assay)
                        split_uuids[split_name] = None
                        continue
            rows.append(split_uuids)
        df = pd.DataFrame(rows)
        new_metadata = {}
        new_metadata['dataset_key'] = shortlist_metadata['dataset_key'].strip('.csv') + '_with_uuids.csv'
        new_metadata['has_uuids'] = True
        new_metadata['description'] = '%s, with UUIDs' % shortlist_metadata['description']
        retry = True
        i = 0
        while retry:
            try:
                dsf.upload_df_to_DS(df, bucket=self.params.bucket, filename=new_metadata['dataset_key'],
                                    title=new_metadata['dataset_key'].replace('_', ' '),
                                    description=new_metadata['description'],
                                    tags=[], key_values={}, dataset_key=new_metadata['dataset_key'])
                retry = False
            except Exception as e:
                if i < 5:
                    print("Could not save new shortlist because of exception %s, sleeping..." % e)
                    time.sleep(retry_time)
                    i += 1
                else:
                    #TODO: Add save to disk.
                    print("Could not save new shortlist because of exception %s, exiting" % e)
                    retry = False
    def generate_split_shortlist_file(self):
        """Processes a shortlist, generates splits for each dataset on the list, and writes a new shortlist file
        with the split_uuids included. Generates splits for the split_combos
        [[0.1,0.1], [0.15,0.15], [0.1,0.2], [0.2,0.2]] and the splitters [random, scaffold, fingerprint].

        Returns:
            None
        """
        datasets = self.get_shortlist_df()
        rows = []
        for assay, bucket, response_cols, collection in datasets:
            split_uuids = {'dataset_key': assay, 'bucket': bucket, 'response_cols': response_cols,
                           'collection': collection}
            for splitter in ['random', 'scaffold', 'fingerprint']:
                for split_combo in [[0.1, 0.1], [0.15, 0.15], [0.1, 0.2], [0.2, 0.2]]:
                    split_name = "%s_%d_%d" % (splitter, split_combo[0]*100, split_combo[1]*100)
                    try:
                        split_uuids[split_name] = self.return_split_uuid_file(assay, response_cols, bucket, splitter, split_combo)
                    except Exception as e:
                        print(e)
                        print("Splitting failed for dataset %s" % assay)
                        split_uuids[split_name] = None
                        continue
            rows.append(split_uuids)
        df = pd.DataFrame(rows)
        fname = self.params.shortlist_key.replace('.csv', '_with_uuids.csv')
        df.to_csv(fname, index=False)
    def get_shortlist_df(self, split_uuids=False, retry_time=60):
        """Get the shortlist dataframe

        Args:
            split_uuids: Boolean saying if you want just the datasets returned or the split_uuids as well

        Returns:
            The list of dataset_keys, along with their accompanying bucket, response columns, and collection,
            plus the split type and split_uuid if split_uuids is True
        """
        if self.params.datastore:
            retry = True
            i = 0
            while retry:
                try:
                    df = dsf.retrieve_dataset_by_datasetkey(self.params.shortlist_key, self.params.bucket)
                    retry = False
                except Exception as e:
                    if i < 5:
                        print("Could not retrieve shortlist %s because of exception %s, sleeping..."
                              % (self.params.shortlist_key, e))
                        time.sleep(retry_time)
                        i += 1
                    else:
                        print("Could not retrieve shortlist %s because of exception %s, exiting"
                              % (self.params.shortlist_key, e))
                        sys.exit(1)
        else:
            if not os.path.exists(self.params.shortlist_key):
                return None
            df = pd.read_csv(self.params.shortlist_key, index_col=False)
        if df is None:
            sys.exit(1)
        if len(df.columns) == 1:
            assays = df[df.columns[0]].values.tolist()
        else:
            if 'task_name' in df.columns:
                col_name = 'task_name'
            else:
                col_name = 'dataset_key'
            assays = df[col_name].values.tolist()
        if 'bucket' in df.columns:
            buckets = df['bucket'].values.tolist()
        elif 'bucket_name' in df.columns:
            buckets = df['bucket_name'].values.tolist()
        else:
            buckets = [self.params.bucket]*len(df)
        if 'response_cols' in df.columns:
            responses = df.response_cols.str.split(',').tolist()
        else:
            responses = [self.params.response_cols]*len(df)
        if 'collection' in df.columns:
            collections = df.collection.values.tolist()
        else:
            collections = [self.params.collection_name]*len(df)
        datasets = list(zip(assays, buckets, responses, collections))
        datasets = [(d[0].strip(), d[1].strip(), ",".join(d[2]), d[3].strip()) for d in datasets]
        if not split_uuids:
            return datasets
        if type(self.params.splitter) == str:
            splitters = [self.params.splitter]
        else:
            splitters = self.params.splitter
        assays = []
        for splitter in splitters:
            split_name = '%s_%d_%d' % (splitter, self.params.split_valid_frac*100, self.params.split_test_frac*100)
            if split_name in df.columns:
                for i, row in df.iterrows():
                    try:
                        assays.append((datasets[i][0], datasets[i][1], datasets[i][2], datasets[i][3],
                                       splitter, row[split_name]))
                    except:
                        print("dataset_key, bucket, response_cols, & collection_name must be specified"
                              " in either the shortlist or the config file.")
            else:
                print(f"Warning: {split_name} not found in shortlist. Creating default split scaffold_10_10 now.")
                for assay, bucket, response_cols, collection in datasets:
                    try:
                        # do we want to move this into the loop so we ignore the ones it failed for?
                        if self.params.datastore:
                            split_uuid = self.return_split_uuid(assay, bucket)
                        else:
                            split_uuid = self.return_split_uuid_file(assay, response_cols, bucket)
                        assays.append((assay, bucket, response_cols, collection, splitter, split_uuid))
                    except Exception as e:
                        print("Splitting failed for dataset %s, skipping..." % assay)
                        print(e)
                        print(traceback.print_exc())
                        continue
        return assays
    def build_jobs(self):
        """Builds jobs. Reformats parameters as necessary.

        Returns:
            result_assay_params: list of parameter dictionaries, one per job
        """
        result_assay_params = []
        for assay, bucket, response_cols, collection, splitter, split_uuid in self.assays:
            # Writes the series of command line arguments for scripts without a hyperparameter combo
            assay_params = copy.deepcopy(self.new_params)
            assay_params['dataset_key'] = assay
            assay_params['dataset_name'] = os.path.splitext(os.path.basename(assay))[0]
            assay_params['bucket'] = bucket
            assay_params['response_cols'] = response_cols
            assay_params['collection_name'] = collection
            assay_params['split_uuid'] = split_uuid
            assay_params['previously_split'] = True
            assay_params['splitter'] = splitter
            print(f"prediction_type: {assay_params['prediction_type']}")
            try:
                self.get_dataset_metadata(assay_params)
            except Exception as e:
                print(e)
                print(traceback.print_exc())
                continue
            # creates output directory
            base_result_dir = os.path.join(assay_params['result_dir'], assay_params['dataset_name'])
            if not self.param_combos:
                assay_params['result_dir'] = os.path.join(base_result_dir, str(uuid.uuid4()))
                result_assay_params.append(assay_params)
            else:
                for combo in self.param_combos:
                    # For a temporary parameter list, appends and modifies parameters for each hyperparameter combo.
                    combo_params = copy.deepcopy(assay_params)
                    for key, value in combo.items():
                        if key == 'layers':
                            for k, v in value.items():
                                combo_params[k] = v
                        else:
                            combo_params[key] = value
                    combo_params['result_dir'] = os.path.join(base_result_dir, str(uuid.uuid4()))
                    result_assay_params.append(combo_params)
        return result_assay_params
    def filter_jobs(self, job_list):
        """Removes jobs that should not be run

        Args:
            job_list: list of job parameter dictionaries

        Returns:
            result_list: the filtered list of job parameter dictionaries
        """
        result_list = []
        for assay_params in job_list:
            if assay_params['model_type'] == 'NN' and assay_params['featurizer'] != 'graphconv':
                if assay_params['dataset_key'] in self.num_rows:
                    num_params = get_num_params(assay_params)
                    if num_params*self.params.nn_size_scale_factor >= self.num_rows[assay_params['dataset_key']]:
                        continue
            if not self.params.rerun and self.already_run(assay_params):
                continue
            result_list.append(assay_params)
        return result_list
    def submit_jobs(self, job_list, retry_time=60):
        """Reformats parameters as necessary and then calls run_command in a loop to submit a job for each param combo

        Returns:
            None
        """
        for assay_params in job_list:
            if len(self.filter_jobs([assay_params])) == 1:
                i = int(run_cmd('squeue | grep $(whoami) | wc -l').decode("utf-8"))
                while i >= self.params.max_jobs:
                    print("%d jobs in queue, sleeping" % i)
                    time.sleep(retry_time)
                    i = int(run_cmd('squeue | grep $(whoami) | wc -l').decode("utf-8"))
                self.log.info(assay_params)
                self.out_file.write(str(assay_params))
                run_command(self.shell_script, self.params.python_path, self.params.script_dir, assay_params)
    def already_run(self, assay_params, retry_time=10):
        """Checks to see if a model with a given metadata combination has already been built

        Args:
            assay_params: model metadata information

        Returns:
            Boolean specifying if the model has been previously built
        """
        if not self.params.save_results:
            return False
        filter_dict = copy.deepcopy(assay_params)
        for key in ['result_dir', 'previously_featurized', 'collection_name', 'time_generated',
                    'hyperparam_uuid', 'model_uuid']:
            if key in filter_dict:
                del filter_dict[key]
        filter_dict = reformat_filter_dict(filter_dict)
        retry = True
        i = 0
        while retry:
            try:
                print(f"Checking model tracker DB for existing model with parameter combo in "
                      f"{assay_params['collection_name']} collection.")
                models = list(trkr.get_full_metadata(filter_dict, collection_name=assay_params['collection_name']))
                retry = False
            except Exception as e:
                if i < 5:
                    time.sleep(retry_time)
                    i += 1
                else:
                    print("Could not check Model Tracker for existing model at this time because of exception %s" % e)
                    return False
        if models:
            print("Already created model for this param combo")
            return True
        print("No existing model found")
        return False
    def generate_combo(self, params_dict):
        """This is implemented in the specific sub-classes
        """
        raise NotImplementedError
    def generate_searches(self):
        """Generate a list of training jobs

        Generates a list of model training jobs that spans the hyperparameter search space.
        This function filters out jobs that are redundant by calling filter_jobs

        Args:
            None

        Returns:
            list(tuple): A list of tuples that contain assay parameters
        """
        print("Generating param combos")
        self.generate_param_combos()
        print("Generating assay list")
        self.generate_assay_list()
        print("Building jobs")
        job_list = self.build_jobs()
        # print("filter redundant jobs")
        # job_list = self.filter_jobs(job_list)
        return job_list
    def generate_maestro_commands(self):
        """Generates commands that can be used by maestro

        Generates a list of commands that can be put directly into the shell to run model training.

        Args:
            None

        Returns:
            list: A list of shell commands
        """
        job_list = self.generate_searches()
        commands = []
        for assay_params in job_list:
            commands.append(gen_maestro_command(self.params.python_path, self.params.script_dir, assay_params))
        return commands
class GridSearch(HyperparameterSearch):
    """Generates fixed steps on a grid for a given hyperparameter range"""

    def __init__(self, params):
        super().__init__(params)
    def split_and_save_dataset(self, assay_params):
        super().split_and_save_dataset(assay_params)
    def generate_param_combos(self):
        super().generate_param_combos()
    def generate_assay_list(self):
        super().generate_assay_list()
    def generate_combo(self, params_dict):
        """Method to generate all combinations from a given set of key-value pairs

        Args:
            params_dict: Set of key-value pairs with the key being the param name and the value being
            the list of values you want to try for that param

        Returns:
            new_dict: The dictionary of expanded parameter value lists
        """
        if not params_dict:
            return None
        new_dict = {}
        for key, value in params_dict.items():
            assert isinstance(value, Iterable)
            if key == 'layers':
                new_dict[key] = value
            elif type(value[0]) != str:
                tmp_list = list(np.linspace(value[0], value[1], value[2]))
                if key in self.convert_to_int:
                    new_dict[key] = [int(x) for x in tmp_list]
                else:
                    new_dict[key] = tmp_list
            else:
                new_dict[key] = value
        return new_dict
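# Illustrative sketch (assumed values, not from the original source): for GridSearch, a numeric triple is
# interpreted as (start, stop, num_steps) and expanded with np.linspace, e.g.
#     {'learning_rate': [0.0001, 0.001, 3]}  ->  {'learning_rate': [0.0001, 0.00055, 0.001]}
# while keys listed in parse.convert_to_int_list are cast to integers after the expansion.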
class RandomSearch(HyperparameterSearch):
    """Generates the specified number of random parameter values within the specified range"""

    def __init__(self, params):
        super().__init__(params)
    def split_and_save_dataset(self, assay_params):
        super().split_and_save_dataset(assay_params)
    def generate_param_combos(self):
        super().generate_param_combos()
    def generate_assay_list(self):
        super().generate_assay_list()
    def generate_combo(self, params_dict):
        """Method to generate all combinations from a given set of key-value pairs

        Args:
            params_dict: Set of key-value pairs with the key being the param name and the value being
            the list of values you want to try for that param

        Returns:
            new_dict: The dictionary of expanded parameter value lists
        """
        if not params_dict:
            return None
        new_dict = {}
        for key, value in params_dict.items():
            assert isinstance(value, Iterable)
            if key == 'layers':
                new_dict[key] = value
            elif type(value[0]) != str:
                tmp_list = list(np.random.uniform(value[0], value[1], value[2]))
                if key in self.convert_to_int:
                    new_dict[key] = [int(x) for x in tmp_list]
                else:
                    new_dict[key] = tmp_list
            else:
                new_dict[key] = value
        return new_dict
class GeometricSearch(HyperparameterSearch):
    """Generates parameter values in logarithmic steps, rather than linear steps like GridSearch does"""

    def __init__(self, params):
        super().__init__(params)
    def split_and_save_dataset(self, assay_params):
        super().split_and_save_dataset(assay_params)
    def generate_param_combos(self):
        super().generate_param_combos()
    def generate_assay_list(self):
        super().generate_assay_list()
    def generate_combo(self, params_dict):
        """Method to generate all combinations from a given set of key-value pairs

        Args:
            params_dict: Set of key-value pairs with the key being the param name and the value being
            the list of values you want to try for that param

        Returns:
            new_dict: The dictionary of expanded parameter value lists
        """
        if not params_dict:
            return None
        new_dict = {}
        for key, value in params_dict.items():
            assert isinstance(value, Iterable)
            if key == 'layers':
                new_dict[key] = value
            elif type(value[0]) != str:
                tmp_list = list(np.geomspace(value[0], value[1], int(value[2])))
                if key in self.convert_to_int:
                    new_dict[key] = [int(x) for x in tmp_list]
                else:
                    new_dict[key] = tmp_list
            else:
                new_dict[key] = value
        return new_dict
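# Illustrative sketch (assumed values): GeometricSearch interprets the same (start, stop, num_steps) triple
# but spaces the values evenly on a log scale with np.geomspace, e.g.
#     {'learning_rate': [0.0001, 0.01, 3]}  ->  {'learning_rate': [0.0001, 0.001, 0.01]}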
class UserSpecifiedSearch(HyperparameterSearch):
    """Generates combinations using the user-specified steps"""

    def __init__(self, params):
        super().__init__(params)
    def split_and_save_dataset(self, assay_params):
        super().split_and_save_dataset(assay_params)
    def generate_param_combos(self):
        super().generate_param_combos()
    def generate_assay_list(self):
        super().generate_assay_list()
    def generate_combo(self, params_dict):
        """Method to generate all combinations from a given set of key-value pairs

        Args:
            params_dict: Set of key-value pairs with the key being the param name and the value being
            the list of values you want to try for that param

        Returns:
            new_dict: The dictionary of expanded parameter value lists
        """
        if not params_dict:
            return None
        new_dict = {}
        for key, value in params_dict.items():
            assert isinstance(value, Iterable)
            if key == 'layers':
                new_dict[key] = value
            elif key in self.convert_to_int:
                new_dict[key] = [int(x) for x in value]
            elif key in self.convert_to_float:
                new_dict[key] = [float(x) for x in value]
            else:
                new_dict[key] = value
        return new_dict
def build_hyperopt_search_domain(label, method, param_list):
    """Generate a HyperOpt search domain object from the given method and parameters; layer_nums is only
    used for NN models. This function is used by the HyperOptSearch class and is not intended for
    standalone usage.
    """
    if method == "choice":
        return hp.choice(label, param_list)
    elif method == "uniform":
        return hp.uniform(label, param_list[0], param_list[1])
    elif method == "loguniform":
        return hp.loguniform(label, param_list[0], param_list[1])
    elif method == "uniformint":
        return hp.uniformint(label, param_list[0], param_list[1])
    else:
        raise Exception(f"Method {method} is not supported, choose from 'choice', 'uniform', 'loguniform', 'uniformint'.")
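# Example (for illustration; the parsing of the "method|params" string happens in HyperOptSearch.__init__):
#     build_hyperopt_search_domain("rf_estimators", "uniformint", [64.0, 512.0])
# returns the same object as hp.uniformint("rf_estimators", 64.0, 512.0), i.e. an integer drawn uniformly
# from [64, 512] each time the TPE algorithm samples this dimension.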
class HyperOptSearch():
    """Perform hyperparameter search with Bayesian Optimization (Tree-structured Parzen Estimator)

    To use HyperOptSearch, modify the config json file as follows:

    search_type: use "hyperopt"

    result_dir: use two directories (recommended), separated by a comma; the 1st one will be used to save
    the best model tarball, the 2nd one will be used to store all models built during the process.
    e.g. "result_dir": "/path/of/the/final/dir,/path/of/the/temp/dir"

    model_type: RF or NN, optionally with the max number of HyperOptSearch evaluations appended,
    e.g. "model_type": "RF|100". If no max number is provided, the default of 100 will be used.

    #For NN models only

    lr: specify the learning rate search method and related parameters with the following scheme:
    method|parameter1,parameter2...

        method: supported search schemes in HyperOpt include choice, uniform, loguniform, and uniformint;
        see https://github.com/hyperopt/hyperopt/wiki/FMin for details.

        parameters:
            choice: all values to search from, separated by commas, e.g. choice|0.0001,0.0005,0.0002,0.001

            uniform: low and high bounds of the interval to search, e.g. uniform|0.00001,0.001

            loguniform: low and high bounds (in natural log) of the interval to search, e.g. loguniform|-13.8,-6.9

            uniformint: low and high bounds of the interval to search, e.g. uniformint|8,256

    ls: similar to learning_rate; specify the number of layers and the size of each one.
    method|num_layers|parameter1,parameter2...

        e.g. choice|2|8,16,32,64,128,256,512  #this generates a two-layer config, where each layer takes its
        size from the list "8,16,32,64,128,256,512"

        e.g. uniformint|3|8,512  #this generates a three-layer config, where each layer takes its size from
        the uniform interval [8,512]

    dp: similar to layer_sizes; just make sure dropouts and layer_sizes have the same number of layers.

        e.g. uniform|3|0,0.4  #this generates a three-layer config, where each layer takes its dropout from
        the uniform interval [0,0.4]

    #For RF models only

    rfe: rf_estimators, same structure as the learning rate above, e.g. uniformint|64,512  #take integer
    values from a uniform interval [64,512]

    rfd: rf_max_depth, e.g. uniformint|8,256

    rff: rf_max_features, e.g. uniformint|8,128
    """

    def __init__(self, params):
        self.params = params
        # separate temp output dir and final output dir
        result_dir_list = params.result_dir.split(",")
        if len(result_dir_list) > 1:
            self.params.result_dir = result_dir_list[1]
            self.final_dir = result_dir_list[0]
        else:
            self.params.result_dir = result_dir_list[0]
            self.final_dir = result_dir_list[0]
        if len(self.params.model_type.split("|")) > 1:
            self.max_eval = int(self.params.model_type.split("|")[1])
            self.params.model_type = self.params.model_type.split("|")[0]
        else:
            self.max_eval = 100

        # define the search space
        self.space = {}
        if isinstance(self.params.featurizer, list):
            self.space["featurizer"] = build_hyperopt_search_domain("featurizer", "choice", self.params.featurizer)
        if isinstance(self.params.descriptor_type, list):
            self.space["descriptor_type"] = build_hyperopt_search_domain("descriptor_type", "choice", self.params.descriptor_type)

        if self.params.model_type == "RF":
            # build search domain for RF parameters
            if self.params.rfe:
                domain_list = self.params.rfe.split("|")
                method = domain_list[0]
                par_list = [float(e) for e in domain_list[1].split(",")]
                self.space["rf_estimators"] = build_hyperopt_search_domain("rf_estimators", method, par_list)
            if self.params.rfd:
                domain_list = self.params.rfd.split("|")
                method = domain_list[0]
                par_list = [float(e) for e in domain_list[1].split(",")]
                self.space["rf_max_depth"] = build_hyperopt_search_domain("rf_max_depth", method, par_list)
            if self.params.rff:
                domain_list = self.params.rff.split("|")
                method = domain_list[0]
                par_list = [float(e) for e in domain_list[1].split(",")]
                self.space["rf_max_features"] = build_hyperopt_search_domain("rf_max_features", method, par_list)
        elif self.params.model_type == "NN":
            # build search domain for NN parameters
            if self.params.lr:
                domain_list = self.params.lr.split("|")
                method = domain_list[0]
                par_list = [float(e) for e in domain_list[1].split(",")]
                self.space["learning_rate"] = build_hyperopt_search_domain("learning_rate", method, par_list)
            # for layer sizes, use a different method if ls_ratio is provided
            if self.params.ls:
                domain_list = self.params.ls.split("|")
                method = domain_list[0]
                num_layer = int(domain_list[1])
                par_list = [float(e) for e in domain_list[2].split(",")]
                if not self.params.ls_ratio:
                    for i in range(num_layer):
                        self.space[f"ls{i}"] = build_hyperopt_search_domain(f"ls{i}", method, par_list)
                else:
                    self.space["ls"] = build_hyperopt_search_domain("ls", method, par_list)
                    domain_list = self.params.ls_ratio.split("|")
                    method = domain_list[0]
                    par_list = [float(e) for e in domain_list[-1].split(",")]
                    for i in range(1, num_layer):
                        self.space[f"ratio{i}"] = build_hyperopt_search_domain(f"ratio{i}", method, par_list)
            if self.params.dp:
                domain_list = self.params.dp.split("|")
                method = domain_list[0]
                num_layer = int(domain_list[1])
                par_list = [float(e) for e in domain_list[2].split(",")]
                for i in range(num_layer):
                    self.space[f"dp{i}"] = build_hyperopt_search_domain(f"dp{i}", method, par_list)
        elif self.params.model_type == "xgboost":
            # build search domain for XGBoost parameters
            if self.params.xgbg:
                domain_list = self.params.xgbg.split("|")
                method = domain_list[0]
                par_list = [float(e) for e in domain_list[1].split(",")]
                self.space["xgbg"] = build_hyperopt_search_domain("xgbg", method, par_list)
            if self.params.xgbl:
                domain_list = self.params.xgbl.split("|")
                method = domain_list[0]
                par_list = [float(e) for e in domain_list[1].split(",")]
                self.space["xgbl"] = build_hyperopt_search_domain("xgbl", method, par_list)
            if self.params.xgbd:
                domain_list = self.params.xgbd.split("|")
                method = domain_list[0]
                par_list = [float(e) for e in domain_list[1].split(",")]
                self.space["xgbd"] = build_hyperopt_search_domain("xgbd", method, par_list)
            if self.params.xgbc:
                domain_list = self.params.xgbc.split("|")
                method = domain_list[0]
                par_list = [float(e) for e in domain_list[1].split(",")]
                self.space["xgbc"] = build_hyperopt_search_domain("xgbc", method, par_list)
            if self.params.xgbs:
                domain_list = self.params.xgbs.split("|")
                method = domain_list[0]
                par_list = [float(e) for e in domain_list[1].split(",")]
                self.space["xgbs"] = build_hyperopt_search_domain("xgbs", method, par_list)
            if self.params.xgbn:
                domain_list = self.params.xgbn.split("|")
                method = domain_list[0]
                par_list = [float(e) for e in domain_list[1].split(",")]
                self.space["xgbn"] = build_hyperopt_search_domain("xgbn", method, par_list)
            if self.params.xgbw:
                domain_list = self.params.xgbw.split("|")
                method = domain_list[0]
                par_list = [float(e) for e in domain_list[1].split(",")]
                self.space["xgbw"] = build_hyperopt_search_domain("xgbw", method, par_list)
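# Illustrative config fragment (hypothetical paths and values, not part of the original module): a HyperOpt
# NN search as described in the class docstring above might be configured with JSON like
#
#     {
#         "search_type": "hyperopt",
#         "result_dir": "/path/of/the/final/dir,/path/of/the/temp/dir",
#         "model_type": "NN|50",
#         "lr": "loguniform|-13.8,-6.9",
#         "ls": "uniformint|3|8,512",
#         "dp": "uniform|3|0,0.4"
#     }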
def parse_params(param_list):
    """Parse parameters

    Parses parameters using parameter_parser.wrapper and filters out unnecessary parameters.

    Args:
        param_list: any single input of a str, dict, argparse.Namespace, or list

    Returns:
        argparse.Namespace
    """
    params = parse.wrapper(param_list)
    keep_params = {'prediction_type', 'model_type', 'featurizer', 'hyperparam_uuid', 'splitter',
                   'datastore', 'save_results', 'previously_featurized', 'previously_split',
                   'descriptor_key', 'descriptor_type', 'split_valid_frac', 'split_test_frac',
                   'split_uuid', 'bucket', 'lc_account', 'slurm_account', 'slurm_export',
                   'slurm_nodes', 'slurm_options', 'slurm_partition', 'slurm_time_limit'} | excluded_keys
    if params.search_type == 'hyperopt':
        # keep more parameters for the HyperOpt search
        keep_params = keep_params | {'lr', 'learning_rate', 'ls', 'layer_sizes', 'ls_ratio', 'dp', 'dropouts',
                                     'rfe', 'rf_estimators', 'rfd', 'rf_max_depth', 'rff', 'rf_max_features',
                                     'xgbg', 'xgb_gamma', 'xgbl', 'xgb_learning_rate',
                                     'xgbd', 'xgb_max_depth', 'xgbc', 'xgb_colsample_bytree',
                                     'xgbs', 'xgb_subsample', 'xgbn', 'xgb_n_estimators',
                                     'xgbw', 'xgb_min_child_weight', 'hp_checkpoint_load', 'hp_checkpoint_save'}
    params.__dict__ = parse.prune_defaults(params, keep_params=keep_params)
    return params
def main():
    """Entry point when the script is run

    Args:
        None

    Returns:
        None
    """
    params = parse_params(sys.argv[1:])
    hs = build_search(params)
    if params.split_only and params.datastore:
        hs.generate_split_shortlist()
    elif params.split_only and not params.datastore:
        hs.generate_split_shortlist_file()
    else:
        hs.run_search()
if __name__ == '__main__' and len(sys.argv) > 1:
    main()
    sys.exit(0)
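# Example invocation (hypothetical file name): the wrapper is normally driven by a JSON config file, e.g.
#     python hyperparam_search_wrapper.py --config_file my_search_config.json
# parse_params() prunes the parsed arguments down to the search-relevant set before the selected search
# class is built and the jobs are generated or submitted.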