Source code for utils.data_curation_functions

"""data_curation_functions.py

Extract Kevin's functions for curation of public datasets
Modify them to match Jonathan's curation methods in notebook
01/30/2020
"""

import os
import sys
import numpy as np
import pandas as pd
import pdb

from rdkit import Chem

from atomsci.ddm.utils.struct_utils import base_smiles_from_smiles, mols_from_smiles
import atomsci.ddm.utils.datastore_functions as dsf
from atomsci.ddm.utils import curate_data as curate
import atomsci.ddm.utils.struct_utils as struct_utils
import atomsci.ddm.utils.curate_data as curate_data
import imp

def set_data_root(dir):
    """Set global variables for data directories

    Creates paths for DTC and Excape given a root data directory, and sets the global
    variables 'data_root' and 'data_dirs'. 'data_root' is the root data directory.
    'data_dirs' is a dictionary that maps 'DTC' and 'Excape' to directories calculated
    from 'data_root'.

    Args:
        dir (str): Root data directory containing the folders 'dtc' and 'excape'.

    Returns:
        None
    """
    global data_root, data_dirs
    data_root = dir
    #data_dirs = dict(ChEMBL = '%s/ChEMBL' % data_root, DTC = '%s/DTC' % data_root,
    #                 Excape = '%s/Excape' % data_root)
    data_dirs = dict(DTC='%s/dtc' % data_root, Excape='%s/excape' % data_root)
log_var_map = {
    'IC50': 'pIC50',
    'AC50': 'pIC50',
    'Solubility': 'logSolubility',
    'CL': 'logCL'
}

pub_dsets = dict(
    CYP2D6=dict(IC50='cyp2d6'),
    CYP3A4=dict(IC50='cyp3a4'),
    JAK1=dict(IC50="jak1"),
    JAK2=dict(IC50="jak2"),
    JAK3=dict(IC50="jak3"),
)

# The following list includes the nonmetals commonly found in organic molecules, along with the alkali and alkaline
# earth metals commonly found in salts (Na, Mg, K, Ca).
organic_atomic_nums = [1, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 19, 20, 33, 34, 35, 53]

# ----------------------------------------------------------------------------------------------------------------------
# Generic functions for all datasets
# ----------------------------------------------------------------------------------------------------------------------
# Note: Functions freq_table and labeled_freq_table have been moved to the ddm.utils.curate_data module.
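# Illustrative sketch (not part of the original notebook workflow): how the module-level globals
# above are typically combined to locate a raw dataset file. The root path and directory layout
# are assumptions for illustration only.
def _example_dataset_path(target='JAK2', activity='IC50', db='DTC'):
    """Builds a hypothetical raw-data path for one of the public datasets listed in pub_dsets."""
    set_data_root('/path/to/data')            # assumed root; sets data_dirs['DTC'] and data_dirs['Excape']
    dset_name = pub_dsets[target][activity]   # e.g. 'jak2'
    log_var = log_var_map[activity]           # e.g. 'pIC50', the name of the log-transformed response
    return os.path.join(data_dirs[db], '%s.csv' % dset_name), log_var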
def is_organometallic(mol):
    """Returns True if the molecule is organometallic, i.e. if it contains any atom whose atomic
    number is not in organic_atomic_nums, or if mol is None (unparseable structure)."""
    if mol is None:
        return True
    for at in mol.GetAtoms():
        if not (at.GetAtomicNum() in organic_atomic_nums):
            return True
    return False
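# Illustrative sanity check (toy SMILES chosen by the editor, not from the curation notebooks):
# ethanol contains only organic atoms, while tetraethyltin contains Sn (atomic number 50), which
# is not in organic_atomic_nums.
def _example_is_organometallic():
    assert not is_organometallic(Chem.MolFromSmiles('CCO'))
    assert is_organometallic(Chem.MolFromSmiles('CC[Sn](CC)(CC)CC'))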
# ----------------------------------------------------------------------------------------------------------------------
def exclude_organometallics(df, smiles_col='rdkit_smiles'):
    """Filters data frame df based on column smiles_col to exclude organometallic compounds"""
    mols = mols_from_smiles(df[smiles_col].values.tolist(), workers=16)
    include = np.array([not is_organometallic(mol) for mol in mols])
    return df[include].copy()
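# Illustrative example on toy data (SMILES and values invented for the example): only the two
# purely organic rows survive the filter.
def _example_exclude_organometallics():
    toy_df = pd.DataFrame({
        'rdkit_smiles': ['CCO', 'c1ccccc1', 'CC[Sn](CC)(CC)CC'],
        'pIC50': [5.1, 6.2, 4.8],
    })
    return exclude_organometallics(toy_df, smiles_col='rdkit_smiles')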
# ----------------------------------------------------------------------------------------------------------------------
def standardize_relations(dset_df, db=None, rel_col=None, output_rel_col=None, invert=False):
    """Standardizes censoring operators

    Standardize the censoring operators to =, < or >, and remove any rows whose operators don't map
    to a standard one. There is a special case for db='ChEMBL' that strips the extra "'"s around
    relationship symbols. The default relation columns are 'standard_relation' for ChEMBL and DTC
    and 'activity_prefix' for GoStar.

    This function makes the following mappings: ">" to ">", ">=" to ">", "<" to "<", "<=" to "<",
    "~" to "=", and "=" to "=". All other relations are removed from the DataFrame.

    Args:
        dset_df (DataFrame): Input DataFrame. Must contain the relation column expected for the
            source database, or the column named by rel_col.
        db (str): Source database. Must be either 'GoStar', 'DTC' or 'ChEMBL'. Required if rel_col
            is not specified.
        rel_col (str): Column containing relational operators. If specified, overrides the default
            relation column for db.
        output_rel_col (str): If specified, put the standardized operators in a new column with this
            name and leave the original operator column unchanged.
        invert (bool): If true, replace the inequality operators with their inverses. This is useful
            when a reported value such as IC50 is converted to its negative log such as pIC50.

    Returns:
        DataFrame: DataFrame with the standardized relationship symbols
    """
    if rel_col is None:
        relation_cols = dict(ChEMBL='standard_relation', DTC='standard_relation', GoStar='activity_prefix')
        try:
            rel_col = relation_cols[db]
        except KeyError:
            raise ValueError(f"Unknown database {db} for standardize_relations")
    if output_rel_col is None:
        output_rel_col = rel_col
    try:
        dset_df[rel_col].fillna('=', inplace=True)
    except KeyError:
        raise ValueError(f"Dataset doesn't contain relation column {rel_col} expected for source database {db}")
    ops = dset_df[rel_col].values
    if db == 'ChEMBL':
        # Remove annoying quotes around operators
        ops = [op.lstrip("'").rstrip("'") for op in ops]
    op_dict = {
        ">=": ">",
        "<": "<",
        "<=": "<",
        ">": ">",
        ">R": ">",
        ">=R": ">",
        "<R": "<",
        "<=R": "<",
        "~": "=",
        "=": "="
    }
    ops = np.array([op_dict.get(op, "@") for op in ops])
    if invert:
        inv_op = {'>': '<', '<': '>'}
        ops = np.array([inv_op.get(op, op) for op in ops])
    dset_df[output_rel_col] = ops
    dset_df = dset_df[dset_df[output_rel_col] != "@"].copy()
    return dset_df
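# Illustrative example on toy DTC-style data (values invented): '>=' becomes '>', '~' becomes '=',
# the unmappable '>>' row is dropped, and with invert=True the remaining inequalities are flipped,
# as appropriate when the values are later converted to negative logs.
def _example_standardize_relations():
    toy_df = pd.DataFrame({
        'standard_relation': ['=', '>=', '<', '~', '>>'],
        'standard_value': [100.0, 10000.0, 5.0, 250.0, 1.0],
    })
    return standardize_relations(toy_df, db='DTC', output_rel_col='relation', invert=True)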
# ----------------------------------------------------------------------------------------------------------------------
# DTC-specific curation functions
# ----------------------------------------------------------------------------------------------------------------------
def upload_file_dtc_raw_data(dset_name, title, description, tags, functional_area,
                             target, target_type, activity, assay_category, file_path,
                             data_origin='journal', species='human', force_update=False):
    """Uploads raw DTC data to the datastore

    Upload a raw dataset to the datastore from the file at file_path. Returns the datastore OID of
    the uploaded dataset. The dataset is uploaded to the public bucket and lists
    https://doi.org/10.1016/j.chembiol.2017.11.009 as the DOI. This also assumes that the id_col is
    'compound_id'.

    Args:
        dset_name (str): Name of the dataset. Should not include a file extension.
        title (str): Title of the file (in human friendly format).
        description (str): Long text box to describe file (background/use notes).
        tags (list): Must be a list of strings.
        functional_area (str): The functional area.
        target (str): The target.
        target_type (str): The target type of the dataset.
        activity (str): The activity of the dataset.
        assay_category (str): The assay category of the dataset.
        file_path (str): The filepath of the dataset.
        data_origin (str): The origin of the dataset, e.g. journal.
        species (str): The species of the dataset, e.g. human, rat, dog.
        force_update (bool): Overwrite existing datasets in the datastore.

    Returns:
        str: Datastore OID of the uploaded dataset.
    """
    bucket = 'public'
    filename = '%s.csv' % dset_name
    dataset_key = 'dskey_' + filename
    kv = {
        'file_category': 'experimental',
        'activity': activity,
        'assay_category': assay_category,
        'assay_endpoint': 'multiple values',
        'curation_level': 'raw',
        'data_origin': data_origin,
        'functional_area': functional_area,
        'matrix': 'multiple values',
        'journal_doi': 'https://doi.org/10.1016/j.chembiol.2017.11.009',
        'sample_type': 'in_vitro',
        'species': species,
        'target': target,
        'target_type': target_type,
        'id_col': 'compound_id'
    }
    ds_client = dsf.config_client()
    if force_update or not dsf.dataset_key_exists(dataset_key, bucket, ds_client):
        uploaded_file = dsf.upload_file_to_DS(bucket=bucket, filepath=file_path, filename=filename, title=title,
                                              description=description, tags=tags, key_values=kv, client=None,
                                              dataset_key=dataset_key, override_check=False, return_metadata=True)
        print("Uploaded raw dataset with key %s" % dataset_key)
    else:
        uploaded_file = dsf.retrieve_dataset_by_datasetkey(dataset_key, bucket, ds_client, return_metadata=True)
        print("Raw dataset %s is already in datastore, skipping upload." % dataset_key)
    raw_dset_oid = uploaded_file['dataset_oid']
    return raw_dset_oid
def filter_dtc_data(orig_df, geneNames):
    """Extracts and post-processes JAK1, 2, and 3 datasets from DTC

    This is specific to the DTC database. Extract JAK1, 2 and 3 datasets from the Drug Target
    Commons database, filtered for data usability.

    Filter criteria:
        gene_names == JAK1 | JAK2 | JAK3
        InChI key not missing
        standard_type IC50
        units NM
        standard_relation mappable to =, < or >
        wildtype_or_mutant != 'mutated'
        valid SMILES
        maps to valid RDKit base SMILES
        standard_value not missing
        pIC50 > 3

    Args:
        orig_df (DataFrame): Input DataFrame. Must contain the following columns: gene_names,
            standard_inchi_key, standard_type, standard_units, standard_value, compound_id,
            wildtype_or_mutant.
        geneNames (list): A list of gene names to filter out of orig_df, e.g. ['JAK1', 'JAK2'].

    Returns:
        DataFrame: The filtered rows of orig_df
    """
    dset_df = orig_df[orig_df.gene_names.isin(geneNames) &
                      ~(orig_df.standard_inchi_key.isna()) &
                      (orig_df.standard_type == 'IC50') &
                      (orig_df.standard_units == 'NM') &
                      ~orig_df.standard_value.isna() &
                      ~orig_df.compound_id.isna() &
                      (orig_df.wildtype_or_mutant != 'mutated')]
    return dset_df
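# Illustrative example with toy rows mimicking the DTC columns this filter expects (values invented).
# Only the first row is kept: JAK1, non-missing InChI key, IC50 in nM, not a mutant assay.
def _example_filter_dtc_data():
    toy_df = pd.DataFrame({
        'gene_names': ['JAK1', 'JAK2', 'JAK1'],
        'standard_inchi_key': ['AAA', 'BBB', None],
        'standard_type': ['IC50', 'IC50', 'IC50'],
        'standard_units': ['NM', 'NM', 'NM'],
        'standard_value': [120.0, 45.0, 10.0],
        'compound_id': ['c1', 'c2', 'c3'],
        'wildtype_or_mutant': ['wildtype', 'mutated', 'wildtype'],
    })
    return filter_dtc_data(toy_df, ['JAK1'])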
def ic50topic50(x):
    """Calculates pIC50 from IC50

    Args:
        x (float): An IC50 in nanomolar (nM) units.

    Returns:
        float: The pIC50, i.e. -log10 of the IC50 converted to molar units.
    """
    return -np.log10(x / 1000000000.0)
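# Worked check (added for illustration): an IC50 of 100 nM is 1e-7 M, so its pIC50 is 7.
def _example_ic50topic50():
    assert abs(ic50topic50(100.0) - 7.0) < 1e-9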
def compute_negative_log_responses(df, unit_col='unit', value_col='value', new_value_col='average_col',
                                   relation_col=None, new_relation_col=None,
                                   unit_conv={'uM': lambda x: x*1e-6, 'nM': lambda x: x*1e-9},
                                   inplace=False):
    """Given the response values in `value_col` (IC50, Ki, Kd, etc.), compute their negative base 10
    logarithms (pIC50, pKi, pKd, etc.) after converting them to molar units, and store them in
    `new_value_col`. If `relation_col` is provided, replace any '<' or '>' relations with their
    opposites and store the result in `new_relation_col` (if provided), or in `relation_col` if not.
    Rows where the original value is 0 or negative will be dropped from the dataset.

    Args:
        df (DataFrame): A DataFrame that contains `value_col`, `unit_col` and `relation_col`.
        unit_conv (dict): A dictionary mapping concentration units found in `unit_col` to functions
            that convert the corresponding concentrations to molar. The default handles micromolar
            and nanomolar units, represented as 'uM' and 'nM' respectively.
        unit_col (str): Column containing units.
        value_col (str): Column containing input values.
        new_value_col (str): Column to receive converted values.
        relation_col (str): Column containing relational operators for censored data.
        new_relation_col (str): Column to receive inverted relations applicable to the negative log
            transformed values.
        inplace (bool): If True, the input DataFrame is modified in place when possible. The default
            is to return a copy.

    Returns:
        DataFrame: A table containing the transformed values and relations.
    """
    missing_units = list(set(df[unit_col]) - set(unit_conv.keys()))
    assert len(missing_units) == 0, f"unit_conv lacks converter(s) for units {', '.join(missing_units)}"

    # Drop rows for which log can't be computed
    if np.any(df[value_col].values <= 0.0):
        df = df[df[value_col] > 0.0].copy()
    elif not inplace:
        df = df.copy()

    new_vals = []
    new_relations = []
    inverse_rel = str.maketrans('<>', '><')
    for i, row in df.iterrows():
        ic50 = row[value_col]
        pic50 = -np.log10(unit_conv[row[unit_col]](ic50))
        new_vals.append(pic50)
        if relation_col is not None:
            rel = row[relation_col]
            if isinstance(rel, str):
                rel = rel.translate(inverse_rel)
            new_relations.append(rel)
    df[new_value_col] = new_vals
    if relation_col is not None:
        if new_relation_col is None:
            df[relation_col] = new_relations
        else:
            df[new_relation_col] = new_relations
    return df
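# Illustrative example on toy data (values invented): a 1 uM IC50 becomes pIC50 6, and a censored
# '> 10000 nM' row becomes '< 5' after the log transform.
def _example_compute_negative_log_responses():
    toy_df = pd.DataFrame({
        'value': [1.0, 10000.0],
        'unit': ['uM', 'nM'],
        'relation': ['=', '>'],
    })
    return compute_negative_log_responses(toy_df, unit_col='unit', value_col='value',
                                          new_value_col='pIC50', relation_col='relation',
                                          new_relation_col='pIC50_relation')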
def convert_IC50_to_pIC50(df, unit_col='unit', value_col='value', new_value_col='average_col',
                          relation_col=None, new_relation_col=None,
                          unit_conv={'uM': lambda x: x*1e-6, 'nM': lambda x: x*1e-9},
                          inplace=False):
    """For backward compatibility only: equivalent to calling `compute_negative_log_responses` with
    the same arguments."""
    return compute_negative_log_responses(df, unit_col=unit_col, value_col=value_col,
                                          new_value_col=new_value_col, relation_col=relation_col,
                                          new_relation_col=new_relation_col, unit_conv=unit_conv,
                                          inplace=inplace)
def down_select(df, kv_lst):
    """Filters rows given a set of column/value pairs

    Given a DataFrame and a list of (column, value) tuples, this function keeps only the rows where
    df[k] == v for every pair (k, v) in kv_lst.

    Args:
        df (DataFrame): An input DataFrame.
        kv_lst (list): A list of (column, value) tuples.

    Returns:
        DataFrame: Rows where all df[k] == v
    """
    for k, v in kv_lst:
        df = df[df[k] == v]
    return df
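# Illustrative example (toy data): select the uncensored IC50 rows for one gene, mirroring how
# get_smiles_dtc_data below builds its selection lists.
def _example_down_select():
    toy_df = pd.DataFrame({
        'gene_names': ['JAK1', 'JAK1', 'JAK2'],
        'standard_type': ['IC50', 'Ki', 'IC50'],
        'standard_relation': ['=', '=', '='],
    })
    return down_select(toy_df, [('gene_names', 'JAK1'), ('standard_type', 'IC50'), ('standard_relation', '=')])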
def get_smiles_dtc_data(nm_df, targ_lst, save_smiles_df):
    """Returns SMILES strings from DTC data

    nm_df must be a DataFrame from DTC with the following columns: gene_names, standard_type,
    standard_value, standard_inchi_key, and standard_relation. This function selects all rows where
    nm_df['gene_names'] is in targ_lst, nm_df['standard_type']=='IC50', and 'standard_value' > 0
    (both censored and uncensored rows are retained; counts for '='-only rows are printed). Then
    pIC50 values are calculated and added to the 'PIC50' column, and SMILES strings are merged in
    from save_smiles_df.

    Args:
        nm_df (DataFrame): Input DataFrame.
        targ_lst (list): A list of targets.
        save_smiles_df (DataFrame): A DataFrame with the column 'standard_inchi_key'.

    Returns:
        list, list: A list of per-target DataFrames with SMILES merged in, and the set of InChI keys
        shared between targets.
    """
    save_df = {}
    for targ in targ_lst:
        lst1 = [('gene_names', targ), ('standard_type', 'IC50'), ('standard_relation', '=')]
        lst1_tmp = [('gene_names', targ), ('standard_type', 'IC50')]
        jak1_df = down_select(nm_df, lst1)
        jak1_df_tmp = down_select(nm_df, lst1_tmp)
        print(targ, "distinct compounds = only", jak1_df['standard_inchi_key'].nunique())
        print(targ, "distinct compounds <,>,=", jak1_df_tmp['standard_inchi_key'].nunique())
        # We convert to log values, so make sure there are no 0 values
        save_df[targ] = jak1_df_tmp[jak1_df_tmp['standard_value'] > 0]

    prev_targ = targ_lst[0]
    shared_inchi_keys = save_df[prev_targ]['standard_inchi_key']
    for it in range(1, len(targ_lst), 1):
        curr_targ = targ_lst[it]
        df = save_df[curr_targ]
        shared_inchi_keys = df[df['standard_inchi_key'].isin(shared_inchi_keys)]['standard_inchi_key']
    print("num shared compounds", shared_inchi_keys.nunique())

    lst = []
    for targ in targ_lst:
        df = save_df[targ]
        lst.append(df[df['standard_inchi_key'].isin(shared_inchi_keys)])
    shared_df = pd.concat(lst)

    # Add pIC50 values
    print('Add pIC50 values.')
    print(shared_df['standard_value'])
    shared_df['PIC50'] = shared_df['standard_value'].apply(ic50topic50)

    # Merge in SMILES strings
    print('Merge in SMILES strings.')
    smiles_lst = []
    for targ in targ_lst:
        df = save_df[targ]
        df['PIC50'] = df['standard_value'].apply(ic50topic50)
        smiles_df = df.merge(save_smiles_df, on='standard_inchi_key', suffixes=('_' + targ, '_'))
        # The file puts the SMILES strings in quotes, which need to be removed
        smiles_df['smiles'] = smiles_df['smiles'].str.replace('"', '')
        smiles_df['rdkit_smiles'] = smiles_df['smiles'].apply(struct_utils.base_smiles_from_smiles)
        print(smiles_df.shape)
        print(smiles_df['standard_inchi_key'].nunique())
        smiles_lst.append(smiles_df)

    return smiles_lst, shared_inchi_keys
def get_smiles_4dtc_data(nm_df, targ_lst, save_smiles_df):
    """Returns SMILES strings from DTC data

    nm_df must be a DataFrame from DTC with the following columns: gene_names, standard_type,
    standard_value, standard_inchi_key, and standard_relation. This function selects all rows where
    nm_df['gene_names'] is in targ_lst and nm_df['standard_type']=='IC50' (both censored and
    uncensored rows are retained; counts for '='-only rows are printed). Then pIC50 values are
    calculated and added to the 'PIC50' column, and SMILES strings are merged in from save_smiles_df.

    Args:
        nm_df (DataFrame): Input DataFrame.
        targ_lst (list): A list of targets.
        save_smiles_df (DataFrame): A DataFrame with the column 'standard_inchi_key'.

    Returns:
        list, list, str: A list of per-target DataFrames with SMILES merged in, the set of InChI
        keys shared between targets, and a text description of the targets.
    """
    save_df = {}
    description_str = ""
    for targ in targ_lst:
        lst1 = [('gene_names', targ), ('standard_type', 'IC50'), ('standard_relation', '=')]
        lst1_tmp = [('gene_names', targ), ('standard_type', 'IC50')]
        jak1_df = down_select(nm_df, lst1)
        jak1_df_tmp = down_select(nm_df, lst1_tmp)
        print(targ, "distinct compounds = only", jak1_df['standard_inchi_key'].nunique())
        print(targ, "distinct compounds <,>,=", jak1_df_tmp['standard_inchi_key'].nunique())
        description = '''
# ''' + targ + " distinct compounds = only: " + str(jak1_df['standard_inchi_key'].nunique()) + '''
# ''' + targ + " distinct compounds <,>,=: " + str(jak1_df_tmp['standard_inchi_key'].nunique())
        description_str += description
        # To ignore censored data, use jak1_df here instead; jak1_df_tmp includes censored data
        save_df[targ] = jak1_df_tmp

    prev_targ = targ_lst[0]
    shared_inchi_keys = save_df[prev_targ]['standard_inchi_key']
    for it in range(1, len(targ_lst), 1):
        curr_targ = targ_lst[it]
        df = save_df[curr_targ]
        shared_inchi_keys = df[df['standard_inchi_key'].isin(shared_inchi_keys)]['standard_inchi_key']
    print("num shared compounds", shared_inchi_keys.nunique())

    lst = []
    for targ in targ_lst:
        df = save_df[targ]
        lst.append(df[df['standard_inchi_key'].isin(shared_inchi_keys)])
    shared_df = pd.concat(lst)

    # Add pIC50 values
    print('Add pIC50 values.')
    shared_df['PIC50'] = shared_df['standard_value'].apply(ic50topic50)

    # Merge in SMILES strings
    print('Merge in SMILES strings.')
    smiles_lst = []
    for targ in targ_lst:
        df = save_df[targ]
        df['PIC50'] = df['standard_value'].apply(ic50topic50)
        smiles_df = df.merge(save_smiles_df, on='standard_inchi_key', suffixes=('_' + targ, '_'))
        # The file puts the SMILES strings in quotes, which need to be removed
        smiles_df['smiles'] = smiles_df['smiles'].str.replace('"', '')
        smiles_df['rdkit_smiles'] = smiles_df['smiles'].apply(struct_utils.base_smiles_from_smiles)
        print("Shape of dataframe:", smiles_df.shape)
        print("Number of unique standard_inchi_key:", smiles_df['standard_inchi_key'].nunique())
        smiles_lst.append(smiles_df)

    return smiles_lst, shared_inchi_keys, description_str
def upload_df_dtc_smiles(dset_name, title, description, tags, functional_area,
                         target, target_type, activity, assay_category, smiles_df, orig_fileID,
                         data_origin='journal', species='human', force_update=False):
    """Uploads DTC SMILES data to the datastore

    Upload a raw dataset to the datastore from the given DataFrame. Returns the datastore OID of
    the uploaded dataset. The dataset is uploaded to the public bucket and lists
    https://doi.org/10.1016/j.chembiol.2017.11.009 as the DOI. This also assumes that the id_col is
    'compound_id'.

    Args:
        dset_name (str): Name of the dataset. Should not include a file extension.
        title (str): Title of the file (in human friendly format).
        description (str): Long text box to describe file (background/use notes).
        tags (list): Must be a list of strings.
        functional_area (str): The functional area.
        target (str): The target.
        target_type (str): The target type of the dataset.
        activity (str): The activity of the dataset.
        assay_category (str): The assay category of the dataset.
        smiles_df (DataFrame): DataFrame containing SMILES to be uploaded.
        orig_fileID (str): Source file id used to generate smiles_df.
        data_origin (str): The origin of the dataset, e.g. journal.
        species (str): The species of the dataset, e.g. human, rat, dog.
        force_update (bool): Overwrite existing datasets in the datastore.

    Returns:
        str: Datastore OID of the uploaded dataset.
    """
    bucket = 'public'
    filename = '%s_dtc_smiles.csv' % dset_name
    dataset_key = 'dskey_' + filename
    kv = {
        'file_category': 'experimental',
        'activity': activity,
        'assay_category': assay_category,   ## seems like this should be called 'kinase_activity'
        'assay_endpoint': 'pic50',
        'curation_level': 'raw',
        'data_origin': data_origin,
        'functional_area': functional_area,
        'matrix': 'multiple values',
        'journal_doi': 'https://doi.org/10.1016/j.chembiol.2017.11.009',
        'sample_type': 'in_vitro',
        'species': species,
        'target': target,
        'target_type': target_type,
        'id_col': 'compound_id',
        'source_file_id': orig_fileID
    }
    ds_client = dsf.config_client()
    if force_update or not dsf.dataset_key_exists(dataset_key, bucket, ds_client):
        uploaded_file = dsf.upload_df_to_DS(bucket=bucket, filename=filename, df=smiles_df, title=title,
                                            description=description, tags=tags, key_values=kv, client=None,
                                            dataset_key=dataset_key, override_check=False, return_metadata=True)
        print("Uploaded raw dataset with key %s" % dataset_key)
    else:
        uploaded_file = dsf.retrieve_dataset_by_datasetkey(dataset_key, bucket, ds_client, return_metadata=True)
        print("Raw dataset %s is already in datastore, skipping upload." % dataset_key)
    raw_dset_oid = uploaded_file['dataset_oid']
    return raw_dset_oid
def atom_curation(targ_lst, smiles_lst, shared_inchi_keys):
    """Apply the ATOM standard 'curation' step to the per-target data (the 'mleqonly' step):
    average replicate assays, remove duplicates, and drop cases with large variance between
    replicates.

    Args:
        targ_lst (list): A list of targets.
        smiles_lst (list): A list of DataFrames. These DataFrames must contain the columns
            gene_names, standard_type, standard_relation, standard_inchi_key, PIC50, and rdkit_smiles.
        shared_inchi_keys (list): A list of InChI keys used in this dataset.

    Returns:
        list, list: A list of curated DataFrames and a list of the number of compounds dropped
        during the curation process for each target.
    """
    imp.reload(curate_data)
    tolerance = 10
    column = 'PIC50'   # 'standard_value'
    list_bad_duplicates = 'No'
    max_std = 1
    curated_lst = []
    num_dropped_lst = []
    for it in range(len(targ_lst)):
        data = smiles_lst[it]
        data = data[data.standard_relation.str.strip() == '=']
        print("gene_names", data.gene_names.unique())
        print("standard_type", data.standard_type.unique())
        print("standard_relation", data.standard_relation.unique())
        print("before", data.shape)
        curated_df = curate_data.average_and_remove_duplicates(
            column, tolerance, list_bad_duplicates, data, max_std,
            compound_id='standard_inchi_key', smiles_col='rdkit_smiles')
        # (Yaru) Remove inf in curated_df
        curated_df = curated_df[~curated_df.isin([np.inf]).any(axis=1)]
        # (Yaru) Remove NaN rdkit_smiles
        curated_df = curated_df.dropna(subset=['rdkit_smiles'])
        curated_lst.append(curated_df)
        prev_cmpd_cnt = shared_inchi_keys.nunique()
        num_dropped = prev_cmpd_cnt - curated_df.shape[0]
        num_dropped_lst.append(num_dropped)
        print("After", curated_df.shape, "# of dropped compounds", num_dropped)
    return curated_lst, num_dropped_lst
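# Illustrative call chain (assumes `dtc_df` and `save_smiles_df` were loaded elsewhere from the DTC
# raw file and its SMILES lookup table; shown as comments because it is not runnable without them):
#
#   targ_lst = ['JAK1', 'JAK2', 'JAK3']
#   smiles_lst, shared_inchi_keys = get_smiles_dtc_data(dtc_df, targ_lst, save_smiles_df)
#   curated_lst, num_dropped_lst = atom_curation(targ_lst, smiles_lst, shared_inchi_keys)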
def upload_df_dtc_mleqonly(dset_name, title, description, tags, functional_area,
                           target, target_type, activity, assay_category, data_df, dtc_smiles_fileID,
                           data_origin='journal', species='human', force_update=False):
    """Uploads DTC mleqonly data to the datastore

    Upload mleqonly data to the datastore from the given DataFrame. The DataFrame must contain the
    columns 'rdkit_smiles' and 'VALUE_NUM_mean'. This function is meant to upload data that has been
    aggregated using atomsci.ddm.utils.curate_data.average_and_remove_duplicates. Returns the
    datastore OID of the uploaded dataset. The dataset is uploaded to the public bucket and lists
    https://doi.org/10.1016/j.chembiol.2017.11.009 as the DOI. This also assumes that the id_col is
    'compound_id'.

    Args:
        dset_name (str): Name of the dataset. Should not include a file extension.
        title (str): Title of the file (in human friendly format).
        description (str): Long text box to describe file (background/use notes).
        tags (list): Must be a list of strings.
        functional_area (str): The functional area.
        target (str): The target.
        target_type (str): The target type of the dataset.
        activity (str): The activity of the dataset.
        assay_category (str): The assay category of the dataset.
        data_df (DataFrame): DataFrame to be uploaded.
        dtc_smiles_fileID (str): Source file id used to generate data_df.
        data_origin (str): The origin of the dataset, e.g. journal.
        species (str): The species of the dataset, e.g. human, rat, dog.
        force_update (bool): Overwrite existing datasets in the datastore.

    Returns:
        str: Datastore OID of the uploaded dataset.
    """
    bucket = 'public'
    filename = '%s_dtc_mleqonly.csv' % dset_name
    dataset_key = 'dskey_' + filename
    kv = {
        'file_category': 'experimental',
        'activity': activity,
        'assay_category': assay_category,   ## seems like this should be called 'kinase_activity'
        'assay_endpoint': 'pic50',
        'curation_level': 'ml_ready',
        'data_origin': data_origin,
        'functional_area': functional_area,
        'matrix': 'multiple values',
        'journal_doi': 'https://doi.org/10.1016/j.chembiol.2017.11.009',
        'sample_type': 'in_vitro',
        'species': species,
        'target': target,
        'target_type': target_type,
        'id_col': 'compound_id',
        'response_col': 'VALUE_NUM_mean',
        'prediction_type': 'regression',
        'smiles_col': 'rdkit_smiles',
        'units': 'unitless',
        'source_file_id': dtc_smiles_fileID
    }
    ds_client = dsf.config_client()
    if force_update or not dsf.dataset_key_exists(dataset_key, bucket, ds_client):
        uploaded_file = dsf.upload_df_to_DS(bucket=bucket, filename=filename, df=data_df, title=title,
                                            description=description, tags=tags, key_values=kv, client=None,
                                            dataset_key=dataset_key, override_check=False, return_metadata=True)
        print("Uploaded raw dataset with key %s" % dataset_key)
    else:
        uploaded_file = dsf.retrieve_dataset_by_datasetkey(dataset_key, bucket, ds_client, return_metadata=True)
        print("Raw dataset %s is already in datastore, skipping upload." % dataset_key)
    raw_dset_oid = uploaded_file['dataset_oid']
    return raw_dset_oid
def upload_df_dtc_mleqonly_class(dset_name, title, description, tags, functional_area,
                                 target, target_type, activity, assay_category, data_df, dtc_mleqonly_fileID,
                                 data_origin='journal', species='human', force_update=False):
    """Uploads DTC mleqonly classification data to the datastore

    Upload mleqonly classification data to the datastore from the given DataFrame. The DataFrame
    must contain the columns 'rdkit_smiles' and 'binary_class'. This function is meant to upload
    data that has been aggregated using atomsci.ddm.utils.curate_data.average_and_remove_duplicates
    and then thresholded to make a binary classification dataset. Returns the datastore OID of the
    uploaded dataset. The dataset is uploaded to the public bucket and lists
    https://doi.org/10.1016/j.chembiol.2017.11.009 as the DOI. This also assumes that the id_col is
    'compound_id'.

    Args:
        dset_name (str): Name of the dataset. Should not include a file extension.
        title (str): Title of the file (in human friendly format).
        description (str): Long text box to describe file (background/use notes).
        tags (list): Must be a list of strings.
        functional_area (str): The functional area.
        target (str): The target.
        target_type (str): The target type of the dataset.
        activity (str): The activity of the dataset.
        assay_category (str): The assay category of the dataset.
        data_df (DataFrame): DataFrame to be uploaded.
        dtc_mleqonly_fileID (str): Source file id used to generate data_df.
        data_origin (str): The origin of the dataset, e.g. journal.
        species (str): The species of the dataset, e.g. human, rat, dog.
        force_update (bool): Overwrite existing datasets in the datastore.

    Returns:
        str: Datastore OID of the uploaded dataset.
    """
    bucket = 'public'
    filename = '%s_dtc_mleqonly_class.csv' % dset_name
    dataset_key = 'dskey_' + filename
    kv = {
        'file_category': 'experimental',
        'activity': activity,
        'assay_category': assay_category,   ## seems like this should be called 'kinase_activity'
        'assay_endpoint': 'pic50',
        'curation_level': 'ml_ready',
        'data_origin': data_origin,
        'functional_area': functional_area,
        'matrix': 'multiple values',
        'journal_doi': 'https://doi.org/10.1016/j.chembiol.2017.11.009',
        'sample_type': 'in_vitro',
        'species': species,
        'target': target,
        'target_type': target_type,
        'id_col': 'compound_id',
        'response_col': 'binary_class',
        'prediction_type': 'classification',
        'num_classes': 2,
        'class_names': ['inactive', 'active'],
        'smiles_col': 'rdkit_smiles',
        'units': 'unitless',
        'source_file_id': dtc_mleqonly_fileID
    }
    ds_client = dsf.config_client()
    if force_update or not dsf.dataset_key_exists(dataset_key, bucket, ds_client):
        uploaded_file = dsf.upload_df_to_DS(bucket=bucket, filename=filename, df=data_df, title=title,
                                            description=description, tags=tags, key_values=kv, client=None,
                                            dataset_key=dataset_key, override_check=False, return_metadata=True)
        print("Uploaded raw dataset with key %s" % dataset_key)
    else:
        uploaded_file = dsf.retrieve_dataset_by_datasetkey(dataset_key, bucket, ds_client, return_metadata=True)
        print("Raw dataset %s is already in datastore, skipping upload." % dataset_key)
    raw_dset_oid = uploaded_file['dataset_oid']
    return raw_dset_oid
def upload_df_dtc_base_smiles_all(dset_name, title, description, tags, functional_area,
                                  target, target_type, activity, assay_category, data_df, dtc_mleqonly_fileID,
                                  data_origin='journal', species='human', force_update=False):
    """Uploads DTC base SMILES data to the datastore

    Uploads base SMILES strings for the DTC dataset. Returns the datastore OID of the uploaded
    dataset. The dataset is uploaded to the public bucket and lists
    https://doi.org/10.1016/j.chembiol.2017.11.009 as the DOI. This also assumes that the id_col is
    'compound_id', the response column is set to PIC50, and the SMILES are assumed to be in
    'base_rdkit_smiles'.

    Args:
        dset_name (str): Name of the dataset. Should not include a file extension.
        title (str): Title of the file (in human friendly format).
        description (str): Long text box to describe file (background/use notes).
        tags (list): Must be a list of strings.
        functional_area (str): The functional area.
        target (str): The target.
        target_type (str): The target type of the dataset.
        activity (str): The activity of the dataset.
        assay_category (str): The assay category of the dataset.
        data_df (DataFrame): DataFrame to be uploaded.
        dtc_mleqonly_fileID (str): Source file id used to generate data_df.
        data_origin (str): The origin of the dataset, e.g. journal.
        species (str): The species of the dataset, e.g. human, rat, dog.
        force_update (bool): Overwrite existing datasets in the datastore.

    Returns:
        str: Datastore OID of the uploaded dataset.
    """
    bucket = 'public'
    filename = '%s_dtc_base_smiles_all.csv' % dset_name
    dataset_key = 'dskey_' + filename
    kv = {
        'file_category': 'experimental',
        'activity': activity,
        'assay_category': assay_category,   ## seems like this should be called 'kinase_activity'
        'assay_endpoint': 'pic50',
        'curation_level': 'ml_ready',
        'data_origin': data_origin,
        'functional_area': functional_area,
        'matrix': 'multiple values',
        'journal_doi': 'https://doi.org/10.1016/j.chembiol.2017.11.009',
        'sample_type': 'in_vitro',
        'species': species,
        'target': target,
        'target_type': target_type,
        'id_col': 'compound_id',
        'response_col': 'PIC50',
        'prediction_type': 'regression',
        'smiles_col': 'base_rdkit_smiles',
        'units': 'unitless',
        'source_file_id': dtc_mleqonly_fileID
    }
    ds_client = dsf.config_client()
    if force_update or not dsf.dataset_key_exists(dataset_key, bucket, ds_client):
        uploaded_file = dsf.upload_df_to_DS(bucket=bucket, filename=filename, df=data_df, title=title,
                                            description=description, tags=tags, key_values=kv, client=None,
                                            dataset_key=dataset_key, override_check=False, return_metadata=True)
        print("Uploaded raw dataset with key %s" % dataset_key)
    else:
        uploaded_file = dsf.retrieve_dataset_by_datasetkey(dataset_key, bucket, ds_client, return_metadata=True)
        print("Raw dataset %s is already in datastore, skipping upload." % dataset_key)
    raw_dset_oid = uploaded_file['dataset_oid']
    return raw_dset_oid
def upload_file_dtc_smiles_regr_all(dset_name, title, description, tags, functional_area,
                                    target, target_type, activity, assay_category, file_path, dtc_smiles_fileID,
                                    smiles_column, data_origin='journal', species='human', force_update=False):
    """Uploads regression DTC data to the datastore

    Uploads a regression-ready dataset for the DTC data. Returns the datastore OID of the uploaded
    dataset. The dataset is uploaded to the public bucket and lists
    https://doi.org/10.1016/j.chembiol.2017.11.009 as the DOI. This also assumes that the id_col is
    'compound_id' and the response column is set to PIC50.

    Args:
        dset_name (str): Name of the dataset. Should not include a file extension.
        title (str): Title of the file (in human friendly format).
        description (str): Long text box to describe file (background/use notes).
        tags (list): Must be a list of strings.
        functional_area (str): The functional area.
        target (str): The target.
        target_type (str): The target type of the dataset.
        activity (str): The activity of the dataset.
        assay_category (str): The assay category of the dataset.
        file_path (str): The filepath of the dataset to be uploaded.
        dtc_smiles_fileID (str): Source file id used to generate the dataset.
        smiles_column (str): Column containing SMILES.
        data_origin (str): The origin of the dataset, e.g. journal.
        species (str): The species of the dataset, e.g. human, rat, dog.
        force_update (bool): Overwrite existing datasets in the datastore.

    Returns:
        str: Datastore OID of the uploaded dataset.
    """
    bucket = 'public'
    filename = '%s_dtc_smiles_regr_all.csv' % dset_name
    dataset_key = 'dskey_' + filename
    kv = {
        'file_category': 'experimental',
        'activity': activity,
        'assay_category': assay_category,   ## seems like this should be called 'kinase_activity'
        'assay_endpoint': 'pic50',
        'curation_level': 'ml_ready',
        'data_origin': data_origin,
        'functional_area': functional_area,
        'matrix': 'multiple values',
        'journal_doi': 'https://doi.org/10.1016/j.chembiol.2017.11.009',
        'sample_type': 'in_vitro',
        'species': species,
        'target': target,
        'target_type': target_type,
        'id_col': 'compound_id',
        'response_col': 'PIC50',
        'prediction_type': 'regression',
        'smiles_col': smiles_column,
        'units': 'unitless',
        'source_file_id': dtc_smiles_fileID
    }
    ds_client = dsf.config_client()
    if force_update or not dsf.dataset_key_exists(dataset_key, bucket, ds_client):
        uploaded_file = dsf.upload_file_to_DS(bucket=bucket, filepath=file_path, filename=filename, title=title,
                                              description=description, tags=tags, key_values=kv, client=None,
                                              dataset_key=dataset_key, override_check=False, return_metadata=True)
        print("Uploaded raw dataset with key %s" % dataset_key)
    else:
        uploaded_file = dsf.retrieve_dataset_by_datasetkey(dataset_key, bucket, ds_client, return_metadata=True)
        print("Raw dataset %s is already in datastore, skipping upload." % dataset_key)
    raw_dset_oid = uploaded_file['dataset_oid']
    return raw_dset_oid
def upload_df_dtc_smiles_regr_all_class(dset_name, title, description, tags, functional_area,
                                        target, target_type, activity, assay_category, data_df,
                                        dtc_smiles_regr_all_fileID, smiles_column,
                                        data_origin='journal', species='human', force_update=False):
    """Uploads DTC classification data to the datastore

    Uploads binary classification data for the DTC dataset. Class names are assumed to be 'active'
    and 'inactive'. Returns the datastore OID of the uploaded dataset. The dataset is uploaded to
    the public bucket and lists https://doi.org/10.1016/j.chembiol.2017.11.009 as the DOI. This also
    assumes that the id_col is 'compound_id' and the response column is set to PIC50.

    Args:
        dset_name (str): Name of the dataset. Should not include a file extension.
        title (str): Title of the file (in human friendly format).
        description (str): Long text box to describe file (background/use notes).
        tags (list): Must be a list of strings.
        functional_area (str): The functional area.
        target (str): The target.
        target_type (str): The target type of the dataset.
        activity (str): The activity of the dataset.
        assay_category (str): The assay category of the dataset.
        data_df (DataFrame): DataFrame to be uploaded.
        dtc_smiles_regr_all_fileID (str): Source file id used to generate data_df.
        smiles_column (str): Column containing SMILES.
        data_origin (str): The origin of the dataset, e.g. journal.
        species (str): The species of the dataset, e.g. human, rat, dog.
        force_update (bool): Overwrite existing datasets in the datastore.

    Returns:
        str: Datastore OID of the uploaded dataset.
    """
    bucket = 'public'
    filename = '%s_dtc_smiles_regr_all_class.csv' % dset_name
    dataset_key = 'dskey_' + filename
    kv = {
        'file_category': 'experimental',
        'activity': activity,
        'assay_category': assay_category,   ## seems like this should be called 'kinase_activity'
        'assay_endpoint': 'pic50',
        'curation_level': 'ml_ready',
        'data_origin': data_origin,
        'functional_area': functional_area,
        'matrix': 'multiple values',
        'journal_doi': 'https://doi.org/10.1016/j.chembiol.2017.11.009',
        'sample_type': 'in_vitro',
        'species': species,
        'target': target,
        'target_type': target_type,
        'id_col': 'compound_id',
        'response_col': 'PIC50',
        'prediction_type': 'classification',
        'num_classes': 2,
        'smiles_col': smiles_column,
        'class_names': ['inactive', 'active'],
        'units': 'unitless',
        'source_file_id': dtc_smiles_regr_all_fileID
    }
    ds_client = dsf.config_client()
    if force_update or not dsf.dataset_key_exists(dataset_key, bucket, ds_client):
        uploaded_file = dsf.upload_df_to_DS(bucket=bucket, filename=filename, df=data_df, title=title,
                                            description=description, tags=tags, key_values=kv, client=None,
                                            dataset_key=dataset_key, override_check=False, return_metadata=True)
        print("Uploaded raw dataset with key %s" % dataset_key)
    else:
        uploaded_file = dsf.retrieve_dataset_by_datasetkey(dataset_key, bucket, ds_client, return_metadata=True)
        print("Raw dataset %s is already in datastore, skipping upload." % dataset_key)
    raw_dset_oid = uploaded_file['dataset_oid']
    return raw_dset_oid
# ----------------------------------------------------------------------------------------------------------------------
# Excape-specific curation functions
# ----------------------------------------------------------------------------------------------------------------------
def upload_file_excape_raw_data(dset_name, title, description, tags, functional_area,
                                target, target_type, activity, assay_category, file_path,
                                data_origin='journal', species='human', force_update=False):
    """Uploads raw Excape data to the datastore

    Upload a raw dataset to the datastore from the file at file_path. Returns the datastore OID of
    the uploaded dataset. The dataset is uploaded to the public bucket and lists
    https://dx.doi.org/10.1186%2Fs13321-017-0203-5 as the DOI. This also assumes that the id_col is
    'Original_Entry_ID'.

    Args:
        dset_name (str): Name of the dataset. Should not include a file extension.
        title (str): Title of the file (in human friendly format).
        description (str): Long text box to describe file (background/use notes).
        tags (list): Must be a list of strings.
        functional_area (str): The functional area.
        target (str): The target.
        target_type (str): The target type of the dataset.
        activity (str): The activity of the dataset.
        assay_category (str): The assay category of the dataset.
        file_path (str): The filepath of the dataset.
        data_origin (str): The origin of the dataset, e.g. journal.
        species (str): The species of the dataset, e.g. human, rat, dog.
        force_update (bool): Overwrite existing datasets in the datastore.

    Returns:
        str: Datastore OID of the uploaded dataset.
    """
    bucket = 'public'
    filename = '%s_excape.csv' % dset_name
    dataset_key = 'dskey_' + filename
    kv = {
        'file_category': 'experimental',
        'activity': activity,
        'assay_category': assay_category,
        'assay_endpoint': 'multiple values',
        'curation_level': 'raw',
        'data_origin': data_origin,
        'functional_area': functional_area,
        'matrix': 'multiple values',
        'journal_doi': 'https://dx.doi.org/10.1186%2Fs13321-017-0203-5',   # ExCAPE-DB
        'sample_type': 'in_vitro',
        'species': species,
        'target': target,
        'target_type': target_type,
        'id_col': 'Original_Entry_ID'
    }
    ds_client = dsf.config_client()
    if force_update or not dsf.dataset_key_exists(dataset_key, bucket, ds_client):
        uploaded_file = dsf.upload_file_to_DS(bucket=bucket, filepath=file_path, filename=filename, title=title,
                                              description=description, tags=tags, key_values=kv, client=None,
                                              dataset_key=dataset_key, override_check=False, return_metadata=True)
        print("Uploaded raw dataset with key %s" % dataset_key)
    else:
        uploaded_file = dsf.retrieve_dataset_by_datasetkey(dataset_key, bucket, ds_client, return_metadata=True)
        print("Raw dataset %s is already in datastore, skipping upload." % dataset_key)
    raw_dset_oid = uploaded_file['dataset_oid']
    return raw_dset_oid
def get_smiles_excape_data(nm_df, targ_lst):
    """Calculate base RDKit SMILES for Excape data

    Divides up nm_df based on target and makes one DataFrame for each target. Rows with NaN pXC50
    values are dropped. Base RDKit SMILES are calculated from the SMILES column using
    atomsci.ddm.utils.struct_utils.base_smiles_from_smiles. A new column, 'rdkit_smiles', is added
    to each output DataFrame.

    Args:
        nm_df (DataFrame): DataFrame for the Excape database. Should contain the columns pXC50,
            SMILES, and Ambit_InchiKey.
        targ_lst (list): A list of targets to filter out of nm_df.

    Returns:
        list, list: A list of DataFrames, one for each target, and a list of all InChI keys used in
        the dataset.
    """
    # Delete rows with NaN pXC50 values. SMILES don't need to be retrieved separately, since they
    # are already in the Excape file, and there is no filtering by censored values.
    nm_df = nm_df.dropna(subset=['pXC50'])

    save_df = {}
    targ = targ_lst[0]
    save_df[targ] = nm_df
    print(targ, "distinct compounds = only", nm_df['Ambit_InchiKey'].nunique())
    shared_inchi_keys = nm_df['Ambit_InchiKey']

    # Compute base SMILES strings
    smiles_lst = []
    for targ in targ_lst:
        df = save_df[targ]
        smiles_df = df
        smiles_df['rdkit_smiles'] = smiles_df['SMILES'].apply(struct_utils.base_smiles_from_smiles)
        print(smiles_df.shape)
        print(smiles_df['Ambit_InchiKey'].nunique())
        smiles_lst.append(smiles_df)

    return smiles_lst, shared_inchi_keys
def upload_df_excape_smiles(dset_name, title, description, tags, functional_area,
                            target, target_type, activity, assay_category, smiles_df, orig_fileID,
                            data_origin='journal', species='human', force_update=False):
    """Uploads Excape SMILES data to the datastore

    Upload SMILES to the datastore from the given DataFrame. Returns the datastore OID of the
    uploaded dataset. The dataset is uploaded to the public bucket and lists
    https://dx.doi.org/10.1186%2Fs13321-017-0203-5 as the DOI. This also assumes that the id_col is
    'Original_Entry_ID'.

    Args:
        dset_name (str): Name of the dataset. Should not include a file extension.
        title (str): Title of the file (in human friendly format).
        description (str): Long text box to describe file (background/use notes).
        tags (list): Must be a list of strings.
        functional_area (str): The functional area.
        target (str): The target.
        target_type (str): The target type of the dataset.
        activity (str): The activity of the dataset.
        assay_category (str): The assay category of the dataset.
        smiles_df (DataFrame): DataFrame containing SMILES to be uploaded.
        orig_fileID (str): Source file id used to generate smiles_df.
        data_origin (str): The origin of the dataset, e.g. journal.
        species (str): The species of the dataset, e.g. human, rat, dog.
        force_update (bool): Overwrite existing datasets in the datastore.

    Returns:
        str: Datastore OID of the uploaded dataset.
    """
    bucket = 'public'
    # he6: this used to say _dtc_smiles.csv
    filename = '%s_excape_smiles.csv' % dset_name
    dataset_key = 'dskey_' + filename
    kv = {
        'file_category': 'experimental',
        'activity': activity,
        'assay_category': assay_category,   ## seems like this should be called 'kinase_activity'
        'assay_endpoint': 'pic50',
        'curation_level': 'raw',
        'data_origin': data_origin,
        'functional_area': functional_area,
        'matrix': 'multiple values',
        'journal_doi': 'https://dx.doi.org/10.1186%2Fs13321-017-0203-5',   # ExCAPE-DB
        'sample_type': 'in_vitro',
        'species': species,
        'target': target,
        'target_type': target_type,
        'id_col': 'Original_Entry_ID',
        'source_file_id': orig_fileID
    }
    ds_client = dsf.config_client()
    if force_update or not dsf.dataset_key_exists(dataset_key, bucket, ds_client):
        uploaded_file = dsf.upload_df_to_DS(bucket=bucket, filename=filename, df=smiles_df, title=title,
                                            description=description, tags=tags, key_values=kv, client=None,
                                            dataset_key=dataset_key, override_check=False, return_metadata=True)
        print("Uploaded raw dataset with key %s" % dataset_key)
    else:
        uploaded_file = dsf.retrieve_dataset_by_datasetkey(dataset_key, bucket, ds_client, return_metadata=True)
        print("Raw dataset %s is already in datastore, skipping upload." % dataset_key)
    raw_dset_oid = uploaded_file['dataset_oid']
    return raw_dset_oid
def atom_curation_excape(targ_lst, smiles_lst, shared_inchi_keys):
    """Apply the ATOM standard 'curation' step: average replicate assays, remove duplicates, and
    drop cases with large variance between replicates. Rows with NaN values in rdkit_smiles,
    VALUE_NUM_mean, or pXC50 are dropped.

    Args:
        targ_lst (list): A list of targets.
        smiles_lst (list): A list of DataFrames. These DataFrames must contain the columns
            gene_names, standard_type, standard_relation, standard_inchi_key, pXC50, and rdkit_smiles.
        shared_inchi_keys (list): A list of InChI keys used in this dataset.

    Returns:
        list: A list of curated DataFrames.
    """
    imp.reload(curate_data)
    tolerance = 10
    column = 'pXC50'   # 'standard_value'
    list_bad_duplicates = 'No'
    max_std = 1
    curated_lst = []
    for it in range(len(targ_lst)):
        data = smiles_lst[it]
        print("before", data.shape)
        curated_df = curate_data.average_and_remove_duplicates(
            column, tolerance, list_bad_duplicates, data, max_std,
            compound_id='standard_inchi_key', smiles_col='rdkit_smiles')
        # (Yaru) Remove inf in curated_df
        curated_df = curated_df[~curated_df.isin([np.inf]).any(axis=1)]
        # (Yaru) Remove NaN rdkit_smiles, VALUE_NUM_mean and pXC50
        curated_df = curated_df.dropna(subset=['rdkit_smiles'])
        curated_df = curated_df.dropna(subset=['VALUE_NUM_mean'])
        curated_df = curated_df.dropna(subset=['pXC50'])
        # (Kevin) Filter criteria:
        #   pXC50 not missing
        #   rdkit_smiles not blank
        #   pXC50 > 3
        # dset_df = dset_df[dset_df.pXC50 >= 3.0]
        curated_lst.append(curated_df)
        prev_cmpd_cnt = shared_inchi_keys.nunique()
        num_dropped = prev_cmpd_cnt - curated_df.shape[0]
        print("After", curated_df.shape, "# of dropped compounds", num_dropped)
    return curated_lst
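# Illustrative call chain for Excape data (assumes `excape_df` was loaded elsewhere from the raw
# ExCAPE-DB file; shown as comments because it is not runnable without it):
#
#   targ_lst = ['JAK2']
#   smiles_lst, shared_inchi_keys = get_smiles_excape_data(excape_df, targ_lst)
#   curated_lst = atom_curation_excape(targ_lst, smiles_lst, shared_inchi_keys)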
def upload_df_excape_mleqonly(dset_name, title, description, tags, functional_area,
                              target, target_type, activity, assay_category, data_df, smiles_fileID,
                              data_origin='journal', species='human', force_update=False):
    """Uploads Excape mleqonly data to the datastore

    Upload mleqonly data to the datastore from the given DataFrame. Returns the datastore OID of the
    uploaded dataset. The dataset is uploaded to the public bucket and lists
    https://dx.doi.org/10.1186%2Fs13321-017-0203-5 as the DOI. This also assumes that the id_col is
    'Original_Entry_ID', smiles_col is 'rdkit_smiles' and response_col is 'VALUE_NUM_mean'.

    Args:
        dset_name (str): Name of the dataset. Should not include a file extension.
        title (str): Title of the file (in human friendly format).
        description (str): Long text box to describe file (background/use notes).
        tags (list): Must be a list of strings.
        functional_area (str): The functional area.
        target (str): The target.
        target_type (str): The target type of the dataset.
        activity (str): The activity of the dataset.
        assay_category (str): The assay category of the dataset.
        data_df (DataFrame): DataFrame containing the data to be uploaded.
        smiles_fileID (str): Source file id used to generate data_df.
        data_origin (str): The origin of the dataset, e.g. journal.
        species (str): The species of the dataset, e.g. human, rat, dog.
        force_update (bool): Overwrite existing datasets in the datastore.

    Returns:
        str: Datastore OID of the uploaded dataset.
    """
    bucket = 'public'
    # he6: this used to say _dtc_mleqonly.csv
    filename = '%s_excape_mleqonly.csv' % dset_name
    dataset_key = 'dskey_' + filename
    kv = {
        'file_category': 'experimental',
        'activity': activity,
        'assay_category': assay_category,   ## seems like this should be called 'kinase_activity'
        'assay_endpoint': 'pic50',
        'curation_level': 'ml_ready',
        'data_origin': data_origin,
        'functional_area': functional_area,
        'matrix': 'multiple values',
        'journal_doi': 'https://dx.doi.org/10.1186%2Fs13321-017-0203-5',   # ExCAPE-DB
        'sample_type': 'in_vitro',
        'species': species,
        'target': target,
        'target_type': target_type,
        'id_col': 'Original_Entry_ID',
        'response_col': 'VALUE_NUM_mean',
        'prediction_type': 'regression',
        'smiles_col': 'rdkit_smiles',
        'units': 'unitless',
        'source_file_id': smiles_fileID
    }
    ds_client = dsf.config_client()
    if force_update or not dsf.dataset_key_exists(dataset_key, bucket, ds_client):
        uploaded_file = dsf.upload_df_to_DS(bucket=bucket, filename=filename, df=data_df, title=title,
                                            description=description, tags=tags, key_values=kv, client=None,
                                            dataset_key=dataset_key, override_check=False, return_metadata=True)
        print("Uploaded raw dataset with key %s" % dataset_key)
    else:
        uploaded_file = dsf.retrieve_dataset_by_datasetkey(dataset_key, bucket, ds_client, return_metadata=True)
        print("Raw dataset %s is already in datastore, skipping upload." % dataset_key)
    raw_dset_oid = uploaded_file['dataset_oid']
    return raw_dset_oid
def upload_df_excape_mleqonly_class(dset_name, title, description, tags, functional_area,
                                    target, target_type, activity, assay_category, data_df, mleqonly_fileID,
                                    data_origin='journal', species='human', force_update=False):
    """Uploads Excape mleqonly classification data to the datastore

    data_df contains a binary classification dataset with 'active' and 'inactive' classes. Upload
    mleqonly classification data to the datastore from the given DataFrame. Returns the datastore
    OID of the uploaded dataset. The dataset is uploaded to the public bucket and lists
    https://dx.doi.org/10.1186%2Fs13321-017-0203-5 as the DOI. This also assumes that the id_col is
    'Original_Entry_ID', smiles_col is 'rdkit_smiles' and response_col is 'binary_class'.

    Args:
        dset_name (str): Name of the dataset. Should not include a file extension.
        title (str): Title of the file (in human friendly format).
        description (str): Long text box to describe file (background/use notes).
        tags (list): Must be a list of strings.
        functional_area (str): The functional area.
        target (str): The target.
        target_type (str): The target type of the dataset.
        activity (str): The activity of the dataset.
        assay_category (str): The assay category of the dataset.
        data_df (DataFrame): DataFrame containing the data to be uploaded.
        mleqonly_fileID (str): Source file id used to generate data_df.
        data_origin (str): The origin of the dataset, e.g. journal.
        species (str): The species of the dataset, e.g. human, rat, dog.
        force_update (bool): Overwrite existing datasets in the datastore.

    Returns:
        str: Datastore OID of the uploaded dataset.
    """
    bucket = 'public'
    # he6: this used to say _dtc_mleqonly.csv
    filename = '%s_excape_mleqonly_class.csv' % dset_name
    dataset_key = 'dskey_' + filename
    kv = {
        'file_category': 'experimental',
        'activity': activity,
        'assay_category': assay_category,   ## seems like this should be called 'kinase_activity'
        'assay_endpoint': 'pic50',
        'curation_level': 'ml_ready',
        'data_origin': data_origin,
        'functional_area': functional_area,
        'matrix': 'multiple values',
        'journal_doi': 'https://dx.doi.org/10.1186%2Fs13321-017-0203-5',   # ExCAPE-DB
        'sample_type': 'in_vitro',
        'species': species,
        'target': target,
        'target_type': target_type,
        'id_col': 'compound_id',
        'response_col': 'binary_class',
        'prediction_type': 'classification',
        'num_classes': 2,
        'class_names': ['inactive', 'active'],
        'smiles_col': 'rdkit_smiles',
        'units': 'unitless',
        'source_file_id': mleqonly_fileID
    }
    ds_client = dsf.config_client()
    if force_update or not dsf.dataset_key_exists(dataset_key, bucket, ds_client):
        uploaded_file = dsf.upload_df_to_DS(bucket=bucket, filename=filename, df=data_df, title=title,
                                            description=description, tags=tags, key_values=kv, client=None,
                                            dataset_key=dataset_key, override_check=False, return_metadata=True)
        print("Uploaded raw dataset with key %s" % dataset_key)
    else:
        uploaded_file = dsf.retrieve_dataset_by_datasetkey(dataset_key, bucket, ds_client, return_metadata=True)
        print("Raw dataset %s is already in datastore, skipping upload." % dataset_key)
    raw_dset_oid = uploaded_file['dataset_oid']
    return raw_dset_oid