"""Functions to generate matrices or vectors of distances between compounds"""
import os, sys
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from scipy.spatial.distance import pdist
from scipy.spatial.distance import cdist
from scipy.spatial.distance import squareform
import pandas as pd
import inspect
from atomsci.ddm.pipeline import dist_metrics
[docs]
def calc_dist_smiles(feat_type, dist_met, smiles_arr1, smiles_arr2=None, calc_type='nearest', num_nearest=1, **metric_kwargs):
    """Returns an array of distances between compounds given as SMILES strings, either between all pairs of
    compounds in a single dataset or between two datasets.

    Args:
        feat_type (str): How the data is to be featurized, if dist_met is not 'mcs'. The only option supported
            currently is 'ECFP'.

        dist_met (str): What distance metric to use. Current options include 'tanimoto' and 'mcs'.

        smiles_arr1 (list): First list of SMILES strings.

        smiles_arr2 (list): Optional, second list of SMILES strings. Can have only 1 member if wanting compound to
            matrix comparison.

        calc_type (str): Type of summarization to perform on rows of distance matrix. See function calc_summary
            for options.

        num_nearest (int): Additional parameter for calc_types nearest, nth_nearest and avg_n_nearest.

        metric_kwargs: Additional arguments to be passed to functions that calculate metrics. Not used by the
            currently supported 'tanimoto' and 'mcs' code paths.

    Returns:
        dists: vector or array of distances, as summarized by calc_summary(). None is returned if the
        combination of feat_type and dist_met matches none of the branches below.

    Raises:
        ValueError: if feat_type is 'descriptors' or 'moe' (and dist_met is not 'mcs'), since descriptor
            features are not currently supported.

    Todo:
        Fix the function _get_descriptors(), which is broken, and re-enable the 'descriptors' option for
        feat_type. Will need to add a parameter to indicate what kind of descriptors should be computed.

        Allow other metrics for ECFP features, as in calc_dist_diskdataset().
    """
    # Distances are "within dataset" only when no second SMILES list is given. calc_summary() uses this flag
    # to skip each compound's zero distance to itself, so it must be False for two-dataset comparisons.
    within_dset = smiles_arr2 is None

    if feat_type in ['ECFP', 'ecfp'] and dist_met == 'tanimoto':
        mols1 = [Chem.MolFromSmiles(s) for s in smiles_arr1]
        # Morgan fingerprints computed with arguments (mol, 2, 1024), same as for the second dataset below
        fprints1 = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024) for mol in mols1]
        if smiles_arr2 is None:
            return calc_summary(dist_metrics.tanimoto(fprints1, None), calc_type, num_nearest, within_dset)
        if len(smiles_arr2) == 1:
            cpd_mol = Chem.MolFromSmiles(smiles_arr2[0])
            cpd_fprint = AllChem.GetMorganFingerprintAsBitVect(cpd_mol, 2, 1024)
            # Vector of distances
            return calc_summary(dist_metrics.tanimoto_single(cpd_fprint, fprints1)[0], calc_type,
                                num_nearest, within_dset)
        mols2 = [Chem.MolFromSmiles(s) for s in smiles_arr2]
        fprints2 = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024) for mol in mols2]
        return calc_summary(dist_metrics.tanimoto(fprints1, fprints2), calc_type, num_nearest, within_dset)

    if dist_met == 'mcs':
        mols1 = [Chem.MolFromSmiles(s) for s in smiles_arr1]
        if smiles_arr2 is None:
            return calc_summary(dist_metrics.mcs(mols1, None), calc_type, num_nearest, within_dset)
        if len(smiles_arr2) == 1:
            n_atms = [mol.GetNumAtoms() for mol in mols1]
            cpd_mol = Chem.MolFromSmiles(smiles_arr2[0])
            # Vector of distances
            return calc_summary(dist_metrics.mcs_single(
                cpd_mol, mols1, n_atms)[0], calc_type, num_nearest, within_dset)
        mols2 = [Chem.MolFromSmiles(s) for s in smiles_arr2]
        return calc_summary(dist_metrics.mcs(mols1, mols2), calc_type, num_nearest, within_dset)

    if feat_type in ['descriptors', 'moe']:
        raise ValueError("Descriptor features are not currently supported by calc_dist_smiles().")

    # Unrecognized feat_type / dist_met combination: nothing is computed.
    return None
[docs]
def calc_dist_diskdataset(feat_type, dist_met, dataset1, dataset2=None, calc_type='nearest', num_nearest=1, **metric_kwargs):
    """Computes distances for one or two datasets by delegating to calc_dist_feat_array().

    The feature matrices are read from the `X` attribute of the dataset objects and handed, together with all
    other arguments, to calc_dist_feat_array().

    Args:
        feat_type (str): How the data was featurized. Current options are 'ECFP' or 'descriptors'.

        dist_met (str): What distance metric to use. Current options include tanimoto, cosine, cityblock,
            euclidean, or any other metric supported by scipy.spatial.distance.pdist().

        dataset1 (deepchem.Dataset): Dataset containing features of compounds to be compared.

        dataset2 (deepchem.Dataset, optional): Second dataset, if two datasets are to be compared. When omitted,
            distances are computed among the compounds of dataset1.

        calc_type (str): Type of summarization to perform on rows of distance matrix. See function calc_summary
            for options.

        num_nearest (int): Additional parameter for calc_types nearest, nth_nearest and avg_n_nearest.

        metric_kwargs: Additional arguments to be passed to functions that calculate metrics.

    Returns:
        np.ndarray: Vector or matrix of distances between feature vectors.
    """
    first_feats = dataset1.X
    # No second dataset means a within-dataset comparison, signalled downstream by a None feature matrix.
    second_feats = None if dataset2 is None else dataset2.X
    return calc_dist_feat_array(feat_type, dist_met, first_feats, second_feats, calc_type, num_nearest,
                                **metric_kwargs)
[docs]
def calc_dist_feat_array(feat_type, dist_met, feat1, feat2=None, calc_type='nearest', num_nearest=1, **metric_kwargs):
    """Returns a vector or array of distances, either between all compounds in a single dataset or between two
    datasets, given the feature matrices for the dataset(s).

    Args:
        feat_type (str): How the data was featurized. Current options are 'ECFP' or 'descriptors'.

        dist_met (str): What distance metric to use. Current options include tanimoto, cosine, cityblock,
            euclidean, or any other metric supported by scipy.spatial.distance.pdist().

        feat1: feature matrix as a numpy array

        feat2: Optional, second feature matrix

        calc_type (str): Type of summarization to perform on rows of distance matrix. See function calc_summary
            for options.

        num_nearest (int): Additional parameter for calc_types nearest, nth_nearest and avg_n_nearest.

        metric_kwargs: Additional arguments passed to scipy's pdist() / cdist(). They are not used for the
            ECFP + 'tanimoto' combination, which is computed by the dist_metrics module.

    Returns:
        dists: vector or array of distances, as summarized by calc_summary(). None is returned if feat_type is
        not one of the supported options.
    """
    is_ecfp = feat_type in ['ECFP', 'ecfp']

    if is_ecfp and dist_met == 'tanimoto':
        if feat2 is None:
            return calc_summary(dist_metrics.tanimoto(feat1), calc_type, num_nearest, within_dset=True)
        if feat2.shape[0] == 1:
            # Vector of distances
            return calc_summary(dist_metrics.tanimoto_single(feat2, feat1)[0], calc_type,
                                num_nearest)
        return calc_summary(dist_metrics.tanimoto(feat1, feat2), calc_type, num_nearest)

    if is_ecfp or feat_type == 'descriptors':
        if feat2 is not None:
            # Forward metric_kwargs here too, so that metric options behave the same for one- and
            # two-dataset comparisons.
            return calc_summary(cdist(feat1, feat2, dist_met, **metric_kwargs), calc_type, num_nearest)
        # pdist() yields a condensed distance vector; calc_summary() expands it to a square matrix.
        return calc_summary(pdist(feat1, dist_met, **metric_kwargs), calc_type, num_nearest, within_dset=True)

    # Unsupported feat_type: nothing is computed.
    return None
[docs]
def calc_summary(dist_arr, calc_type, num_nearest=1, within_dset=False):
    """Returns a summary of the distances in dist_arr, depending on calc_type.

    Args:
        dist_arr: (np.array): Either a 2D distance matrix, or a 1D condensed distance matrix (flattened upper
            triangle).

        calc_type (str): The type of summary values to return:

            all:            The distance matrix itself

            nearest:        The distances to the num_nearest nearest neighbors of each compound (except
                            compound itself)

            nth_nearest:    The distance to the num_nearest'th nearest neighbor

            avg_n_nearest:  The average of the num_nearest nearest neighbor distances

            farthest:       The distance to the farthest neighbor

            avg:            The average of all distances for each compound

        num_nearest (int): Additional parameter for calc_types nearest, nth_nearest and avg_n_nearest.

        within_dset (bool): True if input distances are between compounds in the same dataset.

    Returns:
        dists (np.array): A numpy array of distances. For calc_type 'nearest' with num_nearest > 1, this is a 2D
        array with a row for each compound; otherwise it is a 1D array. For calc_type 'all', dist_arr is
        returned unchanged.

    Raises:
        ValueError: if calc_type is not one of the options listed above.
    """
    valid_types = ('all', 'nearest', 'nth_nearest', 'avg_n_nearest', 'farthest', 'avg')
    if calc_type not in valid_types:
        # Raise instead of exiting the interpreter, so that callers can handle the error.
        raise ValueError("calc_type %s is not valid" % calc_type)

    if calc_type == 'all':
        return dist_arr

    # A 1D input is treated as a condensed distance matrix and expanded to square form.
    if len(dist_arr.shape) == 1:
        dist_mat = squareform(dist_arr)
    else:
        dist_mat = dist_arr

    if calc_type == 'farthest':
        return dist_mat.max(axis=1)
    if calc_type == 'avg':
        return dist_mat.mean(axis=1)

    # The remaining summaries work on each row's distances in ascending order.
    sorted_dists = np.sort(dist_mat)
    # Within a dataset, skip the first sorted column to exclude the zero distance between each compound
    # and itself. Zero distances between different compounds are kept.
    first_col = 1 if within_dset else 0

    if calc_type == 'nth_nearest':
        return sorted_dists[:, first_col + num_nearest - 1]

    nn_dist = sorted_dists[:, first_col:first_col + num_nearest]
    if calc_type == 'avg_n_nearest':
        return nn_dist.mean(axis=1)

    # calc_type == 'nearest'
    if num_nearest == 1:
        return nn_dist[:, 0]
    return nn_dist
def _get_descriptors(smiles_arr):
    """DEPRECATED. This function is guaranteed not to work, since it refers to datasets that no longer exist.

    As written, it retrieves a hard-coded descriptor table from the datastore, keeps its first 20 rows and
    returns only the columns that are not in a fixed list of identifier/response columns. The SMILES strings
    in smiles_arr are not used to select rows, because the merge step is disabled.
    """
    from atomsci.ddm.utils import datastore_functions as dsf

    # Columns of the retrieved table that are not descriptor features
    non_feature_cols = ('compound_id', 'inchi_key', 'smiles', 'smiles_out',
                        'lost_frags', 'inchi_string', 'pxc50', 'rdkit_smiles',
                        'HTR2A_5_HT2A_Human_Antagonist_HEK_Luminescence_f_PIC50')

    client = dsf.config_client()
    dataset_key = ('/ds/projdata/gsk_data/GSK_datasets/eXP_Panel_Min_100_Cmpds/scaled_descriptors/'
                   'subset_all_GSK_Compound_2D_3D_MOE_Descriptors_Scaled_With_Smiles_And_Inchi_HTR2A_5_'
                   'HT2A_Human_Antagonist_HEK_Luminescence_f_PIC50.csv')
    descriptor_table = dsf.retrieve_dataset_by_datasetkey(dataset_key, 'gskdata', client)

    # Built for an inner merge on the SMILES column, which is currently disabled; the table is
    # truncated to its first 20 rows instead.
    smiles_table = pd.DataFrame(smiles_arr)
    subset = descriptor_table.head(20)
    del descriptor_table

    #TODO this probably doesn't work
    feature_cols = []
    for col in subset.columns.values.tolist():
        if col not in non_feature_cols:
            feature_cols.append(col)
    return subset[feature_cols]
[docs]
def upload_distmatrix_to_DS(
        dist_matrix,feature_type,compound_ids,bucket,title,description,tags,key_values,filepath="./",dataset_key=None):
    """Uploads distance matrix in the data store with the appropriate tags

    The matrix is wrapped in a DataFrame labeled by compound ID on both axes, pickled to a local file named
    'distmatrix_<feature_type>' and then uploaded with datastore_functions.upload_file_to_DS().

    Args:
        dist_matrix (np.ndarray): The distance matrix.

        feature_type (str): How the data was featurized. Used as the suffix of the pickle file name.

        compound_ids (list): list of compound ids corresponding to the distance matrix (assumes that distance
            matrix is square and is the distance between all compounds in a dataset)

        bucket (str): bucket the file will be put in

        title (str): title of the file in (human friendly format)

        description (str): long text box to describe file (background/use notes)

        tags (list): List of tags to assign to datastore object.

        key_values (dict): Dictionary of key:value pairs to include in the datastore object's metadata.

        filepath (str): local path where you want to store the pickled dataframe. The file name is appended
            directly to this string, so it should end with a path separator.

        dataset_key (str): If updating a file already in the datastore enter the corresponding dataset_key.
            If not, leave as None and the dataset_key will be automatically generated.

    Returns:
        None
    """
    from atomsci.ddm.utils import datastore_functions as dsf

    dist_df = pd.DataFrame(dist_matrix)
    dist_df.index = compound_ids
    dist_df.columns = compound_ids

    filename = "distmatrix_" + feature_type
    # Write the local pickle that is uploaded below
    dist_df.to_pickle(filepath + filename)
    dsf.upload_file_to_DS(bucket, title, description, tags, key_values, filepath, filename, dataset_key, client=None)