import pandas as pd
[docs]
def has_nans(df, col):
    """Report whether a column of a DataFrame contains missing values.

    Arguments:
        df (pd.DataFrame): The DataFrame to inspect.
        col (str): Name of the column to check.

    Returns:
        True if dropping missing values from ``df[col]`` leaves fewer
        entries than ``df`` has rows, False otherwise.
    """
    # A shorter column after dropna() means at least one entry was missing.
    non_missing = df[col].dropna()
    return len(non_missing) != len(df)
[docs]
def no_nan_ids_or_smiles(df, smiles_col, id_col):
    """Verify that neither the SMILES column nor the ID column has NaNs.

    Arguments:
        df (pd.DataFrame): The DataFrame to validate.
        smiles_col (str): The column containing SMILES.
        id_col (str): The column containing compound ids.

    Returns:
        True when both columns are free of missing values.

    Raises:
        NANSMILESException: If the SMILES column contains missing values.
        NANCompoundIDException: If the ID column contains missing values.
    """
    # Order matters: the SMILES column is checked (and reported) first.
    checks = (
        (smiles_col, NANSMILESException, 'NANs found in SMILES column'),
        (id_col, NANCompoundIDException, 'NANs found in ID column'),
    )
    for column, error_type, message in checks:
        if has_nans(df, column):
            raise error_type(message)
    return True
[docs]
def many_to_one(fn, smiles_col, id_col):
    """Load a CSV file and run the many-to-one check on its contents.

    Arguments:
        fn: Location of the CSV file, passed directly to ``pd.read_csv``.
        smiles_col (str): The column containing SMILES.
        id_col (str): The column containing compound ids.

    Returns:
        The result of ``many_to_one_df`` on the loaded DataFrame. Any
        exception raised by that check propagates to the caller.
    """
    loaded = pd.read_csv(fn)
    return many_to_one_df(loaded, smiles_col, id_col)
[docs]
def many_to_one_df(df, smiles_col, id_col):
    """Check that every compound id maps to a single SMILES string.

    AMPL requires that SMILES and compound_ids have a many to one mapping:
    several compound ids may share one SMILES, but a compound id may not be
    paired with more than one distinct SMILES. The SMILES and compound id
    columns are also checked for empty/nan values before the mapping test.

    Arguments:
        df (pd.DataFrame): The DataFrame in question.
        smiles_col (str): The column containing SMILES.
        id_col (str): The column containing compound ids

    Returns:
        True if there is a many to one mapping.

    Raises:
        NANSMILESException: If the SMILES column has nan values.
        NANCompoundIDException: If the compound id column has nan values.
        ManyToOneException: If some compound id is associated with more
            than one distinct SMILES.
    """
    no_nan_ids_or_smiles(df, smiles_col, id_col)

    # Collapse repeated (SMILES, id) rows so each distinct pairing counts once.
    unique_pairs = df.drop_duplicates(subset=[smiles_col, id_col])

    # Number of distinct SMILES attached to each compound id.
    smiles_per_id = unique_pairs.groupby(id_col)[smiles_col].count()

    # Only the worst offender matters: any id with more than one SMILES fails.
    worst_case = smiles_per_id.max()
    if worst_case > 1:
        raise ManyToOneException('SMILES and Compound IDs do not have a many to one mapping.')

    # The reverse direction (one SMILES shared by many ids) is allowed, so it
    # is deliberately not checked.
    return True
[docs]
class ManyToOneException(Exception):
    """Raised when a compound id is associated with more than one SMILES."""
[docs]
class NANCompoundIDException(Exception):
    """Raised when the compound ID column contains NaN values."""
[docs]
class NANSMILESException(Exception):
    """Raised when the SMILES column contains NaN values."""