Source code for utils.many_to_one

import pandas as pd


[docs]
def has_nans(df, col):
    """Report whether a column of a DataFrame contains missing values.

    Arguments:
        df (pd.DataFrame): The DataFrame to inspect.
        col (str): Name of the column to check.

    Returns:
        True if dropping missing values from ``df[col]`` changes its length
        relative to ``df``, False otherwise.
    """
    # Any entry removed by dropna() shows up as a length mismatch.
    non_missing = df[col].dropna()
    return len(non_missing) != len(df)



[docs]
def no_nan_ids_or_smiles(df, smiles_col, id_col):
    """Ensure neither the SMILES column nor the id column has missing values.

    Arguments:
        df (pd.DataFrame): The DataFrame to validate.
        smiles_col (str): Name of the column containing SMILES.
        id_col (str): Name of the column containing compound ids.

    Returns:
        True when both columns are free of missing values.

    Raises:
        NANSMILESException: If the SMILES column contains missing values.
        NANCompoundIDException: If the id column contains missing values.
    """
    # Order matters: the SMILES column is checked (and reported) first.
    checks = (
        (smiles_col, NANSMILESException, 'NANs found in SMILES column'),
        (id_col, NANCompoundIDException, 'NANs found in ID column'),
    )
    for column, error_type, message in checks:
        if has_nans(df, column):
            raise error_type(message)

    return True



[docs]
def many_to_one(fn, smiles_col, id_col):
    """Load a CSV file and check its compound id to SMILES mapping.

    Arguments:
        fn (str): Path to the CSV file, passed to ``pd.read_csv``.
        smiles_col (str): Name of the column containing SMILES.
        id_col (str): Name of the column containing compound ids.

    Returns:
        The result of ``many_to_one_df`` on the loaded DataFrame.
    """
    return many_to_one_df(pd.read_csv(fn), smiles_col, id_col)



[docs]
def many_to_one_df(df, smiles_col, id_col):
    """Verify that compound ids and SMILES have a many to one mapping.

    AMPL requires that SMILES and compound_ids have a many to one mapping:
    several compound ids may share one SMILES string, but a compound id may
    not be paired with more than one distinct SMILES string. This function
    checks that constraint on the given DataFrame, and also checks that
    neither column contains empty/nan values.

    Arguments:
        df (pd.DataFrame): The DataFrame in question.
        smiles_col (str): The column containing SMILES.
        id_col (str): The column containing compound ids.

    Returns:
        True if there is a many to one mapping.

    Raises:
        NANSMILESException: If the SMILES column has nan values.
        NANCompoundIDException: If the compound id column has nan values.
        ManyToOneException: If some compound id is associated with more
            than one distinct SMILES string.
    """
    # Missing values are rejected first; this call raises on failure.
    no_nan_ids_or_smiles(df, smiles_col, id_col)

    # Collapse repeated (SMILES, id) rows so each distinct pairing counts once,
    # then count how many distinct SMILES each compound id is paired with.
    unique_pairs = df.drop_duplicates(subset=[smiles_col, id_col])
    smiles_per_id = unique_pairs.groupby(id_col)[smiles_col].count()

    if smiles_per_id.max() > 1:
        raise ManyToOneException('SMILES and Compound IDs do not have a many to one mapping.')

    # The reverse direction needs no check: one SMILES may legitimately be
    # associated with many compound ids.
    return True



[docs]
class ManyToOneException(Exception):
    """Raised when SMILES and compound ids do not have a many to one mapping."""



[docs]
class NANCompoundIDException(Exception):
    """Raised when NANs are found in the compound id column."""



[docs]
class NANSMILESException(Exception):
    """Raised when NANs are found in the SMILES column."""