Source code for utils.rdkit_easy

"""Utilities for clustering and visualizing compound structures using RDKit."""
# Mostly written by Logan Van Ravenswaay, with additions and edits by Ben Madej and Kevin McLoughlin.

import os
import pdb

from IPython.display import SVG, HTML, display
from base64 import b64encode
import io

import logging

import pandas as pd
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import MolToImage, rdMolDraw2D
from rdkit.ML.Cluster import Butina
from rdkit.ML.Descriptors import MoleculeDescriptors

logging.basicConfig(format='%(asctime)-15s %(message)s')


[docs]
def setup_notebook():
    """Set up current notebook for displaying plots and Bokeh output using full width of window"""
    from bokeh.plotting import output_notebook

    get_ipython().run_line_magic('matplotlib', 'inline')
    # Bokeh option
    output_notebook()
    display(HTML("<style>.container { width:100% !important; }</style>"))

    pd.set_option('display.max_columns', None)
    pd.set_option('max_seq_items', None)



[docs]
def add_mol_column(df, smiles_col, molecule_col='mol'):
    """Converts SMILES strings in a data frame to RDKit Mol objects and adds them as a new column in the data frame.

    Args:
        df (pd.DataFrame): Data frame to add column to.

        smiles_col (str): Column containing SMILES strings.

        molecule_col (str): Name of column to create to hold Mol objects.

    Returns:
        pd.DataFrame: Modified data frame.

    """
    PandasTools.AddMoleculeColumnToFrame(df, smiles_col, molecule_col, includeFingerprints=True)
    return df




[docs]
def calculate_descriptors(df, molecule_column='mol'):
    """Uses RDKit to compute various descriptors for compounds specified by Mol objects in the given data frame.

    Args:
        df (pd.DataFrame): Data frame containing molecules.

        molecule_column (str): Name of column containing Mol objects for compounds.

    Returns:
        pd.DataFrame: Modified data frame with added columns for the descriptors.

    """

    descriptors = [x[0] for x in Descriptors._descList]
    calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptors)
    cds=[]
    for mol in df[molecule_column]:
        cd = calculator.CalcDescriptors(mol)
        cds.append(cd)
    df2=pd.DataFrame(cds, columns=descriptors)
    df=df.join(df2, lsuffix='', rsuffix='_rdk')
    return df



[docs]
def cluster_dataframe(df, molecule_column='mol', cluster_column='cluster', cutoff=0.2):
    """Performs Butina clustering on compounds specified by Mol objects in a data frame.

    Modifies the input dataframe to add a column 'cluster_column' containing the cluster
    index for each molecule.

    From RDKit cookbook http://rdkit.org/docs_temp/Cookbook.html.

    Args:
        df (pd.DataFrame): Data frame containing compounds to cluster.

        molecule_column (str): Name of column containing rdkit Mol objects for compounds.

        cluster_column (str): Column that will be created to hold cluster indices.

        cutoff (float): Maximum Tanimoto distance parameter used by Butina algorithm to identify neighbors of each molecule.

    Returns:
        None. Input data frame will be modified in place.

    """
    df[cluster_column] = -1
    df2 = df.reset_index()
    df2['df_index'] = df.index
    mols = df2[[molecule_column]].values.tolist()
    fingerprints = [AllChem.GetMorganFingerprintAsBitVect(x[0], 2, 1024) for x in mols]
    clusters = cluster_fingerprints(fingerprints, cutoff=cutoff)
    for i in range(len(clusters)):
        c = clusters[i]
        for j in c:
            df_index = df2.at[j, 'df_index']
            df.at[df_index, cluster_column] = i
    df=df.copy()



[docs]
def cluster_fingerprints(fps, cutoff=0.2):
    """Performs Butina clustering on compounds specified by a list of fingerprint bit vectors.

    From RDKit cookbook http://rdkit.org/docs_temp/Cookbook.html.

    Args:
        fps (list of rdkit.ExplicitBitVect): List of fingerprint bit vectors.

        cutoff (float): Cutoff distance parameter used to seed clusters in Butina algorithm.

    Returns:
        tuple of tuple: Indices of fingerprints assigned to each cluster.

    """

    # first generate the distance matrix:
    dists = []
    nfps = len(fps)
    for i in range(1, nfps):
        sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
        dists.extend([1-x for x in sims])

    # now cluster the data:
    cs = Butina.ClusterData(dists, nfps, cutoff, isDistData=True)
    return cs




[docs]
def mol_to_html(mol, highlight=None, name='', type='svg', directory='rdkit_svg', embed=False, width=400, height=200):
    """Creates an image displaying the given molecule's 2D structure, and generates an HTML
    tag for it. The image can be embedded directly into the HTML tag or saved to a file.

    Args:
        mol (rdkit.Chem.Mol): Object representing molecule.

        highlight (rdkit.Chem.Mol): Optional object representing a set of atoms and bonds to be highlighted in
        the image.

        name (str): Filename of image file to create, relative to 'directory'; only used if embed=False.

        type (str): Image format; must be 'png' or 'svg'.

        directory (str): Path relative to notebook directory of subdirectory where image file will be written.
        The directory will be created if necessary. Note that absolute paths will not work in notebooks. Ignored if embed=True.

        embed (bool): If True, image data will be embedded in the generated HTML tag. Otherwise it will be written to a
        file determined by the `directory` and `name` arguments.

        width (int): Width of image bounding box.

        height (int): Height of image bounding box.

    Returns:
        str: HTML image tag referencing the image file.

    """
    log = logging.getLogger('ATOM')
    
    if type.lower() not in ['png','svg']:
        log.warning('Image type not recognized. Choose between png and svg.')
        return ''
    
    # Handle Mol generated from invalid SMILES string
    if mol is None:
        return ''

    if embed:
        if type.lower() == 'png':
            img=mol_to_pil(mol, size=(width,height), highlight=highlight)
            imgByteArr = io.BytesIO()
            img.save(imgByteArr, format=img.format)
            imgByteArr = imgByteArr.getvalue()
            data_url = f'data:image/png;base64,' + b64encode(imgByteArr).decode()
        elif type.lower() == 'svg':
            img=mol_to_svg(mol, size=(width,height), highlight=highlight).encode('utf-8')
            data_url = f'data:image/svg+xml;base64,' + b64encode(img).decode()
        return f"<img src='{data_url}' style='width:{width}px;'>" 
    
    else:
        img_file = '%s/%s' % (directory, name)
        os.makedirs(directory, exist_ok=True)
        if type.lower() == 'png':
            save_png(mol, img_file, size=(width,height), highlight=highlight)
        elif type.lower()=='svg':
            save_svg(mol, img_file, size=(width,height), highlight=highlight)
        return f"<img src='{img_file}' style='width:{width}px;'>"



[docs]
def matching_atoms_and_bonds(mol, match_mol):
    """Returns lists of indices of atoms and bonds within molecule `mol` that are part of the substructure
    matched by `match_mol`.

    Args:
        mol (rdkit.Chem.Mol): Object representing molecule.

        match_mol (rdkit.Chem.Mol): Object representing a substructure or SMARTS pattern to be
        compared against `mol`, typically created by `Chem.MolFromSmiles()` or `Chem.MolFromSmarts()`.

    Returns:
        match_atoms, match_bonds (tuple(list(int), list(int))): Lists of indices of atoms and bonds
        within `mol` contained in the substructure (if any) matched by `match_mol`. Returns empty lists
        if there is no match.
    """
    match_atoms = []
    match_bonds = []
    if match_mol is not None:
        match_atoms = list(mol.GetSubstructMatch(match_mol))
        if len(match_atoms) > 1:
            for bond in match_mol.GetBonds():
               aid1 = match_atoms[bond.GetBeginAtomIdx()]
               aid2 = match_atoms[bond.GetEndAtomIdx()]
               match_bonds.append(mol.GetBondBetweenAtoms(aid1,aid2).GetIdx())
    return match_atoms, match_bonds




[docs]
def mol_to_pil(mol, size=(400, 200), highlight=None):
    """Returns a Python Image Library (PIL) object containing an image of the given molecule's structure.

    Args:
        mol (rdkit.Chem.Mol): Object representing molecule.

        size (tuple): Width and height of bounding box of image.

        highlight (rdkit.Chem.Mol): Object representing substructure to highlight on molecule.

    Returns:
        PIL.PngImageFile: An object containing an image of the molecule's structure.

    """
    highlight_atoms, highlight_bonds = matching_atoms_and_bonds(mol, highlight)
    pil = MolToImage(mol, size=(size[0], size[1]), highlightAtoms=highlight_atoms, highlightBonds=highlight_bonds)
    return pil




[docs]
def save_png(mol, name, size=(400, 200), highlight=None):
    """Draws the molecule mol into a PNG file with filename 'name' and with the given size
    in pixels.

    Args:
        mol (rdkit.Chem.Mol): Object representing molecule.

        name (str): Path to write PNG file to.

        size (tuple): Width and height of bounding box of image.

        highlight (rdkit.Chem.Mol): Object representing substructure to highlight on molecule.

    """
    pil = mol_to_pil(mol, size, highlight=highlight)
    pil.save(name, 'PNG')




[docs]
def mol_to_svg(mol, size=(400,200), highlight=None):
    """Returns a RDKit MolDraw2DSVG object containing an image of the given molecule's structure.

    Args:
        mol (rdkit.Chem.Mol): Object representing molecule.

        size (tuple): Width and height of bounding box of image.

        highlight (rdkit.Chem.Mol): Object representing substructure to highlight on molecule.

    Returns:
       RDKit.rdMolDraw2D.MolDraw2DSVG text (str): An SVG object containing an image of the molecule's structure.

    """
    img_wd, img_ht = size
    highlight_atoms, highlight_bonds = matching_atoms_and_bonds(mol, highlight)
    try:
        mol.GetAtomWithIdx(0).GetExplicitValence()
    except RuntimeError:
        mol.UpdatePropertyCache(False)
    try:
        mc_mol = rdMolDraw2D.PrepareMolForDrawing(mol, kekulize=True)
    except ValueError:
        # can happen on a kekulization failure
        mc_mol = rdMolDraw2D.PrepareMolForDrawing(mol, kekulize=False)
    drawer = rdMolDraw2D.MolDraw2DSVG(img_wd, img_ht)
    rdMolDraw2D.PrepareAndDrawMolecule(drawer, mc_mol, highlightAtoms=highlight_atoms,
                                                    highlightBonds=highlight_bonds)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText()
    svg = svg.replace('svg:','')
    svg = svg.replace('xmlns:svg','xmlns')
    return svg




[docs]
def save_svg(mol, name, size=(400,200), highlight=None):
    """Draws the molecule mol into an SVG file with filename 'name' and with the given size
    in pixels.

    Args:
        mol (rdkit.Chem.Mol): Object representing molecule.

        name (str): Path to write SVG file to.

        size (tuple): Width and height of bounding box of image.

        highlight (rdkit.Chem.Mol): Object representing substructure to highlight on molecule.

    """
    svg = mol_to_svg(mol, size, highlight=highlight)
    with open(name, 'w') as img_out:
        img_out.write(svg)




[docs]
def show_df(df):
    """Convenience function to display a pandas DataFrame in the current notebook window
    with HTML images rendered in table cells.

    Args:
        df (pd.DataFrame): Data frame to display.

    Returns:
        None

    """
    return HTML(df.to_html(escape=False))




[docs]
def show_html(html):
    """Convenience function to display an HTML image specified by image tag 'html'.

    Args:
        html (str): HTML image tag to render.

    Returns:
        None

    """
    return HTML(html)