Source code for utils.pubchem_utils

import urllib.request as urlreq
import io,json
import pandas as pd

# ******************************************************************************************************************************************

[docs]
def download_smiles(myList,intv=1) :
    """Retrieve canonical SMILES strings for a list of input INCHIKEYS.

    One request is sent to the PubChem PUG REST service for every `intv`
    consecutive entries of `myList`.  Only the first data row of each response
    is kept; the next row (if any) is stored in the discard list and the rest
    of that response is ignored.  With the default `intv` of 1 this yields at
    most one SMILES string per INCHIKEY.

    Args:
        myList (list): List of INCHIKEYS

        intv (int): number of INCHIKEYS to submit queries for in one request,
            default is 1.  Because only the first row of each response is
            kept, values larger than 1 drop results.

    Returns:
        pandas.DataFrame with the columns 'CID', 'standard_inchi_key' and
        'smiles' (surrounding double quotes removed), one row per request that
        returned data

        list of query strings (comma-joined INCHIKEYS) whose request failed or
        returned no data row

        list of split CSV rows that were returned beyond the first row of a
        response
    """
    ncmpds = len(myList)
    smiles_lst, cid_lst, inchikey_lst = [], [], []
    fail_lst = []
    discard_lst = []
    for it in range(0, ncmpds, intv):
        # Slicing clamps at the end of the list, so the last batch may be short.
        sublst = myList[it:it + intv]
        inchikey = ','.join(map(str, sublst))
        url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/"+inchikey+"/property/CanonicalSMILES/CSV"
        try:
            # The context manager closes the connection even if read() fails.
            with urlreq.urlopen(url) as response:
                html = response.read()
        except Exception:
            # Best effort: any request error marks this batch as failed, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            fail_lst.append(inchikey)
            continue

        cnt = 0
        for raw in io.BytesIO(html):
            vals = raw.decode("utf-8").rstrip().split(',')
            if vals[0] == '"CID"':
                # Header row of the CSV response.
                continue
            if len(vals) < 2:
                # Blank or malformed line: nothing to extract.
                continue
            if cnt > 0:
                # A second data row: remember it and stop reading this response.
                discard_lst.append(vals)
                break
            cid_lst.append(vals[0])
            # Store the SMILES without the CSV quoting.
            smiles_lst.append(vals[1].replace('"', ''))
            inchikey_lst.append(myList[it + cnt])
            cnt += 1

        if cnt == 0:
            # The request succeeded but carried no data row.
            fail_lst.append(inchikey)
        elif cnt != len(sublst):
            print("warning, multiple SMILES for this inchikey key",cnt,len(sublst),sublst)
    save_smiles_df = pd.DataFrame({'CID': cid_lst, 'standard_inchi_key': inchikey_lst, 'smiles': smiles_lst})
    return save_smiles_df, fail_lst, discard_lst



#******************************************************************************************************************************************

[docs]
def download_bioactivity_assay(myList,intv=1) :
    """Retrieve summary info on bioactivity assays.

    One request is sent to the PubChem PUG REST assay summary endpoint for
    every `intv` consecutive entries of `myList`.

    Args:
        myList (list): List of PubChem AIDs (bioactivity assay ids)

        intv (int): number of AIDs to submit queries for in one request,
            default is 1

    Returns:
        list of str: one JSON document per successful request, with trailing
        whitespace removed from every line and the lines joined together.
        Requests that fail are skipped (a warning is printed), so the list can
        be shorter than the number of batches.
    """
    jsn_lst = []
    for it in range(0, len(myList), intv):
        # Slicing clamps at the end of the list, so the last batch may be short.
        aid_str = ','.join(map(str, myList[it:it + intv]))
        url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/"+aid_str+"/summary/JSON"
        try:
            # The context manager closes the connection even if read() fails.
            with urlreq.urlopen(url) as response:
                html = response.read()
        except Exception:
            # Best effort: skip the batch, but tell the user instead of
            # dropping it silently.
            print("warning, request failed for assay ids", aid_str)
            continue
        text = html.decode("utf-8")
        jsn_lst.append("".join(line.rstrip() for line in text.split("\n")))
    return jsn_lst

#    save_smiles_df=pd.DataFrame( {'CID' : cid_lst, 'standard_inchi_key' :inchikey_lst, 'smiles' : smiles_lst})
#    return save_smiles_df,fail_lst,discard_lst
     
#******************************************************************************************************************************************

[docs]
def download_SID_from_bioactivity_assay(bioassayid) :
    """Retrieve the substance ids (SIDs) listed for one bioactivity assay.

    Args:
        bioassayid: a single PubChem AID (bioactivity assay id); it is
            converted with str() before being placed in the request URL

    Returns:
        The value stored under ["InformationList"]["Information"][0]["SID"]
        in the JSON response, i.e. the sids tested on this assay

    Raises:
        IndexError: if the request fails.  The exception type is kept for
            backward compatibility; the underlying error is chained as
            ``__cause__``.
        KeyError: if the response does not have the expected layout.
    """
    aid_str = str(bioassayid)
    url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/"+aid_str+"/sids/JSON"
    try:
        # The context manager closes the connection even if read() fails.
        with urlreq.urlopen(url) as response:
            html = response.read()
    except Exception as err:
        # Previously a failed request surfaced as an unexplained IndexError on
        # an empty list; keep the type but say what went wrong.
        raise IndexError("no response from PubChem for assay id " + aid_str) from err
    res = json.loads(html.decode("utf-8"))
    return res["InformationList"]['Information'][0]['SID']

     
#https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/504526/doseresponse/CSV?sid=104169547,109967232

#******************************************************************************************************************************************

[docs]
def download_dose_response_from_bioactivity(aid,sidlst) :
    """Retrieve dose-response data of one assay for a select list of sids.

    Args:
        aid: a bioactivity id (aid); strings and integers are both accepted

        sidlst (list): list of sids, e.g. specified as integers

    Returns:
        list: a one-element list holding the pandas.DataFrame parsed from the
        CSV response, or an empty list if the request failed
    """
    sidstr = ",".join(str(val) for val in sidlst)
    # str(aid) so that an integer assay id does not raise TypeError here.
    url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/"+str(aid)+"/doseresponse/CSV?sid="+sidstr
    try:
        # The context manager closes the connection even if read() fails.
        with urlreq.urlopen(url) as response:
            html = response.read()
    except Exception:
        # Best effort: a failed request is reported as "no data".
        return []
    return [pd.read_csv(io.BytesIO(html))]



#******************************************************************************************************************************************

[docs]
def download_activitytype(aid,sid) :
    """Retrieve the assay data table of one assay for a single sid.

    Args:
        aid: a bioactivity id (aid); strings and integers are both accepted

        sid: a single substance id; it is converted with str() before being
            placed in the request URL

    Returns:
        list: a one-element list holding the pandas.DataFrame parsed from the
        CSV response, or an empty list if the request failed
    """
    # str(aid) so that an integer assay id does not raise TypeError here.
    url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/"+str(aid)+"/CSV?sid="+str(sid)
    try:
        # The context manager closes the connection even if read() fails.
        with urlreq.urlopen(url) as response:
            html = response.read()
    except Exception:
        # Best effort: a failed request is reported as "no data".
        return []
    return [pd.read_csv(io.BytesIO(html))]