Package rdkit :: Package Chem :: Module PandasTools
[hide private]
[frames] | no frames]

Source Code for Module rdkit.Chem.PandasTools

  1  ''' 
  2  Importing pandasTools enables several features that allow for using RDKit molecules as columns of a Pandas dataframe. 
  3  If the dataframe contains molecules in a text format in one of its columns (e.g. SMILES), as in this example: 
  4  >>> from rdkit.Chem import PandasTools 
  5  >>> import pandas as pd 
  6  >>> import os 
  7  >>> from rdkit import RDConfig 
  8  >>> antibiotics = pd.DataFrame(columns=['Name','Smiles']) 
  9  >>> antibiotics = antibiotics.append({'Smiles':'CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C','Name':'Penicilline G'}, ignore_index=True)#Penicilline G 
 10  >>> antibiotics = antibiotics.append({'Smiles':'CC1(C2CC3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4O)O)O)O)C(=O)N)N(C)C)O','Name':'Tetracycline'}, ignore_index=True)#Tetracycline 
 11  >>> antibiotics = antibiotics.append({'Smiles':'CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O)O)C','Name':'Ampicilline'}, ignore_index=True)#Ampicilline 
 12  >>> print([str(x) for x in  antibiotics.columns]) 
 13  ['Name', 'Smiles'] 
 14  >>> print(antibiotics) 
 15              Name                                             Smiles 
 16  0  Penicilline G    CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C 
 17  1   Tetracycline  CC1(C2CC3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4... 
 18  2  Ampicilline  CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O... 
 19   
 20  a new column can be created holding the respective RDKit molecule objects. The fingerprint can be included to accelerate substructure searches on the dataframe. 
 21   
 22  >>> PandasTools.AddMoleculeColumnToFrame(antibiotics,'Smiles','Molecule',includeFingerprints=True) 
 23  >>> print([str(x) for x in  antibiotics.columns]) 
 24  ['Name', 'Smiles', 'Molecule'] 
 25   
 26  A substructure filter can be applied on the dataframe using the RDKit molecule column, because the ">=" operator has been modified to work as a substructure check. 
 27  In this way, the antibiotics containing the beta-lactam ring "C1C(=O)NC1" can be obtained by 
 28   
 29  >>> beta_lactam = Chem.MolFromSmiles('C1C(=O)NC1') 
 30  >>> beta_lactam_antibiotics = antibiotics[antibiotics['Molecule'] >= beta_lactam] 
 31  >>> print(beta_lactam_antibiotics[['Name','Smiles']]) 
 32              Name                                             Smiles 
 33  0  Penicilline G    CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C 
 34  2  Ampicilline  CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O... 
 35   
 36   
 37  An SDF file can also be loaded into a dataframe. 
 38   
 39  >>> sdfFile = os.path.join(RDConfig.RDDataDir,'NCI/first_200.props.sdf') 
 40  >>> frame = PandasTools.LoadSDF(sdfFile,smilesName='SMILES',molColName='Molecule',includeFingerprints=True) 
 41  >>> frame.info # doctest: +SKIP 
 42  <bound method DataFrame.info of <class 'pandas.core.frame.DataFrame'> 
 43  Int64Index: 200 entries, 0 to 199 
 44  Data columns: 
 45  AMW                       200  non-null values 
 46  CLOGP                     200  non-null values 
 47  CP                        200  non-null values 
 48  CR                        200  non-null values 
 49  DAYLIGHT.FPG              200  non-null values 
 50  DAYLIGHT_CLOGP            200  non-null values 
 51  FP                        200  non-null values 
 52  ID                        200  non-null values 
 53  ISM                       200  non-null values 
 54  LIPINSKI_VIOLATIONS       200  non-null values 
 55  NUM_HACCEPTORS            200  non-null values 
 56  NUM_HDONORS               200  non-null values 
 57  NUM_HETEROATOMS           200  non-null values 
 58  NUM_LIPINSKIHACCEPTORS    200  non-null values 
 59  NUM_LIPINSKIHDONORS       200  non-null values 
 60  NUM_RINGS                 200  non-null values 
 61  NUM_ROTATABLEBONDS        200  non-null values 
 62  P1                        30  non-null values 
 63  SMILES                    200  non-null values 
 64  Molecule                  200  non-null values 
 65  dtypes: object(20)> 
 66   
 67  In order to support rendering the molecules as images in the HTML export of the dataframe, the __str__ method is monkey-patched to return a base64 encoded PNG: 
 68  >>> molX = Chem.MolFromSmiles('Fc1cNc2ccccc12') 
 69  >>> print(molX) # doctest: +SKIP 
 70  <img src="data:image/png;base64,..." alt="Mol"/> 
 71  This can be reverted using the ChangeMoleculeRendering method 
 72  >>> ChangeMoleculeRendering(renderer='String') 
 73  >>> print(molX) # doctest: +SKIP 
 74  <rdkit.Chem.rdchem.Mol object at 0x10d179440> 
 75  >>> ChangeMoleculeRendering(renderer='PNG') 
 76  >>> print(molX) # doctest: +SKIP 
 77  <img src="data:image/png;base64,..." alt="Mol"/> 
 78  ''' 
 79  from __future__ import print_function 
 80   
 81  from base64 import b64encode 
 82  import types,copy 
 83   
 84  from rdkit.six import BytesIO 
 85  from rdkit import Chem 
 86  from rdkit.Chem import Draw 
 87   
 88  try: 
 89    import pandas as pd 
 90    v = pd.version.version.split('.') 
 91    if v[0]=='0' and int(v[1])<10: 
 92      pd = None 
 93    else: 
 94      if 'display.width' in  pd.core.config._registered_options: 
 95        pd.set_option('display.width',1000000000) 
 96      if 'display.max_rows' in  pd.core.config._registered_options: 
 97        pd.set_option('display.max_rows',1000000000) 
 98      elif 'display.height' in  pd.core.config._registered_options: 
 99        pd.set_option('display.height',1000000000) 
100      if 'display.max_colwidth' in  pd.core.config._registered_options: 
101        pd.set_option('display.max_colwidth',1000000000) 
102      #saves the default pandas rendering to allow restauration 
103      defPandasRendering = pd.core.frame.DataFrame.to_html 
104  except Exception as e: 
105    pd = None 
106   
107  highlightSubstructures=True 
108   
109   
def patchPandasHTMLrepr(self,**kwargs):
  '''
  Replacement for DataFrame.to_html that switches off the escaping of HTML
  control characters, so that molecule images inside a dataframe are rendered.

  Keyword arguments are accepted but not used; a fixed set of formatter
  options is applied instead. Returns the generated HTML.
  '''
  formatterOptions = dict(buf=None,columns=None,col_space=None,colSpace=None,
                          header=True,index=True,na_rep='NaN',formatters=None,
                          float_format=None,sparsify=None,index_names=True,
                          justify=None,force_unicode=None,bold_rows=True,
                          classes=None,
                          escape=False) # the one deviation that lets <img> tags through
  htmlFormatter = pd.core.format.DataFrameFormatter(self,**formatterOptions)
  htmlFormatter.to_html()
  return htmlFormatter.buf.getvalue()
120
def patchPandasHeadMethod(self,n=5):
  '''Return the first n rows as a subframe that carries the patched
  "to_html" and "head" methods itself.
  '''
  subframe = self[:n]
  # bind both patches to the new instance so that they survive further head() calls
  for attrName,patch in (('to_html',patchPandasHTMLrepr),
                         ('head',patchPandasHeadMethod)):
    setattr(subframe,attrName,types.MethodType(patch,subframe))
  return subframe
128
129 -def _get_image(x):
130 """displayhook function for PIL Images, rendered as PNG""" 131 import pandas as pd 132 bio = BytesIO() 133 x.save(bio,format='PNG') 134 s = b64encode(bio.getvalue()).decode('ascii') 135 pd.set_option('display.max_columns',len(s)+1000) 136 pd.set_option('display.max_rows',len(s)+1000) 137 if len(s)+100 > pd.get_option("display.max_colwidth"): 138 pd.set_option("display.max_colwidth",len(s)+1000) 139 return s
from rdkit import DataStructs

# Fingerprint used to pre-screen substructure searches: the Avalon fingerprint
# when the Avalon toolkit can be imported, the RDKit pattern fingerprint otherwise.
try:
  from rdkit.Avalon import pyAvalonTools as pyAvalonTools

  def _fingerprinter(x,y):
    '''Avalon substructure-screening fingerprint of x; y is passed on as isQuery.'''
    return pyAvalonTools.GetAvalonFP(x,isQuery=y,bitFlags=pyAvalonTools.avalonSSSBits)
except ImportError:

  def _fingerprinter(x,y):
    '''RDKit pattern fingerprint (2048 bits) of x; y is not used.'''
    return Chem.PatternFingerprint(x,fpSize=2048)
149 -def _molge(x,y):
150 """Allows for substructure check using the >= operator (X has substructure Y -> X >= Y) by 151 monkey-patching the __ge__ function 152 This has the effect that the pandas/numpy rowfilter can be used for substructure filtering (filtered = dframe[dframe['RDKitColumn'] >= SubstructureMolecule]) 153 """ 154 if x is None or y is None: return False 155 if hasattr(x,'_substructfp'): 156 if not hasattr(y,'_substructfp'): 157 y._substructfp=_fingerprinter(y,True) 158 if not DataStructs.AllProbeBitsMatch(y._substructfp,x._substructfp): 159 return False 160 match = x.GetSubstructMatch(y) 161 if match: 162 if highlightSubstructures: 163 x.__sssAtoms=list(match) 164 else: 165 x.__sssAtoms=[] 166 return True 167 else: 168 return False
# Monkey-patch: "mol >= query" on RDKit molecules now runs the substructure
# check implemented by _molge.
Chem.Mol.__ge__ = _molge # lambda x,y: x.HasSubstructMatch(y)
def PrintAsBase64PNGString(x,renderer = None):
  '''Return an HTML <img> tag holding the molecule drawn as a base64 encoded PNG.

  Atoms stored on the molecule by a previous substructure match are
  highlighted when highlightSubstructures is enabled. The renderer
  argument is not used.
  '''
  atomsToHighlight = []
  if highlightSubstructures and hasattr(x,'__sssAtoms'):
    atomsToHighlight = x.__sssAtoms
  molImage = Draw.MolToImage(x,highlightAtoms=atomsToHighlight)
  return '<img src="data:image/png;base64,%s" alt="Mol"/>'%_get_image(molImage)
181 182
def PrintDefaultMolRep(x):
  '''Return the plain (non-image) representation of x, i.e. its __repr__, as a string.'''
  defaultRep = x.__repr__()
  return str(defaultRep)
# Monkey-patch: str(mol) returns an HTML <img> tag with the embedded PNG.
# ChangeMoleculeRendering(renderer='String') switches to the plain representation.
Chem.Mol.__str__ = PrintAsBase64PNGString
189 -def _MolPlusFingerprint(m):
190 '''Precomputes fingerprints and stores results in molecule objects to accelerate substructure matching 191 ''' 192 #m = Chem.MolFromSmiles(smi) 193 if m is not None: 194 m._substructfp=_fingerprinter(m,False) 195 return m
196
def RenderImagesInAllDataFrames(images=True):
  '''Switch the HTML rendering of ALL dataframes between the patched version
  (HTML characters are not escaped, so embedded images are shown) and the
  pandas default.

  IMPORTANT: THIS IS A GLOBAL CHANGE THAT WILL AFFECT THE COMPLETE PYTHON SESSION.
  To change the rendering of a single dataframe only, use the
  "ChangeMoleculeRendering" method instead.
  '''
  # only the selected renderer is looked up
  renderer = patchPandasHTMLrepr if images else defPandasRendering
  pd.core.frame.DataFrame.to_html = renderer
206 207
def AddMoleculeColumnToFrame(frame, smilesCol='Smiles', molCol = 'ROMol',includeFingerprints=False):
  '''Parse the SMILES found in column "smilesCol" into RDKit molecules and
  store them in column "molCol" of "frame" (the frame is modified in place).

  With includeFingerprints set, a fingerprint is computed and stored with each
  molecule to accelerate substructure matching. Image rendering is switched on
  for all dataframes as a side effect.
  '''
  def _rowToMol(row):
    mol = Chem.MolFromSmiles(row[smilesCol])
    if includeFingerprints:
      mol = _MolPlusFingerprint(mol)
    return mol

  frame[molCol]=frame.apply(_rowToMol, axis=1)
  RenderImagesInAllDataFrames(images=True)
217 #frame.to_html = types.MethodType(patchPandasHTMLrepr,frame) 218 #frame.head = types.MethodType(patchPandasHeadMethod,frame) 219 220
def ChangeMoleculeRendering(frame=None, renderer='PNG'):
  '''Switch the string rendering of molecules between base64 PNG images and
  the plain string representation.

  renderer='String' selects the plain representation; any other value
  selects PNG images. This allows to avoid the generation of images where
  they are not wanted. When "frame" is given, its to_html method is replaced
  by the patched one, which enables image rendering for a dataframe that
  already contains molecules without re-running the time-consuming
  AddMoleculeColumnToFrame. This is needed because some pandas methods, e.g.
  head(), return a new dataframe instance that uses the default pandas
  rendering (thus not drawing images for molecules).
  '''
  Chem.Mol.__str__ = PrintDefaultMolRep if renderer == 'String' else PrintAsBase64PNGString
  if frame is None:
    return
  frame.to_html = types.MethodType(patchPandasHTMLrepr,frame)
233
def LoadSDF(filename, idName='ID',molColName = 'ROMol',includeFingerprints=False, isomericSmiles=False, smilesName=None):
  """Read a file in SDF format and return its molecules as a pandas DataFrame.

  Arguments:
    filename: path of the SD file, or an already opened binary file object.
      A file opened here is always closed again; a file object handed in by
      the caller is left open.
    idName: column that receives the molecule title (the "_Name" property).
    molColName: column that receives the RDKit molecule objects.
    includeFingerprints: precompute and store a fingerprint with each molecule.
    isomericSmiles: passed on to Chem.MolToSmiles when smilesName is set.
    smilesName: if not None, column that receives a SMILES generated from the molecule.

  Returns a DataFrame with one row per readable molecule, indexed by the
  position of the record in the file (unreadable records are skipped), or
  None if no molecule could be read. Image rendering is switched on for all
  dataframes as a side effect.
  """
  openedHere = isinstance(filename, str)
  f = open(filename, 'rb') if openedHere else filename
  records = []
  indices = []
  try:
    for i, mol in enumerate(Chem.ForwardSDMolSupplier(f)):
      if mol is None:
        continue
      row = dict((k, mol.GetProp(k)) for k in mol.GetPropNames())
      if mol.HasProp('_Name'):
        row[idName] = mol.GetProp('_Name')
      if smilesName is not None:
        row[smilesName] = Chem.MolToSmiles(mol, isomericSmiles=isomericSmiles)
      row[molColName] = _MolPlusFingerprint(mol) if includeFingerprints else mol
      records.append(row)
      indices.append(i)
  finally:
    # never close a handle that belongs to the caller
    if openedHere:
      f.close()
  # build the frame once instead of appending row by row
  df = pd.DataFrame(records, index=indices) if records else None
  RenderImagesInAllDataFrames(images=True)
  return df
259 260 from rdkit.Chem import SDWriter 261
def WriteSDF(df,out,molColumn,properties=None,allNumeric=False,titleColumn=None):
  '''Write an SD file for the molecules in the dataframe.

  Arguments:
    df: the dataframe to export.
    out: output target handed to SDWriter.
    molColumn: name of the column holding the molecules.
    properties: names of the columns to export as SDF tags. The list passed
      in is not modified.
    allNumeric: additionally export all columns with a float or integer dtype.
    titleColumn: column to use as molecule title. "RowID" selects the
      dataframe row key. Titles are converted to strings.

  The molecule and title columns are never written as tags. The molecules in
  the dataframe are left untouched (deep copies are written).
  '''
  # work on a copy so that the caller's list is never modified
  properties = [] if properties is None else list(properties)
  if allNumeric:
    for colName in df.dtypes.keys():
      colType = df.dtypes[colName]
      isNumeric = np.issubdtype(colType,np.floating) or np.issubdtype(colType,np.integer)
      if isNumeric and colName not in properties:
        properties.append(colName)
  properties = [p for p in properties if p != molColumn and p != titleColumn]

  writer = SDWriter(out)
  try:
    writer.SetProps(properties)
    for rowId,row in df.iterrows():
      mol = copy.deepcopy(row[molColumn])
      if titleColumn is not None:
        title = rowId if titleColumn == 'RowID' else row[titleColumn]
        mol.SetProp('_Name',str(title))
      for p in properties:
        mol.SetProp(p,str(row[p]))
      writer.write(mol)
  finally:
    # flush and release the output even if a row could not be written
    writer.close()
from rdkit.Chem import SaltRemover
# salt stripper shared by all calls, created once at import time
remover = SaltRemover.SaltRemover()

def RemoveSaltsFromFrame(frame, molCol = 'ROMol'):
  '''
  Replaces the molecules in column molCol of the pandas DataFrame by their
  salt-stripped versions (the frame is modified in place).
  '''
  def _desalt(row):
    return remover.StripMol(row[molCol])

  frame[molCol] = frame.apply(_desalt, axis = 1)
300
def SaveSMILESFromFrame(frame, outFile, molCol='ROMol', NamesCol='', isomericSmiles=False):
  '''
  Writes a smi file with SMILES generated from the RDKit molecules in molCol.
  If NamesCol is given, its values (converted to strings) are set as the
  "_Name" property of the molecules before they are written.
  '''
  w = Chem.SmilesWriter(outFile, isomericSmiles=isomericSmiles)
  if NamesCol == '':
    for m in frame[molCol]:
      w.write(m)
  else:
    names = map(str,frame[NamesCol])
    for m,n in zip(frame[molCol], names):
      m.SetProp('_Name',n)
      w.write(m)
  w.close()
315 316 import numpy as np 317 import os 318 from rdkit.six.moves import cStringIO as StringIO 319
def SaveXlsxFromFrame(frame, outFile, molCol='ROMol', size=(300,300)):
  """
  Saves pandas DataFrame as a xlsx file with embedded images.

  Arguments:
    frame: the dataframe to export.
    outFile: target handed to xlsxwriter.Workbook.
    molCol: column holding the molecules; they are drawn into column A.
    size: (width, height) of the molecule images.

  It maps numpy data types to excel cell types:
    int, float -> number (NaN and infinite values are left as empty cells)
    datetime -> datetime
    object -> string (limited to 32k character - xlsx limitations)

  Cells with compound images are a bit larger than images due to excel.
  Column width weirdness explained (from xlsxwriter docs):
  The width corresponds to the column width value that is specified in Excel.
  It is approximately equal to the length of a string in the default font of Calibri 11.
  Unfortunately, there is no way to specify "AutoFit" for a column in the Excel file format.
  This feature is only available at runtime from within Excel.
  """

  import xlsxwriter # don't want to make this a RDKit dependency

  cols = list(frame.columns)
  cols.remove(molCol)
  dataTypes = dict(frame.dtypes)

  workbook = xlsxwriter.Workbook(outFile) # New workbook
  worksheet = workbook.add_worksheet() # New work sheet
  worksheet.set_column('A:A', size[0]/6.) # column width

  # Write first row with column names (column 0 is reserved for the images)
  for c2, x in enumerate(cols, 1):
    worksheet.write_string(0, c2, x)

  # NOTE(review): the image buffers are kept open until the workbook has been
  # closed, as the original did for its buffer - presumably xlsxwriter reads
  # them only when the file is written; confirm against the xlsxwriter docs.
  imageBuffers = []
  try:
    for c, (index, row) in enumerate(frame.iterrows(), 1):
      image_data = BytesIO() # PNG data is binary
      imageBuffers.append(image_data)
      img = Draw.MolToImage(row[molCol], size=size)
      img.save(image_data, format='PNG')

      worksheet.set_row(c, height=size[1]) # looks like height is not in px?
      worksheet.insert_image(c, 0, "f", {'image_data': image_data})

      for c2, x in enumerate(cols, 1):
        typeName = str(dataTypes[x])
        if typeName == "object":
          worksheet.write_string(c, c2, str(row[x])[:32000]) # string length is limited in xlsx
        elif ('float' in typeName) or ('int' in typeName):
          if np.isfinite(row[x]): # NaN/inf cannot be stored as xlsx numbers
            worksheet.write_number(c, c2, row[x])
        elif 'datetime' in typeName:
          worksheet.write_datetime(c, c2, row[x])

    workbook.close()
  finally:
    for buf in imageBuffers:
      buf.close()
375 376
def FrameToGridImage(frame, column = 'ROMol', legendsCol=None, **kwargs):
  '''
  Draw grid image of mols in pandas DataFrame.

  Arguments:
    frame: the dataframe holding the molecules.
    column: name of the column with the RDKit molecules.
    legendsCol: optional column whose values (converted to strings) are used
      as legends of the grid cells.
    kwargs: passed on to Draw.MolsToGridImage.

  Returns the image created by Draw.MolsToGridImage.
  '''
  # plain lists: molecules and legends are addressed by position, and a lazy
  # map object (Python 3) cannot be indexed
  mols = list(frame[column])
  if legendsCol:
    legends = [str(x) for x in frame[legendsCol]]
    return Draw.MolsToGridImage(mols, legends=legends, **kwargs)
  return Draw.MolsToGridImage(mols, **kwargs)
386 387 from rdkit.Chem.Scaffolds import MurckoScaffold 388
def AddMurckoToFrame(frame, molCol = 'ROMol', MurckoCol = 'Murcko_SMILES', Generic = False):
  '''
  Adds column MurckoCol with the SMILES of the Murcko scaffolds of the
  molecules in molCol to the pandas DataFrame. With Generic set to true the
  SMILES of the generic framework is stored instead.
  '''
  def _scaffoldSmiles(row):
    scaffold = MurckoScaffold.GetScaffoldForMol(row[molCol])
    if Generic:
      scaffold = MurckoScaffold.MakeScaffoldGeneric(scaffold)
    return Chem.MolToSmiles(scaffold)

  frame[MurckoCol] = frame.apply(_scaffoldSmiles, axis=1)
397 398 399 from rdkit.Chem import AllChem 400
def AlignMol(mol,scaffold):
  """
  Aligns the depiction of mol (RDKit mol object) to scaffold (SMILES string).

  The scaffold is parsed and given 2D coordinates, which are then used as
  the template for the depiction of mol. Returns mol.
  """
  template = Chem.MolFromSmiles(scaffold)
  AllChem.Compute2DCoords(template)
  AllChem.GenerateDepictionMatching2DStructure(mol,template)
  return mol
409
def AlignToScaffold(frame, molCol='ROMol', scaffoldCol='Murcko_SMILES'):
  '''
  Aligns the molecules in molCol to the scaffolds (SMILES) in scaffoldCol,
  row by row, and stores the result back into molCol.
  '''
  def _alignRow(row):
    return AlignMol(row[molCol],row[scaffoldCol])

  frame[molCol] = frame.apply(_alignRow, axis=1)
if __name__ == "__main__":
  # Run the doctests of the module docstring, unless pandas is unusable.
  import sys
  skipMessage = None
  if pd is None:
    skipMessage = "pandas installation not found, skipping tests"
  else:
    v = pd.version.version.split('.')
    if v[0]=='0' and int(v[1])<10:
      skipMessage = "pandas installation >=0.10 not found, skipping tests"
  if skipMessage is not None:
    print(skipMessage, file=sys.stderr)
  else:
    import doctest
    failed,tried=doctest.testmod(optionflags=doctest.ELLIPSIS|doctest.NORMALIZE_WHITESPACE)
    if failed:
      # the number of failed examples becomes the exit status
      sys.exit(failed)

# $Id$
#
#  Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc.
#  All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
#       copyright notice, this list of conditions and the following
#       disclaimer in the documentation and/or other materials provided
#       with the distribution.
#     * Neither the name of Novartis Institutes for BioMedical Research Inc.
#       nor the names of its contributors may be used to endorse or promote
#       products derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#