1 '''
2 Importing pandasTools enables several features that allow for using RDKit molecules as columns of a Pandas dataframe.
3 If a dataframe contains a molecule format in one of its columns (e.g. SMILES), as in this example:
4 >>> from rdkit.Chem import PandasTools
5 >>> import pandas as pd
6 >>> import os
7 >>> from rdkit import RDConfig
8 >>> antibiotics = pd.DataFrame(columns=['Name','Smiles'])
9 >>> antibiotics = antibiotics.append({'Smiles':'CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C','Name':'Penicilline G'}, ignore_index=True)#Penicilline G
10 >>> antibiotics = antibiotics.append({'Smiles':'CC1(C2CC3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4O)O)O)O)C(=O)N)N(C)C)O','Name':'Tetracycline'}, ignore_index=True)#Tetracycline
11 >>> antibiotics = antibiotics.append({'Smiles':'CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O)O)C','Name':'Ampicilline'}, ignore_index=True)#Ampicilline
12 >>> print([str(x) for x in antibiotics.columns])
13 ['Name', 'Smiles']
14 >>> print(antibiotics)
15 Name Smiles
16 0 Penicilline G CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C
17 1 Tetracycline CC1(C2CC3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C1C=CC=C4...
18 2 Ampicilline CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...
19
20 a new column can be created holding the respective RDKit molecule objects. The fingerprint can be included to accelerate substructure searches on the dataframe.
21
22 >>> PandasTools.AddMoleculeColumnToFrame(antibiotics,'Smiles','Molecule',includeFingerprints=True)
23 >>> print([str(x) for x in antibiotics.columns])
24 ['Name', 'Smiles', 'Molecule']
25
26 A substructure filter can be applied on the dataframe using the RDKit molecule column, because the ">=" operator has been modified to work as a substructure check.
27 In this way, the antibiotics containing the beta-lactam ring "C1C(=O)NC1" can be obtained by
28
29 >>> beta_lactam = Chem.MolFromSmiles('C1C(=O)NC1')
30 >>> beta_lactam_antibiotics = antibiotics[antibiotics['Molecule'] >= beta_lactam]
31 >>> print(beta_lactam_antibiotics[['Name','Smiles']])
32 Name Smiles
33 0 Penicilline G CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C
34 2 Ampicilline CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...
35
36
37 It is also possible to load an SDF file into a dataframe.
38
39 >>> sdfFile = os.path.join(RDConfig.RDDataDir,'NCI/first_200.props.sdf')
40 >>> frame = PandasTools.LoadSDF(sdfFile,smilesName='SMILES',molColName='Molecule',includeFingerprints=True)
41 >>> frame.info # doctest: +SKIP
42 <bound method DataFrame.info of <class 'pandas.core.frame.DataFrame'>
43 Int64Index: 200 entries, 0 to 199
44 Data columns:
45 AMW 200 non-null values
46 CLOGP 200 non-null values
47 CP 200 non-null values
48 CR 200 non-null values
49 DAYLIGHT.FPG 200 non-null values
50 DAYLIGHT_CLOGP 200 non-null values
51 FP 200 non-null values
52 ID 200 non-null values
53 ISM 200 non-null values
54 LIPINSKI_VIOLATIONS 200 non-null values
55 NUM_HACCEPTORS 200 non-null values
56 NUM_HDONORS 200 non-null values
57 NUM_HETEROATOMS 200 non-null values
58 NUM_LIPINSKIHACCEPTORS 200 non-null values
59 NUM_LIPINSKIHDONORS 200 non-null values
60 NUM_RINGS 200 non-null values
61 NUM_ROTATABLEBONDS 200 non-null values
62 P1 30 non-null values
63 SMILES 200 non-null values
64 Molecule 200 non-null values
65 dtypes: object(20)>
66
67 In order to support rendering the molecules as images in the HTML export of the dataframe, the __str__ method is monkey-patched to return a base64 encoded PNG:
68 >>> molX = Chem.MolFromSmiles('Fc1cNc2ccccc12')
69 >>> print(molX) # doctest: +SKIP
70 <img src="data:image/png;base64,..." alt="Mol"/>
71 This can be reverted using the ChangeMoleculeRendering method
72 >>> ChangeMoleculeRendering(renderer='String')
73 >>> print(molX) # doctest: +SKIP
74 <rdkit.Chem.rdchem.Mol object at 0x10d179440>
75 >>> ChangeMoleculeRendering(renderer='PNG')
76 >>> print(molX) # doctest: +SKIP
77 <img src="data:image/png;base64,..." alt="Mol"/>
78 '''
79 from __future__ import print_function
80
81 from base64 import b64encode
82 import types,copy
83
84 from rdkit.six import BytesIO
85 from rdkit import Chem
86 from rdkit.Chem import Draw
87
# pandas is an optional dependency: when it is missing (or older than 0.10)
# ``pd`` is set to None and the rest of the module can test for that.
try:
    import pandas as pd
except ImportError:
    pd = None

# Default DataFrame.to_html, saved so the patched rendering can be reverted.
# Stays None when no usable pandas is available.
defPandasRendering = None

if pd is not None:
    # ``pd.__version__`` is used instead of ``pd.version.version``: the
    # ``pandas.version`` module does not exist in current pandas releases and
    # accessing it would silently disable the whole module.
    v = pd.__version__.split('.')
    if v[0] == '0' and len(v) > 1 and v[1].isdigit() and int(v[1]) < 10:
        pd = None


def _set_pandas_option(name, value):
    """Try to set the pandas option *name*; return True on success.

    Returns False when this pandas version does not know the option
    (``set_option`` raises a KeyError/AttributeError based error then), which
    replaces the former lookup in the private ``_registered_options`` table.
    """
    try:
        pd.set_option(name, value)
    except (KeyError, AttributeError):
        return False
    return True


if pd is not None:
    # Effectively remove the display limits so that long cell contents
    # (base64 encoded images) are not truncated.
    _set_pandas_option('display.width', 1000000000)
    # 'display.height' is only tried when 'display.max_rows' is unavailable.
    if not _set_pandas_option('display.max_rows', 1000000000):
        _set_pandas_option('display.height', 1000000000)
    _set_pandas_option('display.max_colwidth', 1000000000)

    defPandasRendering = pd.core.frame.DataFrame.to_html
106
107 highlightSubstructures=True
108
109
111 '''
112 Patched default escaping of HTML control characters to allow molecule image rendering dataframes
113 '''
114 formatter = pd.core.format.DataFrameFormatter(self,buf=None,columns=None,col_space=None,colSpace=None,header=True,index=True,
115 na_rep='NaN',formatters=None,float_format=None,sparsify=None,index_names=True,
116 justify = None, force_unicode=None,bold_rows=True,classes=None,escape=False)
117 formatter.to_html()
118 html = formatter.buf.getvalue()
119 return html
120
122 '''Ensure inheritance of patched to_html in "head" subframe
123 '''
124 df = self[:n]
125 df.to_html = types.MethodType(patchPandasHTMLrepr,df)
126 df.head = types.MethodType(patchPandasHeadMethod,df)
127 return df
128
130 """displayhook function for PIL Images, rendered as PNG"""
131 import pandas as pd
132 bio = BytesIO()
133 x.save(bio,format='PNG')
134 s = b64encode(bio.getvalue()).decode('ascii')
135 pd.set_option('display.max_columns',len(s)+1000)
136 pd.set_option('display.max_rows',len(s)+1000)
137 if len(s)+100 > pd.get_option("display.max_colwidth"):
138 pd.set_option("display.max_colwidth",len(s)+1000)
139 return s
140
141 from rdkit import DataStructs
142
# Pick the fingerprint function used to pre-screen substructure matches:
# Avalon fingerprints when the Avalon toolkit can be imported, RDKit pattern
# fingerprints otherwise. Both variants take (molecule, isQuery).
try:
    from rdkit.Avalon import pyAvalonTools as pyAvalonTools
except ImportError:
    def _fingerprinter(x, y):
        # the query flag is not used by the pattern fingerprint
        return Chem.PatternFingerprint(x, fpSize=2048)
else:
    def _fingerprinter(x, y):
        return pyAvalonTools.GetAvalonFP(x, isQuery=y,
                                         bitFlags=pyAvalonTools.avalonSSSBits)
148
150 """Allows for substructure check using the >= operator (X has substructure Y -> X >= Y) by
151 monkey-patching the __ge__ function
152 This has the effect that the pandas/numpy rowfilter can be used for substructure filtering (filtered = dframe[dframe['RDKitColumn'] >= SubstructureMolecule])
153 """
154 if x is None or y is None: return False
155 if hasattr(x,'_substructfp'):
156 if not hasattr(y,'_substructfp'):
157 y._substructfp=_fingerprinter(y,True)
158 if not DataStructs.AllProbeBitsMatch(y._substructfp,x._substructfp):
159 return False
160 match = x.GetSubstructMatch(y)
161 if match:
162 if highlightSubstructures:
163 x.__sssAtoms=list(match)
164 else:
165 x.__sssAtoms=[]
166 return True
167 else:
168 return False
169
170
171 Chem.Mol.__ge__ = _molge
172
174 '''returns the molecules as base64 encoded PNG image
175 '''
176 if highlightSubstructures and hasattr(x,'__sssAtoms'):
177 highlightAtoms=x.__sssAtoms
178 else:
179 highlightAtoms=[]
180 return '<img src="data:image/png;base64,%s" alt="Mol"/>'%_get_image(Draw.MolToImage(x,highlightAtoms=highlightAtoms))
181
182
185
186
187 Chem.Mol.__str__ = PrintAsBase64PNGString
188
190 '''Precomputes fingerprints and stores results in molecule objects to accelerate substructure matching
191 '''
192
193 if m is not None:
194 m._substructfp=_fingerprinter(m,False)
195 return m
196
198 '''Changes the default dataframe rendering to not escape HTML characters, thus allowing rendered images in all dataframes.
199 IMPORTANT: THIS IS A GLOBAL CHANGE THAT WILL AFFECT THE COMPLETE PYTHON SESSION. If you want to change the rendering only
200 for a single dataframe use the "ChangeMoleculeRendering" method instead.
201 '''
202 if images:
203 pd.core.frame.DataFrame.to_html = patchPandasHTMLrepr
204 else:
205 pd.core.frame.DataFrame.to_html = defPandasRendering
206
207
209 '''Converts the molecules contained in "smilesCol" to RDKit molecules and appends them to the dataframe "frame" using the specified column name.
210 If desired, a fingerprint can be computed and stored with the molecule objects to accelerate substructure matching
211 '''
212 if not includeFingerprints:
213 frame[molCol]=frame.apply(lambda x: Chem.MolFromSmiles(x[smilesCol]), axis=1)
214 else:
215 frame[molCol]=frame.apply(lambda x: _MolPlusFingerprint(Chem.MolFromSmiles(x[smilesCol])), axis=1)
216 RenderImagesInAllDataFrames(images=True)
217
218
219
220
222 '''Allows to change the rendering of the molecules between base64 PNG images and string representations.
223 This serves two purposes: First it allows to avoid the generation of images if this is not desired and, secondly, it allows to enable image rendering for
224 newly created dataframe that already contains molecules, without having to rerun the time-consuming AddMoleculeColumnToFrame. Note: this behaviour is, because some pandas methods, e.g. head()
225 returns a new dataframe instance that uses the default pandas rendering (thus not drawing images for molecules) instead of the monkey-patched one.
226 '''
227 if renderer == 'String':
228 Chem.Mol.__str__ = PrintDefaultMolRep
229 else:
230 Chem.Mol.__str__ = PrintAsBase64PNGString
231 if frame is not None:
232 frame.to_html = types.MethodType(patchPandasHTMLrepr,frame)
233
def LoadSDF(filename, idName='ID', molColName='ROMol', includeFingerprints=False,
            isomericSmiles=False, smilesName=None):
    """Read a file in SDF format and return it as a pandas DataFrame.

    Each molecule that can be parsed becomes one row, indexed by its position
    in the file; records that fail to parse are skipped.

    Arguments:
      filename: path of the SD file (opened in binary mode) or an already
        opened file-like object. The file object is closed before returning,
        also when it was supplied by the caller.
      idName: column receiving the molecule title (the ``_Name`` property).
      molColName: column receiving the RDKit molecule objects.
      includeFingerprints: if True, molecules are passed through
        ``_MolPlusFingerprint`` before being stored.
      isomericSmiles: forwarded to ``Chem.MolToSmiles``.
      smilesName: if not None, name of an extra column holding the SMILES
        generated from each molecule.

    Returns:
      The DataFrame, or None if no molecule could be read.
    """
    if isinstance(filename, str):
        f = open(filename, 'rb')
    else:
        f = filename
    rows = []
    try:
        for i, mol in enumerate(Chem.ForwardSDMolSupplier(f)):
            if mol is None:
                continue
            row = dict((k, mol.GetProp(k)) for k in mol.GetPropNames())
            if mol.HasProp('_Name'):
                row[idName] = mol.GetProp('_Name')
            if smilesName is not None:
                row[smilesName] = Chem.MolToSmiles(mol, isomericSmiles=isomericSmiles)
            if not includeFingerprints:
                row[molColName] = mol
            else:
                row[molColName] = _MolPlusFingerprint(mol)
            rows.append(pd.DataFrame(row, index=[i]))
    finally:
        # release the file handle even if parsing a record raised
        f.close()
    # Concatenate once instead of appending row by row, which copied the
    # whole frame for every molecule.
    df = pd.concat(rows) if rows else None
    RenderImagesInAllDataFrames(images=True)
    return df
259
260 from rdkit.Chem import SDWriter
261
def WriteSDF(df, out, molColumn, properties=None, allNumeric=False, titleColumn=None):
    '''Write an SD file for the molecules in the dataframe.

    Arguments:
      df: dataframe holding the molecules.
      out: output target handed to ``SDWriter``.
      molColumn: name of the column containing the RDKit molecules.
      properties: names of dataframe columns to export as SDF tags. The list
        passed in is not modified.
      allNumeric: if True, all float and integer columns are exported in
        addition to "properties".
      titleColumn: column used as molecule title. The special value "RowID"
        uses the dataframe row key as title.

    The molecule column and the title column are never written as tags.
    Molecules are deep-copied before properties are set on them, so the
    objects stored in the dataframe are left untouched.
    '''
    # Work on a copy so that the caller's list is never mutated.
    props = [] if properties is None else list(properties)
    if allNumeric:
        for col in df.dtypes.keys():
            dtype = df.dtypes[col]
            isNumeric = np.issubdtype(dtype, np.floating) or np.issubdtype(dtype, np.integer)
            if isNumeric and col not in props:
                props.append(col)
    props = [p for p in props if p != molColumn and p != titleColumn]

    writer = SDWriter(out)
    try:
        writer.SetProps(props)
        for rowId, row in df.iterrows():
            mol = copy.deepcopy(row[molColumn])
            if titleColumn is not None:
                if titleColumn == 'RowID':
                    mol.SetProp('_Name', str(rowId))
                else:
                    title = row[titleColumn]
                    # titles taken from non-string columns (e.g. numeric ids)
                    # are converted, like the property values below
                    if not isinstance(title, str):
                        title = str(title)
                    mol.SetProp('_Name', title)
            for p in props:
                mol.SetProp(p, str(row[p]))
            writer.write(mol)
    finally:
        # close the writer even if a row could not be written
        writer.close()
289
290
291
292 from rdkit.Chem import SaltRemover
293 remover = SaltRemover.SaltRemover()
294
296 '''
297 Removes salts from mols in pandas DataFrame's ROMol column
298 '''
299 frame[molCol] = frame.apply(lambda x: remover.StripMol(x[molCol]), axis = 1)
300
302 '''
303 Saves smi file. SMILES are generated from column with RDKit molecules. Column with names is optional.
304 '''
305 w = Chem.SmilesWriter(outFile, isomericSmiles=isomericSmiles)
306 if NamesCol != '':
307 for m,n in zip(frame[molCol], map(str,frame[NamesCol])):
308 m.SetProp('_Name',n)
309 w.write(m)
310 w.close()
311 else:
312 for m in frame[molCol]:
313 w.write(m)
314 w.close()
315
316 import numpy as np
317 import os
318 from rdkit.six.moves import cStringIO as StringIO
319
321 """
322 Saves pandas DataFrame as a xlsx file with embedded images.
323 It maps numpy data types to excel cell types:
324 int, float -> number
325 datetime -> datetime
326 object -> string (limited to 32k character - xlsx limitations)
327
328 Cells with compound images are a bit larger than images due to excel.
329 Column width weirdness explained (from xlsxwriter docs):
330 The width corresponds to the column width value that is specified in Excel.
331 It is approximately equal to the length of a string in the default font of Calibri 11.
332 Unfortunately, there is no way to specify "AutoFit" for a column in the Excel file format.
333 This feature is only available at runtime from within Excel.
334 """
335
336 import xlsxwriter
337
338 cols = list(frame.columns)
339 cols.remove(molCol)
340 dataTypes = dict(frame.dtypes)
341
342 workbook = xlsxwriter.Workbook(outFile)
343 worksheet = workbook.add_worksheet()
344 worksheet.set_column('A:A', size[0]/6.)
345
346
347 c2 = 1
348 for x in cols:
349 worksheet.write_string(0, c2, x)
350 c2 += 1
351
352 c = 1
353 for index, row in frame.iterrows():
354 image_data = StringIO()
355 img = Draw.MolToImage(row[molCol], size=size)
356 img.save(image_data, format='PNG')
357
358 worksheet.set_row(c, height=size[1])
359 worksheet.insert_image(c, 0, "f", {'image_data': image_data})
360
361 c2 = 1
362 for x in cols:
363 if str(dataTypes[x]) == "object":
364 worksheet.write_string(c, c2, str(row[x])[:32000])
365 elif ('float' in str(dataTypes[x])) or ('int' in str(dataTypes[x])):
366 if (row[x] != np.nan) or (row[x] != np.inf):
367 worksheet.write_number(c, c2, row[x])
368 elif 'datetime' in str(dataTypes[x]):
369 worksheet.write_datetime(c, c2, row[x])
370 c2 += 1
371 c += 1
372
373 workbook.close()
374 image_data.close()
375
376
378 '''
379 Draw grid image of mols in pandas DataFrame.
380 '''
381 if legendsCol:
382 img = Draw.MolsToGridImage(frame[column], legends=map(str, list(frame[legendsCol])), **kwargs)
383 else:
384 img = Draw.MolsToGridImage(frame[column], **kwargs)
385 return img
386
387 from rdkit.Chem.Scaffolds import MurckoScaffold
388
389 -def AddMurckoToFrame(frame, molCol = 'ROMol', MurckoCol = 'Murcko_SMILES', Generic = False):
397
398
399 from rdkit.Chem import AllChem
400
409
411 '''
412 Aligns molecules in molCol to scaffolds in scaffoldCol
413 '''
414 frame[molCol] = frame.apply(lambda x: AlignMol(x[molCol],x[scaffoldCol]), axis=1)
415
416
# Run the module doctests when executed as a script; the tests are skipped
# (with a note on stderr) if no suitable pandas installation is available.
if __name__ == "__main__":
    import sys
    if pd is None:
        print("pandas installation not found, skipping tests", file=sys.stderr)
    else:
        # pd.__version__ instead of pd.version.version: the pandas.version
        # module is not available in current pandas releases.
        v = pd.__version__.split('.')
        if v[0] == '0' and int(v[1]) < 10:
            print("pandas installation >=0.10 not found, skipping tests",
                  file=sys.stderr)
        else:
            import doctest
            failed, tried = doctest.testmod(
                optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
            if failed:
                # the number of failed examples is used as exit status
                sys.exit(failed)
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463