1
2
3
4
5
6
7
8
9
10
11 """ utility functionality for fingerprinting sets of molecules
12 includes a command line app for working with fingerprints
13 and databases
14
15
16 Sample Usage:
17
18 python FingerprintMols.py -d data.gdb \
19 -t 'raw_dop_data' --smilesName="Structure" --idName="Mol_ID" \
20 --outTable="daylight_sig"
21
22
23 """
24 from __future__ import print_function
25 from rdkit import Chem
26 from rdkit.Chem import MACCSkeys
27 from rdkit.ML.Cluster import Murtagh
28 from rdkit import DataStructs
29 import sys
30 from rdkit.six.moves import cPickle
31
# Version banner taken from the version-control "$Id$" keyword: the text
# between the first ':' and the last '$'.  When the keyword has not been
# expanded there is no ':' (find() returns -1, so idx1 is 0) and the banner
# is simply "$Id".
_cvsVersion = "$Id$"
idx1 = _cvsVersion.find(':') + 1
idx2 = _cvsVersion.rfind('$')
__VERSION_STRING = "%s" % (_cvsVersion[idx1:idx2])
36
37
42
47
# NOTE(review): the ``def`` line is missing from the listing this block was
# recovered from; the signature is reconstructed from the call
# ``FoldFingerprintToTargetDensity(fp,**fpArgs)`` elsewhere in this file
# -- confirm against the real source.
def FoldFingerprintToTargetDensity(fp, **fpArgs):
    """ folds a fingerprint in half until its bit density reaches the target

    **Arguments**:

      - fp: the fingerprint; must provide GetNumOnBits() and GetNumBits()

      - fpArgs: must contain the keys 'tgtDensity' (fraction of bits that
        should be on) and 'minSize' (folding stops once another fold would
        not leave more than this many bits)

    **Returns**: the (possibly folded) fingerprint.  The input object itself
      is returned when no folding was done.

    """
    nOn = fp.GetNumOnBits()
    nTot = fp.GetNumBits()
    # the nTot guard keeps a zero-length fingerprint from raising
    # ZeroDivisionError; there is nothing to fold in that case
    while nTot > 0 and float(nOn) / nTot < fpArgs['tgtDensity']:
        if nTot / 2 <= fpArgs['minSize']:
            # one more fold would not stay above the minimum size
            break
        fp = DataStructs.FoldFingerprint(fp, 2)
        nOn = fp.GetNumOnBits()
        nTot = fp.GetNumBits()
    return fp
59
# NOTE(review): the ``def`` line is missing from the listing this block was
# recovered from.  Name and parameter order come from the call
# ``FingerprintMol(mol,fingerprinter,**fpArgs)`` in this file; the default
# for ``fingerprinter`` mirrors the sibling FingerprintsFrom* functions
# -- confirm against the real source.
def FingerprintMol(mol, fingerprinter=Chem.RDKFingerprint, **fpArgs):
    """ generates the fingerprint for a single molecule

    **Arguments**:

      - mol: the molecule to be fingerprinted

      - fingerprinter: the callable used to generate the fingerprint

      - fpArgs: fingerprinting parameters.  If none are provided, the
        attributes of a default _FingerprinterDetails_ instance are used.

    **Returns**: the fingerprint

    **Notes**:

      - _Chem.RDKFingerprint_ is called with positional arguments (minPath,
        maxPath, fpSize, bitsPerHash, useHs, tgtDensity, minSize); no
        separate folding call is made for it.

      - any other fingerprinter receives all of fpArgs as keyword arguments
        and its result is passed to FoldFingerprintToTargetDensity().

    """
    if not fpArgs:
        fpArgs = FingerprinterDetails().__dict__

    if fingerprinter == Chem.RDKFingerprint:
        return fingerprinter(mol, fpArgs['minPath'], fpArgs['maxPath'],
                             fpArgs['fpSize'], fpArgs['bitsPerHash'],
                             fpArgs['useHs'], fpArgs['tgtDensity'],
                             fpArgs['minSize'])

    fp = fingerprinter(mol, **fpArgs)
    return FoldFingerprintToTargetDensity(fp, **fpArgs)
76
77
def FingerprintsFromSmiles(dataSource, idCol, smiCol,
                           fingerprinter=Chem.RDKFingerprint,
                           reportFreq=10, maxMols=-1,
                           **fpArgs):
    """ fpArgs are passed as keyword arguments to the fingerprinter

    **Arguments**:

      - dataSource: iterable of rows; each row is indexed with idCol and
        smiCol

      - idCol, smiCol: indices of the id and SMILES entries in each row

      - fingerprinter: the callable handed to FingerprintMol()

      - reportFreq: a progress message is sent after every reportFreq
        molecules (values <= 0 disable the messages)

      - maxMols: stop after this many molecules have been fingerprinted
        (values <= 0 mean no limit)

    Returns a list of 2-tuples: (id,fp)

    """
    res = []
    nDone = 0
    for entry in dataSource:
        molId, smi = str(entry[idCol]), str(entry[smiCol])
        try:
            mol = Chem.MolFromSmiles(smi)
        except Exception:
            # a parser failure is reported below like any other bad SMILES;
            # KeyboardInterrupt/SystemExit are deliberately not swallowed
            mol = None
        if not mol:
            error('Problems parsing SMILES: %s\n' % smi)
            continue

        fp = FingerprintMol(mol, fingerprinter, **fpArgs)
        res.append((molId, fp))
        nDone += 1
        if reportFreq > 0 and not nDone % reportFreq:
            message('Done %d molecules\n' % (nDone))
        if maxMols > 0 and nDone >= maxMols:
            break
    return res
106
# NOTE(review): the ``def`` lines are missing from the listing this block was
# recovered from.  The signature is reconstructed from the names used in the
# body and from the parallel FingerprintsFromSmiles/FingerprintsFromPickles
# signatures -- confirm against the real source.
def FingerprintsFromMols(mols,
                         fingerprinter=Chem.RDKFingerprint,
                         reportFreq=10, maxMols=-1,
                         **fpArgs):
    """ fpArgs are passed as keyword arguments to the fingerprinter

    **Arguments**:

      - mols: iterable of (id,mol) 2-tuples

      - fingerprinter: the callable handed to FingerprintMol()

      - reportFreq: a progress message is sent after every reportFreq
        molecules (values <= 0 disable the messages)

      - maxMols: stop after this many molecules have been fingerprinted
        (values <= 0 mean no limit)

    Returns a list of 2-tuples: (id,fp)

    """
    res = []
    nDone = 0
    for molId, mol in mols:
        if not mol:
            # the original message interpolated an undefined name (smi),
            # so this path raised NameError instead of reporting the problem
            error('Problems processing molecule for id: %s\n' % molId)
            continue

        fp = FingerprintMol(mol, fingerprinter, **fpArgs)
        res.append((molId, fp))
        nDone += 1
        if reportFreq > 0 and not nDone % reportFreq:
            message('Done %d molecules\n' % (nDone))
        if maxMols > 0 and nDone >= maxMols:
            break
    return res
130
def FingerprintsFromPickles(dataSource, idCol, pklCol,
                            fingerprinter=Chem.RDKFingerprint,
                            reportFreq=10, maxMols=-1,
                            **fpArgs):
    """ fpArgs are passed as keyword arguments to the fingerprinter

    **Arguments**:

      - dataSource: iterable of rows; each row is indexed with idCol and
        pklCol

      - idCol, pklCol: indices of the id and molecule-pickle entries in
        each row

      - fingerprinter: the callable handed to FingerprintMol()

      - reportFreq: a progress message is sent after every reportFreq
        molecules (values <= 0 disable the messages)

      - maxMols: stop after this many molecules have been fingerprinted
        (values <= 0 mean no limit)

    Returns a list of 2-tuples: (id,fp)

    """
    res = []
    nDone = 0
    for entry in dataSource:
        # NOTE(review): str() on the pickle entry presumably targets
        # Python 2 buffer objects; under Python 3 str(bytes) produces a
        # repr-style string -- verify against the data source in use.
        molId, pkl = str(entry[idCol]), str(entry[pklCol])
        try:
            mol = Chem.Mol(pkl)
        except Exception:
            # reported below; KeyboardInterrupt/SystemExit are not swallowed
            mol = None
        if not mol:
            error('Problems parsing pickle for id: %s\n' % molId)
            continue

        fp = FingerprintMol(mol, fingerprinter, **fpArgs)
        res.append((molId, fp))
        nDone += 1
        if reportFreq > 0 and not nDone % reportFreq:
            message('Done %d molecules\n' % (nDone))
        if maxMols > 0 and nDone >= maxMols:
            break
    return res
159
# NOTE(review): the ``def`` line is missing from the listing this block was
# recovered from.  The signature is reconstructed from the call
# ``FingerprintsFromDetails(details)`` and from the otherwise-unbound name
# ``reportFreq`` used in the body; its default of 10 mirrors the sibling
# functions -- confirm against the real source.
def FingerprintsFromDetails(details, reportFreq=10):
    """ reads molecules as described by _details_, fingerprints them and
    optionally stores the results

    **Arguments**:

      - details: a _FingerprinterDetails_-style object.  Its attributes
        select the input (dbName+tableName, or inFileName with useSmiles
        or useSD) and the outputs (outFileName, outTableName with
        outDbName/dbName).  details.idName is filled in when it is empty.

      - reportFreq: while reading an SD file a progress message is sent
        after every reportFreq molecules (values <= 0 disable them)

    **Returns**: the list of (id,fp) 2-tuples, or None if no input could
      be read

    **Notes**:

      - all attributes of _details_ are forwarded as keyword arguments to
        the FingerprintsFrom* function that is used.

    """
    data = None
    # bound up front so that a failed read falls through to "no data"
    # instead of raising UnboundLocalError further down
    dataSet = None
    idCol = 0
    smiCol = 1

    if details.dbName and details.tableName:
        from rdkit.Dbase.DbConnection import DbConnect
        from rdkit.Dbase import DbInfo
        from rdkit.ML.Data import DataUtils
        try:
            # only used to report connection problems early
            DbConnect(details.dbName, details.tableName)
        except Exception:
            import traceback
            error('Problems establishing connection to database: %s|%s\n' %
                  (details.dbName, details.tableName))
            traceback.print_exc()
        if not details.idName:
            details.idName = DbInfo.GetColumnNames(details.dbName,
                                                   details.tableName)[0]
        dataSet = DataUtils.DBToData(details.dbName, details.tableName,
                                     what='%s,%s' % (details.idName,
                                                     details.smilesName))
    elif details.inFileName and details.useSmiles:
        from rdkit.ML.Data import DataUtils
        if not details.idName:
            details.idName = 'ID'
        try:
            dataSet = DataUtils.TextFileToData(
                details.inFileName,
                onlyCols=[details.idName, details.smilesName])
        except IOError:
            import traceback
            error('Problems reading from file %s\n' % (details.inFileName))
            traceback.print_exc()
    elif details.inFileName and details.useSD:
        if not details.idName:
            details.idName = 'ID'
        mols = []
        try:
            suppl = Chem.SDMolSupplier(details.inFileName)
        except Exception:
            import traceback
            error('Problems reading from file %s\n' % (details.inFileName))
            traceback.print_exc()
        else:
            # plain iteration replaces the Python-2-only suppl.next() loop
            for m in suppl:
                if not m:
                    continue
                mols.append(m)
                if reportFreq > 0 and not len(mols) % reportFreq:
                    message('Read %d molecules\n' % (len(mols)))
                # NOTE(review): indentation was lost in the recovered
                # listing; the limit is checked after every molecule here,
                # matching the sibling functions -- confirm.
                if details.maxMols > 0 and len(mols) >= details.maxMols:
                    break

        # label each molecule with the id property, falling back to _Name
        dataSet = []
        for mol in mols:
            if mol.HasProp(details.idName):
                nm = mol.GetProp(details.idName)
            else:
                nm = mol.GetProp('_Name')
            dataSet.append((nm, mol))

    fps = None
    if dataSet:
        # direct calls with ** replace apply(), which Python 3 removed
        if details.useSD:
            fps = FingerprintsFromMols(dataSet, **details.__dict__)
        else:
            data = dataSet.GetNamedData()
            if details.molPklName:
                fps = FingerprintsFromPickles(data, idCol, smiCol,
                                              **details.__dict__)
            else:
                fps = FingerprintsFromSmiles(data, idCol, smiCol,
                                             **details.__dict__)

    if fps:
        if details.outFileName:
            # one pickle per (id,fp) entry; the file is closed even if a
            # dump fails
            with open(details.outFileName, 'wb+') as outF:
                for entry in fps:
                    cPickle.dump(entry, outF)

        dbName = details.outDbName or details.dbName
        if details.outTableName and dbName:
            from rdkit.Dbase.DbConnection import DbConnect
            from rdkit.Dbase import DbUtils, DbModule
            conn = DbConnect(dbName)

            # NOTE(review): _data_ is only set on the db/text-file paths;
            # with SD input it is still None and the next line fails --
            # needs a decision on how the id column type should be found.
            colTypes = DbUtils.TypeFinder(data, len(data), len(data[0]))
            typeStrs = DbUtils.GetTypeStrings(
                [details.idName, details.smilesName], colTypes,
                keyCol=details.idName)
            cols = '%s, %s %s' % (typeStrs[0], details.fpColName,
                                  DbModule.binaryTypeName)

            # the table list is only queried when the table is being kept
            if details.replaceTable or \
               details.outTableName.upper() not in [x.upper() for x in conn.GetTableNames()]:
                conn.AddTable(details.outTableName, cols)

            for molId, fp in fps:
                tpl = molId, DbModule.binaryHolder(fp.ToBinary())
                conn.InsertData(details.outTableName, tpl)
            conn.Commit()
    return fps
279
280
281
282
283
284
286 """ class for storing the details of a fingerprinting run,
287 generates sensible defaults on construction
288
289 """
294
296 self.fingerprinter = Chem.RDKFingerprint
297 self.fpColName="AutoFragmentFP"
298 self.idName=''
299 self.dbName=''
300 self.outDbName=''
301 self.tableName=''
302 self.minSize=64
303 self.fpSize=2048
304 self.tgtDensity=0.3
305 self.minPath=1
306 self.maxPath=7
307 self.discrimHash=0
308 self.useHs=0
309 self.useValence=0
310 self.bitsPerHash=2
311 self.smilesName='SMILES'
312 self.maxMols=-1
313 self.outFileName=''
314 self.outTableName=''
315 self.inFileName=''
316 self.replaceTable=True
317 self.molPklName=''
318 self.useSmiles=True
319 self.useSD=False
320
322 self.metric = DataStructs.TanimotoSimilarity
323 self.doScreen=''
324 self.topN=10
325 self.screenThresh=0.75
326 self.doThreshold=0
327 self.smilesTableName=''
328 self.probeSmiles=''
329 self.probeMol=None
330 self.noPickle=0
331
333 self.clusterAlgo = Murtagh.WARDS
334 self.actTableName = ''
335 self.actName = ''
336
356
# NOTE(review): the ``def`` line is missing from the listing this block was
# recovered from; the name and the empty parameter list come from the
# ``Usage()`` calls in this file -- confirm against the real source.
def Usage():
    """ prints a usage string and exits

    """
    print(_usageDoc)
    sys.exit(-1)


_usageDoc = """
Usage: FingerprintMols.py [args] <fName>

  If <fName> is provided and no tableName is specified (see below),
  data will be read from the text file <fName>.  Text files delimited
  with either commas (extension .csv) or tabs (extension .txt) are
  supported.

  Command line arguments are:
    - -d _dbName_: set the name of the database from which
      to pull input molecule information.  If output is
      going to a database, this will also be used for that
      unless the --outDbName option is used.

    - -t _tableName_: set the name of the database table
      from which to pull input molecule information

    - --smilesName=val: sets the name of the SMILES column
      in the input database.  Default is *SMILES*.

    - --useSD:  Assume that the input file is an SD file, not a SMILES
       table.

    - --idName=val: sets the name of the id column in the input
      database.  Defaults to be the name of the first db column
      (or *ID* for text files).

    - -o _outFileName_:  name of the output file (output will
      be a pickle file with one label,fingerprint entry for each
      molecule).

    - --outTable=val: name of the output db table used to store
      fingerprints.  If this table already exists, it will be
      replaced (unless --keepTable is given).

    - --keepTable: do not replace the output db table if it already
      exists; the new fingerprints are added to the existing table.

    - --outDbName: name of output database, if it's being used.
      Defaults to be the same as the input db.

    - --fpColName=val: name to use for the column which stores
      fingerprints (in pickled format) in the output db table.
      Default is *AutoFragmentFP*

    - --maxSize=val:  base size of the fingerprints to be generated
      Default is *2048*

    - --minSize=val: minimum size of the fingerprints to be generated
      (limits the amount of folding that happens).  Default is *64*

    - --density=val: target bit density in the fingerprint.  The
      fingerprint will be folded until this density is
      reached.  Default is *0.3*

    - --minPath=val:  minimum path length to be included in
      fragment-based fingerprints.  Default is *1*.

    - --maxPath=val:  maximum path length to be included in
      fragment-based fingerprints.  Default is *7*.

    - --nBitsPerHash: number of bits to be set in the output
      fingerprint for each fragment.  Default is *2*.

    - --discrim: use of path-based discriminators to hash bits.
      Default is *false*.

    - -V: include valence information in the fingerprints
      Default is *false*.

    - -H: include Hs in the fingerprint
      Default is *false*.

    - --maxMols=val: sets the maximum number of molecules to be
      fingerprinted.

    - --useMACCS: use the public MACCS keys to do the fingerprinting
      (instead of a daylight-type fingerprint)

"""
441
# NOTE(review): the ``def`` line is missing from the listing this block was
# recovered from.  The signature is reconstructed from the call
# ``ParseArgs()`` and the ``if details is None`` test in the body
# -- confirm against the real source.
def ParseArgs(details=None):
    """ parses the command line arguments and returns a
    _FingerprinterDetails_ instance with the results.

    **Arguments**:

      - details: (optional) an existing details object to be updated; a
        new _FingerprinterDetails_ is created when this is None

    **Returns**: the details object

    **Note**:

      - If you make modifications here, please update the global
        _usageDoc string so the Usage message is up to date.

      - This routine is used by both the fingerprinter, the clusterer and the
        screener; not all arguments make sense for all applications.

      - Both --nBitsPerHash (the documented spelling) and --bitsPerHash are
        accepted.  Previously only 'bitsPerHash=' was registered with getopt
        while only '--nBitsPerHash' was handled, so neither spelling worked.

      - -s and --doScreen are accepted by getopt but have no handler here.

      - Unparseable options, or -h, print the usage message via Usage().

    """
    import sys
    import getopt

    longOpts = [
        # fingerprinter
        'minSize=', 'maxSize=',
        'density=',
        'minPath=', 'maxPath=',
        'bitsPerHash=', 'nBitsPerHash=',
        'smilesName=',
        'molPkl=',
        'useSD',
        'idName=',
        'discrim',
        'outTable=',
        'outDbName=',
        'fpColName=',
        'maxMols=',
        'useMACCS',
        'keepTable',
        # screener
        'smilesTable=',
        'doScreen=',
        'topN=',
        'thresh=',
        'smiles=',
        'dice',
        'cosine',
        # clusterer
        'actTable=',
        'actName=',
        'SLINK',
        'CLINK',
        'UPGMA',
    ]
    try:
        args, extras = getopt.getopt(sys.argv[1:], 'HVs:d:t:o:h', longOpts)
    except getopt.GetoptError:
        import traceback
        traceback.print_exc()
        Usage()

    if details is None:
        details = FingerprinterDetails()
    if len(extras):
        # the first positional argument is the input file name
        details.inFileName = extras[0]

    for arg, val in args:
        if arg == '-H':
            details.useHs = 1
        elif arg == '-V':
            details.useValence = 1
        elif arg == '-d':
            details.dbName = val
        elif arg == '-t':
            details.tableName = val
        elif arg == '-o':
            details.outFileName = val
        elif arg == '--minSize':
            details.minSize = int(val)
        elif arg == '--maxSize':
            details.fpSize = int(val)
        elif arg == '--density':
            details.tgtDensity = float(val)
        elif arg == '--outTable':
            details.outTableName = val
        elif arg == '--outDbName':
            details.outDbName = val
        elif arg == '--fpColName':
            details.fpColName = val
        elif arg == '--minPath':
            details.minPath = int(val)
        elif arg == '--maxPath':
            details.maxPath = int(val)
        elif arg in ('--nBitsPerHash', '--bitsPerHash'):
            details.bitsPerHash = int(val)
        elif arg == '--discrim':
            details.discrimHash = 1
        elif arg == '--smilesName':
            details.smilesName = val
        elif arg == '--molPkl':
            details.molPklName = val
        elif arg == '--useSD':
            details.useSmiles = False
            details.useSD = True
        elif arg == '--idName':
            details.idName = val
        elif arg == '--maxMols':
            details.maxMols = int(val)
        elif arg == '--useMACCS':
            details.fingerprinter = MACCSkeys.GenMACCSKeys
        elif arg == '--keepTable':
            details.replaceTable = False

        # screener options
        elif arg == '--smilesTable':
            details.smilesTableName = val
        elif arg == '--topN':
            details.doThreshold = 0
            details.topN = int(val)
        elif arg == '--thresh':
            details.doThreshold = 1
            details.screenThresh = float(val)
        elif arg == '--smiles':
            details.probeSmiles = val
        elif arg == '--dice':
            details.metric = DataStructs.DiceSimilarity
        elif arg == '--cosine':
            details.metric = DataStructs.CosineSimilarity

        # clusterer options
        elif arg == '--SLINK':
            details.clusterAlgo = Murtagh.SLINK
        elif arg == '--CLINK':
            details.clusterAlgo = Murtagh.CLINK
        elif arg == '--UPGMA':
            details.clusterAlgo = Murtagh.UPGMA
        elif arg == '--actTable':
            details.actTableName = val
        elif arg == '--actName':
            details.actName = val
        elif arg == '-h':
            Usage()
    return details
581
if __name__ == '__main__':
    # command line entry point: announce the version, parse the command
    # line and run the fingerprinter with the resulting details
    message("This is FingerprintMols version %s\n\n" % (__VERSION_STRING))
    details = ParseArgs()
    FingerprintsFromDetails(details)
586