Package rdkit :: Package Chem :: Package Fingerprints :: Module FingerprintMols
[hide private]
[frames] | no frames]

Source Code for Module rdkit.Chem.Fingerprints.FingerprintMols

  1  # $Id$ 
  2  # 
  3  #  Copyright (c) 2003-2006 Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved @@ 
  6  #  This file is part of the RDKit. 
  7  #  The contents are covered by the terms of the BSD license 
  8  #  which is included in the file license.txt, found at the root 
  9  #  of the RDKit source tree. 
 10  # 
 11  """ utility functionality for fingerprinting sets of molecules 
 12   includes a command line app for working with fingerprints 
 13   and databases 
 14   
 15   
 16  Sample Usage: 
 17   
 18    python FingerprintMols.py  -d data.gdb \ 
 19          -t 'raw_dop_data' --smilesName="Structure" --idName="Mol_ID"  \ 
 20          --outTable="daylight_sig" 
 21   
 22   
 23  """ 
 24  from __future__ import print_function 
 25  from rdkit import Chem 
 26  from rdkit.Chem import MACCSkeys 
 27  from rdkit.ML.Cluster import Murtagh 
 28  from rdkit import DataStructs 
 29  import sys 
 30  from rdkit.six.moves import cPickle 
 31   
 32  _cvsVersion="$Id$" 
 33  idx1 = _cvsVersion.find(':')+1 
 34  idx2 = _cvsVersion.rfind('$') 
 35  __VERSION_STRING="%s"%(_cvsVersion[idx1:idx2]) 
 36   
 37   
def error(msg):
    """Write the error text *msg* to standard error exactly as given."""
    stream = sys.stderr
    stream.write(msg)
def message(msg):
    """Write the progress text *msg* to standard error exactly as given."""
    stream = sys.stderr
    stream.write(msg)
42
def GetRDKFingerprint(mol):
    """Fingerprint *mol* using the default parameters.

    The defaults are the attributes of a freshly constructed
    ``FingerprinterDetails``; they are forwarded to ``FingerprintMol`` as
    keyword arguments.

    Returns whatever ``FingerprintMol`` returns for *mol*.
    """
    details = FingerprinterDetails()
    # ``apply`` no longer exists in Python 3; keyword unpacking is the
    # equivalent spelling that works on both Python 2 and 3.
    return FingerprintMol(mol, **details.__dict__)
47
def FoldFingerprintToTargetDensity(fp, **fpArgs):
    """Fold *fp* in half repeatedly until it is dense enough.

    Folding stops as soon as the fraction of on bits is no longer below
    ``fpArgs['tgtDensity']``, or when half of the current size would not
    exceed ``fpArgs['minSize']``.

    Returns the (possibly folded) fingerprint; the input object itself is
    returned when no folding was needed.
    """
    while True:
        onBits = fp.GetNumOnBits()
        totalBits = fp.GetNumBits()
        # dense enough already: nothing more to do
        if not float(onBits) / totalBits < fpArgs['tgtDensity']:
            break
        # another fold would make the fingerprint too small
        if not totalBits / 2 > fpArgs['minSize']:
            break
        fp = DataStructs.FoldFingerprint(fp, 2)
    return fp
59
def FingerprintMol(mol, fingerprinter=Chem.RDKFingerprint, **fpArgs):
    """Generate a fingerprint for a single molecule.

    When no keyword arguments are supplied, the attributes of a default
    ``FingerprinterDetails`` instance are used as the parameters.

    For the RDKit fingerprinter the path, size, hashing, hydrogen and
    density parameters are passed positionally.  Any other fingerprinter is
    called with all of *fpArgs* as keywords and its result is then folded
    with ``FoldFingerprintToTargetDensity``.
    """
    if not fpArgs:
        fpArgs = FingerprinterDetails().__dict__

    if fingerprinter != Chem.RDKFingerprint:
        rawFp = fingerprinter(mol, **fpArgs)
        return FoldFingerprintToTargetDensity(rawFp, **fpArgs)

    return fingerprinter(mol,
                         fpArgs['minPath'], fpArgs['maxPath'],
                         fpArgs['fpSize'], fpArgs['bitsPerHash'],
                         fpArgs['useHs'], fpArgs['tgtDensity'],
                         fpArgs['minSize'])
76 77
def FingerprintsFromSmiles(dataSource, idCol, smiCol,
                           fingerprinter=Chem.RDKFingerprint,
                           reportFreq=10, maxMols=-1,
                           **fpArgs):
    """Fingerprint every entry of *dataSource* that holds a parsable SMILES.

    *idCol* and *smiCol* index each entry to obtain the id and the SMILES;
    both values are converted with ``str``.  *fpArgs* are passed as keyword
    arguments to ``FingerprintMol`` (and from there to the fingerprinter).

    A progress message is written every *reportFreq* molecules (when
    positive) and processing stops once *maxMols* molecules have been
    fingerprinted (when positive).  Entries whose SMILES cannot be turned
    into a molecule are reported via ``error`` and skipped.

    Returns a list of 2-tuples: (id, fp)
    """
    res = []
    nDone = 0
    for entry in dataSource:
        molId, smi = str(entry[idCol]), str(entry[smiCol])
        try:
            mol = Chem.MolFromSmiles(smi)
        except Exception:
            # A bare ``except`` would also trap KeyboardInterrupt/SystemExit;
            # only genuine parsing failures should be treated as "no molecule".
            mol = None
        if mol:
            fp = FingerprintMol(mol, fingerprinter, **fpArgs)
            res.append((molId, fp))
            nDone += 1
            if reportFreq > 0 and not nDone % reportFreq:
                message('Done %d molecules\n' % (nDone))
            if maxMols > 0 and nDone >= maxMols:
                break
        else:
            error('Problems parsing SMILES: %s\n' % smi)
    return res
106
def FingerprintsFromMols(mols,
                         fingerprinter=Chem.RDKFingerprint,
                         reportFreq=10, maxMols=-1,
                         **fpArgs):
    """Fingerprint a sequence of (id, mol) pairs.

    *fpArgs* are passed as keyword arguments to ``FingerprintMol`` (and from
    there to the fingerprinter).

    A progress message is written every *reportFreq* molecules (when
    positive) and processing stops once *maxMols* molecules have been
    fingerprinted (when positive).  Pairs whose molecule is false-valued are
    reported via ``error`` and skipped.

    Returns a list of 2-tuples: (id, fp)
    """
    res = []
    nDone = 0
    for molId, mol in mols:
        if mol:
            fp = FingerprintMol(mol, fingerprinter, **fpArgs)
            res.append((molId, fp))
            nDone += 1
            if reportFreq > 0 and not nDone % reportFreq:
                message('Done %d molecules\n' % (nDone))
            if maxMols > 0 and nDone >= maxMols:
                break
        else:
            # The previous message interpolated an undefined name (``smi``),
            # so reaching this branch raised NameError; report the id instead.
            error('Problems processing molecule with id: %s\n' % molId)
    return res
130
def FingerprintsFromPickles(dataSource, idCol, pklCol,
                            fingerprinter=Chem.RDKFingerprint,
                            reportFreq=10, maxMols=-1,
                            **fpArgs):
    """Fingerprint every entry of *dataSource* that holds a molecule pickle.

    *idCol* and *pklCol* index each entry to obtain the id and the pickled
    molecule; both values are converted with ``str`` before use.  *fpArgs*
    are passed as keyword arguments to ``FingerprintMol`` (and from there to
    the fingerprinter).

    A progress message is written every *reportFreq* molecules (when
    positive) and processing stops once *maxMols* molecules have been
    fingerprinted (when positive).  Entries whose pickle cannot be turned
    into a molecule are reported via ``error`` and skipped.

    Returns a list of 2-tuples: (id, fp)
    """
    res = []
    nDone = 0
    for entry in dataSource:
        # NOTE(review): under Python 3, str() of a bytes value gives its
        # repr rather than the raw data -- confirm what the data source
        # supplies for the pickle column.
        molId, pkl = str(entry[idCol]), str(entry[pklCol])
        try:
            mol = Chem.Mol(pkl)
        except Exception:
            # A bare ``except`` would also trap KeyboardInterrupt/SystemExit;
            # only genuine construction failures mean "no molecule".
            mol = None
        if mol:
            fp = FingerprintMol(mol, fingerprinter, **fpArgs)
            res.append((molId, fp))
            nDone += 1
            if reportFreq > 0 and not nDone % reportFreq:
                message('Done %d molecules\n' % (nDone))
            if maxMols > 0 and nDone >= maxMols:
                break
        else:
            error('Problems parsing pickle for id: %s\n' % molId)
    return res
159
def FingerprintsFromDetails(details, reportFreq=10):
    """Read molecules as described by *details*, fingerprint them and
    optionally store the results.

    Input is taken from, in order of preference:

      - the database table ``details.dbName``/``details.tableName``,
      - the text file ``details.inFileName`` (when ``details.useSmiles``),
      - the SD file ``details.inFileName`` (when ``details.useSD``).

    ``details.idName`` is filled in when empty (first db column, or 'ID'
    for files).  When fingerprints were generated they are pickled one by
    one to ``details.outFileName`` (if set) and inserted into the table
    ``details.outTableName`` of ``details.outDbName or details.dbName``
    (if both are set).

    *reportFreq* controls how often a progress message is written while an
    SD file is being read.

    Returns the list of (id, fp) tuples, or None when nothing was read.
    """
    data = None
    # Bound up front so that a failed read leaves "no data" rather than an
    # unbound local further down.
    dataSet = None
    if details.dbName and details.tableName:
        from rdkit.Dbase.DbConnection import DbConnect
        from rdkit.Dbase import DbInfo
        from rdkit.ML.Data import DataUtils
        try:
            # The connection object is not used below; the call only
            # surfaces connection problems early.
            DbConnect(details.dbName, details.tableName)
        except Exception:
            import traceback
            error('Problems establishing connection to database: %s|%s\n' % (details.dbName,
                                                                             details.tableName))
            traceback.print_exc()
        if not details.idName:
            details.idName = DbInfo.GetColumnNames(details.dbName, details.tableName)[0]
        dataSet = DataUtils.DBToData(details.dbName, details.tableName,
                                     what='%s,%s' % (details.idName, details.smilesName))
        idCol = 0
        smiCol = 1
    elif details.inFileName and details.useSmiles:
        from rdkit.ML.Data import DataUtils
        if not details.idName:
            details.idName = 'ID'
        try:
            dataSet = DataUtils.TextFileToData(details.inFileName,
                                               onlyCols=[details.idName, details.smilesName])
        except IOError:
            import traceback
            error('Problems reading from file %s\n' % (details.inFileName))
            traceback.print_exc()

        idCol = 0
        smiCol = 1
    elif details.inFileName and details.useSD:
        if not details.idName:
            details.idName = 'ID'
        dataSet = []
        try:
            supplier = Chem.SDMolSupplier(details.inFileName)
        except Exception:
            import traceback
            error('Problems reading from file %s\n' % (details.inFileName))
            traceback.print_exc()
        else:
            # Plain iteration replaces the Python-2-only ``.next()`` calls.
            for m in supplier:
                if m:
                    dataSet.append(m)
                    if reportFreq > 0 and not len(dataSet) % reportFreq:
                        message('Read %d molecules\n' % (len(dataSet)))
                    if details.maxMols > 0 and len(dataSet) >= details.maxMols:
                        break

            # Pair every molecule with its label: the id property when the
            # molecule has one, its '_Name' property otherwise.
            for i, mol in enumerate(dataSet):
                if mol.HasProp(details.idName):
                    nm = mol.GetProp(details.idName)
                else:
                    nm = mol.GetProp('_Name')
                dataSet[i] = (nm, mol)

    fps = None
    if dataSet and not details.useSD:
        data = dataSet.GetNamedData()
        # ``apply`` no longer exists in Python 3; unpack the details instead.
        if not details.molPklName:
            fps = FingerprintsFromSmiles(data, idCol, smiCol, **details.__dict__)
        else:
            fps = FingerprintsFromPickles(data, idCol, smiCol, **details.__dict__)
    elif dataSet and details.useSD:
        fps = FingerprintsFromMols(dataSet, **details.__dict__)

    if fps:
        if details.outFileName:
            # ``with`` closes the file even if pickling an entry fails.
            with open(details.outFileName, 'wb+') as outF:
                for entry in fps:
                    cPickle.dump(entry, outF)
        dbName = details.outDbName or details.dbName
        if details.outTableName and dbName:
            from rdkit.Dbase.DbConnection import DbConnect
            from rdkit.Dbase import DbUtils, DbModule
            conn = DbConnect(dbName)
            #
            #  We don't have a db open already, so we'll need to figure out
            #    the types of our columns...
            #
            # NOTE(review): ``data`` is only filled in on the non-SD path, so
            # SD input combined with an output table reaches ``len(None)``
            # here -- confirm whether that combination should be supported.
            colTypes = DbUtils.TypeFinder(data, len(data), len(data[0]))
            typeStrs = DbUtils.GetTypeStrings([details.idName, details.smilesName], colTypes,
                                              keyCol=details.idName)
            cols = '%s, %s %s' % (typeStrs[0], details.fpColName, DbModule.binaryTypeName)

            # FIX: we should really check to see if the table
            #  is already there and, if so, add the appropriate
            #  column.

            #
            # create the new table
            #
            existing = [x.upper() for x in conn.GetTableNames()]
            if details.replaceTable or details.outTableName.upper() not in existing:
                conn.AddTable(details.outTableName, cols)

            #
            # And add the data
            #
            for molId, fp in fps:
                tpl = molId, DbModule.binaryHolder(fp.ToBinary())
                conn.InsertData(details.outTableName, tpl)
            conn.Commit()
    return fps
279 # ------------------------------------------------ 280 # 281 # Command line parsing stuff 282 # 283 # ------------------------------------------------ 284
class FingerprinterDetails(object):
    """Holder for the settings of a fingerprinting, screening or clustering
    run; every setting receives a default value on construction.
    """

    def __init__(self):
        for setup in (self._fingerprinterInit,
                      self._screenerInit,
                      self._clusterInit):
            setup()

    def _fingerprinterInit(self):
        """Install the defaults for fingerprint generation and for the
        input/output locations."""
        self.fingerprinter = Chem.RDKFingerprint
        self.fpColName = "AutoFragmentFP"
        self.idName = ''
        self.dbName = ''
        self.outDbName = ''
        self.tableName = ''
        self.minSize = 64
        self.fpSize = 2048
        self.tgtDensity = 0.3
        self.minPath = 1
        self.maxPath = 7
        self.discrimHash = 0
        self.useHs = 0
        self.useValence = 0
        self.bitsPerHash = 2
        self.smilesName = 'SMILES'
        self.maxMols = -1
        self.outFileName = ''
        self.outTableName = ''
        self.inFileName = ''
        self.replaceTable = True
        self.molPklName = ''
        self.useSmiles = True
        self.useSD = False

    def _screenerInit(self):
        """Install the defaults for similarity screening."""
        self.metric = DataStructs.TanimotoSimilarity
        self.doScreen = ''
        self.topN = 10
        self.screenThresh = 0.75
        self.doThreshold = 0
        self.smilesTableName = ''
        self.probeSmiles = ''
        self.probeMol = None
        self.noPickle = 0

    def _clusterInit(self):
        """Install the defaults for clustering."""
        self.clusterAlgo = Murtagh.WARDS
        self.actTableName = ''
        self.actName = ''

    def GetMetricName(self):
        """Return a label for the current similarity metric.

        The Tanimoto, Dice and Cosine functions map to their names.  Any
        other true-valued metric is returned as is, and a false-valued one
        yields 'Unknown'.
        """
        known = (
            ('Tanimoto', DataStructs.TanimotoSimilarity),
            ('Dice', DataStructs.DiceSimilarity),
            ('Cosine', DataStructs.CosineSimilarity),
        )
        for label, func in known:
            if self.metric == func:
                return label
        if self.metric:
            return self.metric
        return 'Unknown'

    def SetMetricFromName(self, name):
        """Select the similarity metric by (case-insensitive) name.

        Recognized names are Tanimoto, Dice and Cosine; any other name
        leaves the current metric untouched.
        """
        attrByName = {
            "TANIMOTO": 'TanimotoSimilarity',
            "DICE": 'DiceSimilarity',
            "COSINE": 'CosineSimilarity',
        }
        attr = attrByName.get(name.upper())
        if attr is not None:
            self.metric = getattr(DataStructs, attr)
356
def Usage():
    """Print the module's usage text to standard output, then terminate
    the interpreter with exit status -1.
    """
    exitCode = -1
    print(_usageDoc)
    sys.exit(exitCode)
# Text printed by Usage().  Keep it in step with the options handled in
# ParseArgs().
_usageDoc = """
Usage: FingerprintMols.py [args] <fName>

  If <fName> is provided and no tableName is specified (see below),
  data will be read from the text file <fName>.  Text files delimited
  with either commas (extension .csv) or tabs (extension .txt) are
  supported.

  Command line arguments are:
    - -d _dbName_: set the name of the database from which
      to pull input molecule information.  If output is
      going to a database, this will also be used for that
      unless the --outDbName option is used.

    - -t _tableName_: set the name of the database table
      from which to pull input molecule information

    - --smilesName=val: sets the name of the SMILES column
      in the input database.  Default is *SMILES*.

    - --molPkl=val: treat the molecule column as pickled molecules
      instead of SMILES.

    - --useSD:  Assume that the input file is an SD file, not a SMILES
      table.

    - --idName=val: sets the name of the id column in the input
      database.  Defaults to be the name of the first db column
      (or *ID* for text files).

    - -o _outFileName_:  name of the output file (output will
      be a pickle file with one label,fingerprint entry for each
      molecule).

    - --outTable=val: name of the output db table used to store
      fingerprints.  If this table already exists, it will be
      replaced (unless --keepTable is given).

    - --keepTable: if the output table already exists, add to it
      instead of replacing it.

    - --outDbName: name of output database, if it's being used.
      Defaults to be the same as the input db.

    - --fpColName=val: name to use for the column which stores
      fingerprints (in pickled format) in the output db table.
      Default is *AutoFragmentFP*

    - --maxSize=val:  base size of the fingerprints to be generated
      Default is *2048*

    - --minSize=val: minimum size of the fingerprints to be generated
      (limits the amount of folding that happens).  Default is *64*

    - --density=val: target bit density in the fingerprint.  The
      fingerprint will be folded until this density is
      reached.  Default is *0.3*

    - --minPath=val:  minimum path length to be included in
      fragment-based fingerprints.  Default is *1*.

    - --maxPath=val:  maximum path length to be included in
      fragment-based fingerprints.  Default is *7*.

    - --nBitsPerHash: number of bits to be set in the output
      fingerprint for each fragment.  Default is *2*.

    - --discrim: use of path-based discriminators to hash bits.
      Default is *false*.

    - -V: include valence information in the fingerprints
      Default is *false*.

    - -H: include Hs in the fingerprint
      Default is *false*.

    - --maxMols=val: sets the maximum number of molecules to be
      fingerprinted.

    - --useMACCS: use the public MACCS keys to do the fingerprinting
      (instead of a daylight-type fingerprint)

    - -h: print this message and exit.

"""
def ParseArgs(details=None):
    """ parses the command line arguments and returns a
     _FingerprinterDetails_ instance with the results.

     If *details* is provided it is updated in place and returned;
     otherwise a new _FingerprinterDetails_ is created.  The first
     non-option argument, if any, becomes ``details.inFileName``.

     Unrecognized options (or ``-h``) print the usage message and exit.

     **Note**:

       - If you make modifications here, please update the global
         _usageDoc string so the Usage message is up to date.

       - This routine is used by both the fingerprinter, the clusterer and the
         screener; not all arguments make sense for all applications.

    """
    import getopt
    import sys

    longOpts = [
        'minSize=', 'maxSize=',
        'density=',
        'minPath=', 'maxPath=',
        # Both spellings are accepted: the usage text documents
        # --nBitsPerHash, while getopt was previously only told about
        # --bitsPerHash (which the handler then never matched).
        'bitsPerHash=', 'nBitsPerHash=',
        'smilesName=',
        'molPkl=',
        'useSD',
        'idName=',
        'discrim',
        'outTable=',
        'outDbName=',
        'fpColName=',
        'maxMols=',
        'useMACCS',
        'keepTable',
        # SCREENING:
        'smilesTable=',
        'doScreen=',
        'topN=',
        'thresh=',
        'smiles=',
        'dice',
        'cosine',
        # CLUSTERING:
        'actTable=',
        'actName=',
        'SLINK',
        'CLINK',
        'UPGMA',
    ]
    try:
        args, extras = getopt.getopt(sys.argv[1:], 'HVs:d:t:o:h', longOpts)
    except getopt.GetoptError:
        import traceback
        traceback.print_exc()
        Usage()

    if details is None:
        details = FingerprinterDetails()
    if len(extras):
        details.inFileName = extras[0]

    # option -> attribute that receives the value unchanged
    textOpts = {
        '-d': 'dbName',
        '-t': 'tableName',
        '-o': 'outFileName',
        '--outTable': 'outTableName',
        '--outDbName': 'outDbName',
        '--fpColName': 'fpColName',
        '--smilesName': 'smilesName',
        '--molPkl': 'molPklName',
        '--idName': 'idName',
        '--smilesTable': 'smilesTableName',
        '--smiles': 'probeSmiles',
        '--actTable': 'actTableName',
        '--actName': 'actName',
    }
    # option -> attribute that receives the value converted to int
    intOpts = {
        '--minSize': 'minSize',
        '--maxSize': 'fpSize',
        '--minPath': 'minPath',
        '--maxPath': 'maxPath',
        '--bitsPerHash': 'bitsPerHash',
        '--nBitsPerHash': 'bitsPerHash',
        '--maxMols': 'maxMols',
    }

    for arg, val in args:
        if arg in textOpts:
            setattr(details, textOpts[arg], val)
        elif arg in intOpts:
            setattr(details, intOpts[arg], int(val))
        elif arg == '-H':
            details.useHs = 1
        elif arg == '-V':
            details.useValence = 1
        elif arg == '--density':
            details.tgtDensity = float(val)
        elif arg == '--discrim':
            details.discrimHash = 1
        elif arg == '--useSD':
            details.useSmiles = False
            details.useSD = True
        elif arg == '--useMACCS':
            details.fingerprinter = MACCSkeys.GenMACCSKeys
        elif arg == '--keepTable':
            details.replaceTable = False

        # SCREENER:
        elif arg == '--topN':
            details.doThreshold = 0
            details.topN = int(val)
        elif arg == '--thresh':
            details.doThreshold = 1
            details.screenThresh = float(val)
        elif arg == '--dice':
            details.metric = DataStructs.DiceSimilarity
        elif arg == '--cosine':
            details.metric = DataStructs.CosineSimilarity

        # CLUSTERS:
        elif arg == '--SLINK':
            details.clusterAlgo = Murtagh.SLINK
        elif arg == '--CLINK':
            details.clusterAlgo = Murtagh.CLINK
        elif arg == '--UPGMA':
            details.clusterAlgo = Murtagh.UPGMA

        elif arg == '-h':
            Usage()
    return details
if __name__ == '__main__':
    # Script entry point: announce the version on stderr, then fingerprint
    # according to the command-line options.
    banner = "This is FingerprintMols version %s\n\n" % (__VERSION_STRING)
    message(banner)
    details = ParseArgs()
    FingerprintsFromDetails(details)