Package rdkit :: Package ML :: Package Descriptors :: Module Parser
[hide private]
[frames] | no frames]

Source Code for Module rdkit.ML.Descriptors.Parser

  1  # 
  2  #  Copyright (C) 2001-2004  greg Landrum and Rational Discovery LLC 
  3  #  All Rights Reserved 
  4  # 
  5   
  6  """ The "parser" for compound descriptors. 
  7   
  8  I almost hesitate to document this, because it's not the prettiest 
  9  thing the world has ever seen... but it does work (for at least some 
 10  definitions of the word). 
 11   
 12  Rather than getting into the whole mess of writing a parser for the 
 13  compound descriptor expressions, I'm just using string substitutions 
 14  and python's wonderful ability to *eval* code. 
 15   
 16  It would probably be a good idea at some point to replace this with a 
 17  real parser, if only for the flexibility and intelligent error 
 18  messages that would become possible. 
 19   
 20  The general idea is that we're going to deal with expressions where 
 21  atomic descriptors have some kind of method applied to them which 
 22  reduces them to a single number for the entire composition.  Compound 
 23  descriptors (those applicable to the compound as a whole) are not 
 24  operated on by anything in particular (except for standard math stuff). 
 25   
 26  Here's the general flow of things: 
 27   
 28    1) Composition descriptor references ($a, $b, etc.) are replaced with the 
 29       corresponding descriptor names using string substitution. 
 30       (*_SubForCompoundDescriptors*) 
 31   
 32    2) Atomic descriptor references ($1, $2, etc) are replaced with lookups 
 33       into the atomic dict with "DEADBEEF" in place of the atom name. 
 34       (*_SubForAtomicVars*) 
 35   
 36    3) Calls to Calculator Functions are augmented with a reference to 
 37       the composition and atomic dictionary 
 38       (*_SubMethodArgs*) 
 39   
 40  **NOTE:** 
 41   
 42    anytime we don't know the answer for a descriptor, rather than 
 43    throwing a (completely incomprehensible) exception, we just return 
 44    -666.  So bad descriptor values should stand out like sore thumbs. 
 45   
 46  """ 
 47  from __future__ import print_function 
 48  __DEBUG=0 
 49  from rdkit import RDConfig 
 50   
 51  # we do this to allow the use of stuff in the math module 
 52  from math import * 
 53   
 54  #---------------------- 
 55  # atomic descriptor section 
 56  #---------------------- 
 57  # these are the methods which can be applied to ATOMIC descriptors. 
 58  knownMethods = ['SUM','MIN','MAX','MEAN','AVG','DEV','HAS'] 
 59   
def HAS(strArg, composList, atomDict):
    """ *Calculator Method*

    does a string search: checks whether a string-valued atomic
    descriptor contains a given substring for any atom of the composition

    **Arguments**

      - strArg: the arguments in string form.  This is split on commas;
        the first piece is the atomic lookup expression (with "DEADBEEF"
        standing in for the atom name) and the second piece is an
        expression giving the string to search for.  Any further pieces
        are ignored.

      - composList: the composition vector, a sequence of
        (atom, amount) pairs

      - atomDict: the atomic dictionary

    **Returns**

      1 if the substring is found for at least one atom, 0 if it is
      not (including for an empty composition), -666 if fewer than two
      comma-separated arguments were provided

    **Notes**

      - both pieces of strArg are handed to *eval*, so they must come
        from a trusted source.

      - *eval* runs with this function's local names in scope, which is
        how the generated lookups find *atomDict*.

    """
    # str.split replaces the old string.split(): the string module is never
    # imported here and string.split no longer exists in python 3.
    splitArgs = strArg.split(',')
    if len(splitArgs) < 2:
        return -666

    for atom, num in composList:
        tStr = splitArgs[0].replace('DEADBEEF', atom)
        where = eval(tStr)
        what = eval(splitArgs[1])
        if where.find(what) != -1:
            return 1
    return 0
89
def SUM(strArg, composList, atomDict):
    """ *Calculator Method*

    adds up a descriptor over every atom of a composition, weighting
    each atom's value by its amount

    **Arguments**

      - strArg: the expression to evaluate, in string form, with
        "DEADBEEF" standing in for the atom name

      - composList: the composition vector, a sequence of
        (atom, amount) pairs

      - atomDict: the atomic dictionary

    **Returns**

      a float (0.0 for an empty composition)

    """
    # NOTE: eval() sees this function's locals; that is how the generated
    # lookups reach atomDict, so the eval call has to stay in this scope.
    total = 0.0
    for atom, num in composList:
        contribution = eval(strArg.replace('DEADBEEF', atom)) * num
        total = total + contribution
    return total
113
def MEAN(strArg, composList, atomDict):
    """ *Calculator Method*

    computes the amount-weighted average of a descriptor over the atoms
    of a composition

    **Arguments**

      - strArg: the expression to evaluate, in string form, with
        "DEADBEEF" standing in for the atom name

      - composList: the composition vector, a sequence of
        (atom, amount) pairs

      - atomDict: the atomic dictionary

    **Returns**

      a float

    **Notes**

      - an empty composition leads to a division by zero
        (ZeroDivisionError).

    """
    # NOTE: eval() sees this function's locals; that is how the generated
    # lookups reach atomDict, so the eval call has to stay in this scope.
    weightedTotal = 0.0
    totalAmount = 0
    for atom, num in composList:
        value = eval(strArg.replace('DEADBEEF', atom))
        weightedTotal = weightedTotal + value * num
        totalAmount = totalAmount + num
    return weightedTotal / totalAmount


# AVG is simply another name for MEAN
AVG = MEAN
def DEV(strArg, composList, atomDict):
    """ *Calculator Method*

    computes the average absolute deviation of a descriptor from its
    (amount-weighted) mean across a composition

    **Arguments**

      - strArg: the expression to evaluate, in string form, with
        "DEADBEEF" standing in for the atom name

      - composList: the composition vector, a sequence of
        (atom, amount) pairs

      - atomDict: the atomic dictionary

    **Returns**

      a float

    """
    # the reference point is the weighted mean of the same expression
    center = MEAN(strArg, composList, atomDict)

    # NOTE: eval() sees this function's locals; that is how the generated
    # lookups reach atomDict, so the eval call has to stay in this scope.
    deviationTotal = 0.0
    totalAmount = 0.0
    for atom, num in composList:
        value = eval(strArg.replace('DEADBEEF', atom))
        deviationTotal = deviationTotal + abs(value - center) * num
        totalAmount = totalAmount + num
    return deviationTotal / totalAmount
167
def MIN(strArg, composList, atomDict):
    """ *Calculator Method*

    finds the smallest value a descriptor takes across the atoms of a
    composition (the amounts are ignored)

    **Arguments**

      - strArg: the expression to evaluate, in string form, with
        "DEADBEEF" standing in for the atom name

      - composList: the composition vector, a sequence of
        (atom, amount) pairs

      - atomDict: the atomic dictionary

    **Returns**

      the smallest of the evaluated values

    """
    # NOTE: eval() sees this function's locals; that is how the generated
    # lookups reach atomDict, so the eval call has to stay in this scope.
    values = []
    for atom, num in composList:
        perAtomExpr = strArg.replace('DEADBEEF', atom)
        values.append(eval(perAtomExpr))
    smallest = min(values)
    return smallest
191
def MAX(strArg, composList, atomDict):
    """ *Calculator Method*

    finds the largest value a descriptor takes across the atoms of a
    composition (the amounts are ignored)

    **Arguments**

      - strArg: the expression to evaluate, in string form, with
        "DEADBEEF" standing in for the atom name

      - composList: the composition vector, a sequence of
        (atom, amount) pairs

      - atomDict: the atomic dictionary

    **Returns**

      the largest of the evaluated values

    """
    # NOTE: eval() sees this function's locals; that is how the generated
    # lookups reach atomDict, so the eval call has to stay in this scope.
    values = []
    for atom, num in composList:
        perAtomExpr = strArg.replace('DEADBEEF', atom)
        values.append(eval(perAtomExpr))
    largest = max(values)
    return largest
215 216 217 #------------------ 218 # string replacement routines 219 # these are not intended to be called by clients 220 #------------------ 221
222 -def _SubForAtomicVars(cExpr,varList,dictName):
223 """ replace atomic variables with the appropriate dictionary lookup 224 225 *Not intended for client use* 226 227 """ 228 for i in range(len(varList)): 229 cExpr = cExpr.replace('$%d'%(i+1), 230 '%s["DEADBEEF"]["%s"]'%(dictName,varList[i])) 231 return cExpr
232
233 -def _SubForCompoundDescriptors(cExpr,varList,dictName):
234 """ replace compound variables with the appropriate list index 235 236 *Not intended for client use* 237 238 """ 239 for i in range(len(varList)): 240 cExpr = cExpr.replace('$%s'%chr(ord('a')+i), 241 '%s["%s"]'%(dictName,varList[i])) 242 return cExpr
243
244 -def _SubMethodArgs(cExpr,knownMethods):
245 """ alters the arguments of calls to calculator methods 246 247 *Not intended for client use* 248 249 This is kind of putrid (and the code ain't so pretty either) 250 The general idea is that the various special methods for atomic 251 descriptors need two extra arguments (the composition and the atomic 252 dict). Rather than make the user type those in, we just find 253 invocations of these methods and fill out the function calls using 254 string replacements. 255 """ 256 res = cExpr 257 for method in knownMethods: 258 p = 0 259 while p != -1 and p < len(res): 260 p = res.find(method,p) 261 if p != -1: 262 p = p + len(method) + 1 263 start = p 264 parenCount = 1 265 while parenCount and p < len(res): 266 if res[p] == ')': 267 parenCount = parenCount - 1 268 elif res[p] == '(': 269 parenCount = parenCount + 1 270 p = p + 1 271 if p <= len(res): 272 res = res[0:start]+"'%s',compos,atomDict"%(res[start:p-1])+res[p-1:] 273 return res
274
def CalcSingleCompoundDescriptor(compos, argVect, atomDict, propDict):
    """ calculates the value of the descriptor for a single compound

    **ARGUMENTS:**

      - compos: a vector/tuple containing the composition
         information... in the form:
         '[("Fe",1.),("Pt",2.),("Rh",0.02)]'

      - argVect: a vector/tuple with three elements:

           1) AtomicDescriptorNames:  a list/tuple of the names of the
             atomic descriptors being used. These determine the
             meaning of $1, $2, etc. in the expression

           2) CompoundDescriptorNames:  a list/tuple of the names of the
             compound descriptors being used. These determine the
             meaning of $a, $b, etc. in the expression

           3) Expr: a string containing the expression to be used to
             evaluate the final result.

      - atomDict:
           a dictionary of atomic descriptors.  Each atomic entry is
           another dictionary containing the individual descriptors
           and their values

      - propDict:
           a dictionary of descriptors for the composition, keyed by
           compound descriptor name.

    **RETURNS:**

      the value of the descriptor, -666 if a problem was encountered

    **RAISES:**

      RuntimeError if a problem was encountered while the module-level
      __DEBUG flag is set.  KeyboardInterrupt and SystemExit are never
      converted into -666; they always propagate.

    **NOTE:**

      - because it takes rather a lot of work to get everything set
        up to calculate a descriptor, if you are calculating the
        same descriptor for multiple compounds, you probably want to
        be calling CalcMultipleCompoundsDescriptor().

      - the expression ends up in *eval*, so argVect must come from a
        trusted source.

    """
    # Bound up front so the debugging output below can always print them,
    # even when the failure happens before they would have been assigned.
    formula = None
    evalTarget = None
    try:
        atomVarNames = argVect[0]
        compositionVarNames = argVect[1]
        formula = argVect[2]
        formula = _SubForCompoundDescriptors(formula, compositionVarNames, 'propDict')
        formula = _SubForAtomicVars(formula, atomVarNames, 'atomDict')
        evalTarget = _SubMethodArgs(formula, knownMethods)
    except Exception:
        # Exception (rather than a bare except) so that ctrl-C and
        # sys.exit() are not silently turned into -666.
        if __DEBUG:
            import traceback
            print('Sub Failure!')
            traceback.print_exc()
            print(evalTarget)
            print(propDict)
            raise RuntimeError('Failure 1')
        else:
            return -666

    try:
        # The generated expression refers to the local names compos, atomDict
        # and propDict, so it has to be evaluated in this scope.
        v = eval(evalTarget)
    except Exception:
        if __DEBUG:
            import traceback
            # the with block guarantees the log is closed even if a write fails
            with open(RDConfig.RDCodeDir + '/ml/descriptors/log.txt', 'a+') as outF:
                outF.write('#------------------------------\n')
                outF.write('formula: %s\n' % repr(formula))
                outF.write('target: %s\n' % repr(evalTarget))
                outF.write('propDict: %s\n' % (repr(propDict)))
                try:
                    outF.write('keys: %s\n' % (repr(atomDict.keys())))
                except Exception:
                    outF.write('no atomDict\n')
            print('ick!')
            print('formula:', formula)
            print('target:', evalTarget)
            print('propDict:', propDict)
            print('keys:', atomDict.keys())
            traceback.print_exc()
            raise RuntimeError('Failure 2')
        else:
            v = -666
    return v
360
def CalcMultipleCompoundsDescriptor(composVect, argVect, atomDict, propDictList):
    """ calculates the value of the descriptor for a list of compounds

    **ARGUMENTS:**

      - composVect: a vector of vector/tuple containing the composition
         information.
         See CalcSingleCompoundDescriptor() for an explanation of the elements.

      - argVect: a vector/tuple with three elements:

           1) AtomicDescriptorNames:  a list/tuple of the names of the
             atomic descriptors being used. These determine the
             meaning of $1, $2, etc. in the expression

           2) CompoundDescriptorNames:  a list/tuple of the names of the
             compound descriptors being used. These determine the
             meaning of $a, $b, etc. in the expression

           3) Expr: a string containing the expression to be used to
             evaluate the final result.

      - atomDict:
           a dictionary of atomic descriptors.  Each atomic entry is
           another dictionary containing the individual descriptors
           and their values

      - propDictList:
           a vector of descriptor dictionaries, one per composition,
           in the same order as composVect.

    **RETURNS:**

      a vector containing the values of the descriptor for each
      compound.  Any given entry will be -666 if problems were
      encountered

    **NOTE:**

      - KeyboardInterrupt and SystemExit are never converted into -666;
        they always propagate, so a long calculation can be interrupted.

      - the expression ends up in *eval*, so argVect must come from a
        trusted source.

    """
    res = [-666] * len(composVect)
    try:
        atomVarNames = argVect[0]
        compositionVarNames = argVect[1]
        formula = argVect[2]
        formula = _SubForCompoundDescriptors(formula, compositionVarNames, 'propDict')
        formula = _SubForAtomicVars(formula, atomVarNames, 'atomDict')
        evalTarget = _SubMethodArgs(formula, knownMethods)
    except Exception:
        # the expression could not be prepared: every entry stays at -666
        return res

    # The generated expression refers to the local names compos, atomDict and
    # propDict, so those names are bound here and the eval stays in this scope.
    for i, compos in enumerate(composVect):
        propDict = propDictList[i]
        try:
            v = eval(evalTarget)
        except Exception:
            # Exception (rather than a bare except) so that ctrl-C still
            # stops the loop instead of being recorded as -666.
            v = -666
        res[i] = v
    return res
#------------
#  Demo/testing code
#------------
if __name__ == '__main__':
    # the same descriptor names serve as both the atomic ($1, $2) and the
    # compound ($a, $b) variables
    descriptorNames = [['d1', 'd2'], ['d1', 'd2']]
    aDict = {
        'Fe': {'d1': 1., 'd2': 2.},
        'Pt': {'d1': 10., 'd2': 20.},
    }
    pDict = {'d1': 100., 'd2': 200.}
    compos = [('Fe', 1), ('Pt', 1)]

    # the last entry is deliberately not a valid expression
    cExprs = [
        "SUM($1)",
        "SUM($1)+SUM($2)",
        "SUM($1)+SUM($1)",
        "MEAN($1)",
        "DEV($2)",
        "MAX($1)",
        "MIN($1)/MAX($1)",
        "MIN($2)",
        "SUM($1)/$a",
        "sqrt($a+$b)",
        "SUM((3.*$1)/($2))",
        "foo",
    ]

    for cExpr in cExprs:
        argVect = descriptorNames + [cExpr]
        print(cExpr)
        single = CalcSingleCompoundDescriptor(compos, argVect, aDict, pDict)
        print(single)
        multiple = CalcMultipleCompoundsDescriptor([compos, compos], argVect,
                                                   aDict, [pDict, pDict])
        print(multiple)