Package rdkit :: Package ML :: Package Descriptors :: Module CompoundDescriptors
[hide private]
[frames] | no frames]

Source Code for Module rdkit.ML.Descriptors.CompoundDescriptors

  1  # 
  2  #  Copyright (C) 2001,2002  greg Landrum and Rational Discovery LLC 
  3  # 
  4  """ descriptor calculator for compounds defined by a composition alone 
  5    (only the composition is required) 
  6   
  7  """ 
  8  from __future__ import print_function 
  9  from rdkit import RDConfig 
 10  from rdkit.utils import chemutils 
 11  import os 
 12  from rdkit.Dbase.DbConnection import DbConnect 
 13  from rdkit.ML.Descriptors import Parser,Descriptors 
 14  from rdkit.six.moves import xrange  
 15   
 16  # the list of possible ways to count valence electrons that we know 
 17  countOptions = [('NVAL','total number of valence electrons'), 
 18                  ('NVAL_NO_FULL_F','number of valence electrons neglecting filled f shells'), 
 19                  ('NVAL_NO_FULL_D','number of valence electrons neglecting filled d shells'), 
 20                  ('NVAL_NO_FULL','number of valence electrons neglecting filled f and d shells')] 
 21   
def GetAllDescriptorNames(db, tbl1, tbl2, user='sysdba', password='masterkey'):
  """ gets possible descriptor names from a database

    **Arguments**

      - db: the name of the database to use

      - tbl1: the name of the table to be used for reading descriptor values

      - tbl2: the name of the table to be used for reading notes about the
        descriptors (*descriptions of the descriptors if you like*)

      - user: the user name for DB access

      - password: the password for DB access

    **Returns**

      a 2-tuple containing:

        1) a list of column names

        2) a list of column descriptors, each a 2-tuple of
           (upper-cased property name, notes)

    **Notes**

      - this uses _Dbase.DbConnection.DbConnect_ for querying the database

      - it is assumed that tbl2 includes 'property' and 'notes' columns

      - the entries of the module-level _countOptions_ list are appended to
        both returned lists

  """
  conn = DbConnect(db, user=user, password=password)

  # Copy into a fresh list so the appends below work whatever sequence type
  # the connection hands back and never modify an object it still owns.
  colNames = list(conn.GetColumnNames(table=tbl1))
  # A real list (not a lazy map object) is required: we append to it below.
  colDesc = [(row[0].upper(), row[1])
             for row in conn.GetColumns('property,notes', table=tbl2)]
  for name, desc in countOptions:
    colNames.append(name)
    colDesc.append((name, desc))
  return colNames, colDesc
62
class CompoundDescriptorCalculator(Descriptors.DescriptorCalculator):
  """ used for calculating descriptors

   This is the central point for descriptor calculation

   **Notes**

   - There are two kinds of descriptors this cares about:

      1) *Simple Descriptors* can be calculated solely using atomic descriptor
         values and the composition of the compound.  The full list of possible
         simple descriptors is determined by the types of *Calculator Methods*
         (see below) and the contents of an atomic database.

         Simple Descriptors can be marked as *nonZeroDescriptors*.  These are used
         to winnow out atom types where particular atomic descriptors are zero
         (usually indicating that the value is unknown)

         Simple Descriptors are maintained locally in the _simpleList_

      2) *Compound Descriptors* may rely upon more complicated computation schemes
         and descriptors for the compound as a whole (e.g. structural variables, etc.).
         The full list of compound descriptors is limitless.  They are calculated using
         the _ML.Descriptors.Parser_ module.

         Compound Descriptors are maintained locally in the _compoundList_

   - This class has some special methods which are labelled as *Calculator Method*.
     These are used internally to take atomic descriptors and reduce them to a single
     simple descriptor value for a composition.  They are primarily intended for
     internal use.

   - a *composition vector* is a list of 2-tuples: '[(atom1name,atom1Num),...]'
     where atom1Num is the contribution of the atom to the stoichiometry of the
     compound. No assumption is made about the stoichiometries (i.e. they don't
     have to be either integral or all sum to one).

   - _BuildAtomDict()_ must be called before any of the calculation methods,
     since that is what creates _atomDict_.

  """

  #------------
  #  methods used to calculate descriptors
  #------------

  def SUM(self, desc, compos):
    """ *Calculator Method*

      sums the descriptor values across the composition

      **Arguments**

        - desc: the name of the descriptor

        - compos: the composition vector

      **Returns**

        a float (0.0 for an empty composition)

    """
    # start from 0.0 so that the result is a float even for integer data
    return sum((self.atomDict[atom][desc] * num for atom, num in compos), 0.0)

  def MEAN(self, desc, compos):
    """ *Calculator Method*

      averages the descriptor values across the composition, weighting each
      atom by its count

      **Arguments**

        - desc: the name of the descriptor

        - compos: the composition vector

      **Returns**

        a float

      **Notes**

        - a composition whose counts sum to zero (including an empty one)
          raises a _ZeroDivisionError_

    """
    total = 0.0
    nAtoms = 0.0
    for atom, num in compos:
      total += self.atomDict[atom][desc] * num
      nAtoms += num
    return total / nAtoms

  def DEV(self, desc, compos):
    """ *Calculator Method*

      average (absolute) deviation of the descriptor values across the
      composition, weighting each atom by its count

      **Arguments**

        - desc: the name of the descriptor

        - compos: the composition vector

      **Returns**

        a float

      **Notes**

        - a composition whose counts sum to zero (including an empty one)
          raises a _ZeroDivisionError_

    """
    mean = self.MEAN(desc, compos)
    total = 0.0
    nAtoms = 0.0
    for atom, num in compos:
      total += abs(self.atomDict[atom][desc] - mean) * num
      nAtoms += num
    return total / nAtoms

  def MIN(self, desc, compos):
    """ *Calculator Method*

      minimum of the descriptor values across the composition
      (the atom counts are ignored)

      **Arguments**

        - desc: the name of the descriptor

        - compos: the composition vector

      **Returns**

        the smallest value stored in _atomDict_ for the atoms present

      **Notes**

        - an empty composition raises a _ValueError_

    """
    return min([self.atomDict[entry[0]][desc] for entry in compos])

  def MAX(self, desc, compos):
    """ *Calculator Method*

      maximum of the descriptor values across the composition
      (the atom counts are ignored)

      **Arguments**

        - desc: the name of the descriptor

        - compos: the composition vector

      **Returns**

        the largest value stored in _atomDict_ for the atoms present

      **Notes**

        - an empty composition raises a _ValueError_

    """
    return max([self.atomDict[entry[0]][desc] for entry in compos])

  #------------
  #  Other methods
  #------------

  def ProcessSimpleList(self):
    """ Handles the list of simple descriptors

      This constructs the list of _nonZeroDescriptors_ and _requiredDescriptors_.

      For every simple descriptor that carries the 'NONZERO' operation:

        - unless the descriptor is one of the _countOptions_, a
          '<name> != 0' clause is added to _nonZeroDescriptors_

        - 'NONZERO' is removed from the descriptor's operation list; if it
          was the only operation the whole entry is removed from _simpleList_

      _requiredDescriptors_ then holds the names left in _simpleList_,
      excluding the _countOptions_ names.

      **Notes**

        - this modifies _simpleList_ in place

        - the _countOptions_ names are presumably supplied by
          _chemutils.GetAtomicData_ (see _BuildAtomDict_) rather than read
          as database columns, which would explain why they are excluded
          here -- TODO confirm

    """
    # real lists are needed here: the names are tested repeatedly and the
    # required descriptors are extended later by ProcessCompoundList()
    countNames = [opt[0] for opt in countOptions]

    self.nonZeroDescriptors = []
    # iterate over a copy, entries may be removed from simpleList as we go
    for entry in self.simpleList[:]:
      descName, ops = entry[0], entry[1]
      if 'NONZERO' not in ops:
        continue
      if descName not in countNames:
        self.nonZeroDescriptors.append('%s != 0' % descName)
      if len(ops) == 1:
        # NONZERO was the only request, nothing is left to calculate
        self.simpleList.remove(entry)
      else:
        ops.remove('NONZERO')

    self.requiredDescriptors = [entry[0] for entry in self.simpleList
                                if entry[0] not in countNames]

  def ProcessCompoundList(self):
    """ Adds entries from the _compoundList_ to the list of _requiredDescriptors_

      Each compound descriptor is surveyed.  Any atomic descriptors it requires
      are added to the list of _requiredDescriptors_ to be pulled from the database.

      **Notes**

        - _ProcessSimpleList()_ must have been called first, as it creates
          _requiredDescriptors_

    """
    # add in the atomic descriptors we will need
    for entry in self.compoundList:
      for atomicDesc in entry[1]:
        if atomicDesc != '' and atomicDesc not in self.requiredDescriptors:
          self.requiredDescriptors.append(atomicDesc)

  def BuildAtomDict(self):
    """ builds the local atomic dict

     We don't want to keep around all descriptor values for all atoms, so this
     method takes care of only pulling out the descriptors in which we are
     interested.

     **Notes**

       - this uses _chemutils.GetAtomicData_ to actually pull the data

       - the entries of _nonZeroDescriptors_ are joined with ' and ' to form
         the 'where' clause handed to _chemutils.GetAtomicData_

    """
    self.ProcessSimpleList()
    self.ProcessCompoundList()

    self.atomDict = {}
    whereString = ' and '.join(self.nonZeroDescriptors)
    if whereString != '':
      whereString = 'where ' + whereString
    chemutils.GetAtomicData(self.atomDict, self.requiredDescriptors,
                            self.dbName, self.dbTable,
                            whereString, self.dbUser, self.dbPassword,
                            includeElCounts=1)

  def CalcSimpleDescriptorsForComposition(self, compos='', composList=None):
    """ calculates all simple descriptors for a given composition

      **Arguments**

        - compos: a string representation of the composition

        - composList: a *composVect*

        The client must provide either _compos_ or _composList_.  If both are
        provided, _composList_ takes priority.

      **Returns**
        the list of descriptor values

      **Notes**

        - when _compos_ is provided, this uses _chemutils.SplitComposition_
          to split the composition into its individual pieces

        - if problems are encountered because of either an unknown descriptor or
          atom type, a _KeyError_ will be raised.

        - operations which do not name a method of this class are reported
          on stdout and skipped

    """
    if composList is None:
      composList = chemutils.SplitComposition(compos)
    res = []
    try:
      for descName, targets in self.simpleList:
        for target in targets:
          try:
            method = getattr(self, target)
          except AttributeError:
            print('Method %s does not exist' % (target))
          else:
            res.append(method(descName, composList))
    except KeyError:
      # wrap in a 1-tuple so that a tuple-valued composition formats correctly
      print('composition %s caused problems' % (composList, ))
      # re-raise the original exception so its key and traceback are kept
      raise
    return res

  def CalcCompoundDescriptorsForComposition(self, compos='', composList=None,
                                            propDict=None):
    """ calculates all compound descriptors for a given composition

      **Arguments**

        - compos: a string representation of the composition

        - composList: a *composVect*

        - propDict: a dictionary containing the properties of the composition
          as a whole (e.g. structural variables, etc.).  If not provided, an
          empty dictionary is used.

        The client must provide either _compos_ or _composList_.  If both are
        provided, _composList_ takes priority.

      **Returns**
        the list of descriptor values

      **Notes**

        - when _compos_ is provided, this uses _chemutils.SplitComposition_
          to split the composition into its individual pieces

        - the values are calculated using _Parser.CalcSingleCompoundDescriptor_

    """
    # a fresh dict per call instead of a shared mutable default argument
    if propDict is None:
      propDict = {}
    if composList is None:
      composList = chemutils.SplitComposition(compos)
    # entry[0] is the descriptor name; the parser gets everything after it
    return [Parser.CalcSingleCompoundDescriptor(composList, entry[1:],
                                                self.atomDict, propDict)
            for entry in self.compoundList]

  def CalcDescriptorsForComposition(self, composVect, propDict):
    """ calculates all descriptors for a given composition

      **Arguments**

        - composVect: a sequence whose first element is a string
          representation of the composition

        - propDict: a dictionary containing the properties of the composition
          as a whole (e.g. structural variables, etc.).  These are used to
          generate Compound Descriptors

      **Returns**
        a tuple of all descriptor values (simple descriptors first, then
        compound descriptors).  If the simple descriptors cannot be
        calculated because of a _KeyError_, an empty tuple is returned.

      **Notes**

        - this uses _chemutils.SplitComposition_
          to split the composition into its individual pieces

    """
    composList = chemutils.SplitComposition(composVect[0])
    try:
      r1 = self.CalcSimpleDescriptorsForComposition(composList=composList)
    except KeyError:
      # unknown atom or descriptor: report "no descriptors" to the caller
      res = []
    else:
      r2 = self.CalcCompoundDescriptorsForComposition(composList=composList,
                                                      propDict=propDict)
      res = r1 + r2

    return tuple(res)

  CalcDescriptors = CalcDescriptorsForComposition

  def GetDescriptorNames(self):
    """ returns the names of the descriptors this calculator generates

      Simple descriptors are named '<OPERATION>_<DESCRIPTOR>'; compound
      descriptors use the name provided in the _compoundList_.

      **Notes**

        - the names are calculated once and cached in _descriptorNames_

        - the first call returns a tuple, later calls return the cached list

        - operations which do not name a method of this class are reported
          on stdout and skipped

    """
    if self.descriptorNames is not None:
      return self.descriptorNames

    res = []
    for descName, targets in self.simpleList:
      for target in targets:
        try:
          getattr(self, target)
        except AttributeError:
          print('Method %s does not exist' % (target))
        else:
          res.append('%s_%s' % (target, descName))
    for entry in self.compoundList:
      res.append(entry[0])
    self.descriptorNames = res[:]
    return tuple(res)

  def __init__(self, simpleList, compoundList=None,
               dbName=None,
               dbTable='atomic_data', dbUser='sysdba', dbPassword='masterkey'):
    """ Constructor

      **Arguments**

        - simpleList: list of simple descriptors to be calculated
              (see below for format)

        - compoundList: list of compound descriptors to be calculated
              (see below for format)

        - dbName: name of the atomic database to be used
              (defaults to _RDConfig.RDDataDatabase_)

        - dbTable: name the table in _dbName_ which has atomic data

        - dbUser: user name for DB access

        - dbPassword: password for DB access

      **Note**

        - format of simpleList:
           a list of 2-tuples containing:

             1) name of the atomic descriptor

             2) a list of operations on that descriptor (e.g. NonZero, Max, etc.)
                These must correspond to the *Calculator Method* names above.

           both the names and the operations are converted to upper case.

        - format of compoundList:
           a list of 4-tuples containing:

             1) name of the descriptor to be calculated

             2) list of selected atomic descriptor names (define $1, $2, etc.)

             3) list of selected compound descriptor names (define $a, $b, etc.)

             4) text formula defining the calculation (see _Parser_)

    """
    if dbName is None:
      dbName = RDConfig.RDDataDatabase

    Descriptors.DescriptorCalculator.__init__(self)
    self.simpleList = [(x[0].upper(), [y.upper() for y in x[1]])
                       for x in simpleList]
    self.descriptorNames = None
    self.compoundList = compoundList
    if self.compoundList is None:
      self.compoundList = []
    self.dbName = dbName
    self.dbTable = dbTable
    self.dbUser = dbUser
    self.dbPassword = dbPassword
if __name__ == '__main__':
  # Small manual demo; it needs access to the default atomic database.
  d = [('DED', ['NonZero', 'Mean', 'Dev']),
       ('M_B_electroneg', ['NonZero']),
       ('Cov_rad', ['Max', 'Min'])]
  # the calculator defined in this module is CompoundDescriptorCalculator
  o = CompoundDescriptorCalculator(d)
  o.BuildAtomDict()
  print('len:', len(o.atomDict.keys()))
  # dict views cannot be sliced, so materialise the keys first
  for key in list(o.atomDict.keys())[-4:-1]:
    print(key, o.atomDict[key])

  print('descriptors:', o.GetDescriptorNames())
  composList = ['Nb', 'Nb3', 'NbPt', 'Nb2Pt']
  for compos in composList:
    descs = o.CalcSimpleDescriptorsForComposition(compos)
    print(compos, descs)