1
2
3
4
5
6
7
8 """ Utilities for data manipulation
9
10 **FILE FORMATS:**
11
12 - *.qdat files* contain quantized data suitable for
13 feeding to learning algorithms.
14
15 The .qdat file, written by _DecTreeGui_, is structured as follows:
16
17 1) Any number of lines which are ignored.
18
19 2) A line containing the string 'Variable Table'
20
21 any number of variable definitions in the format:
22
23 '# Variable_name [quant_bounds]'
24
25 where '[quant_bounds]' is a list of the boundaries used for quantizing
26 that variable. If the variable is inherently integral (i.e. not
27 quantized), this can be an empty list.
28
29 3) A line beginning with '# ----' which signals the end of the variable list
30
31 4) Any number of lines containing data points, in the format:
32
33 'Name_of_point var1 var2 var3 .... varN'
34
35 all variable values should be integers
36
37 Throughout, it is assumed that varN is the result
38
39 - *.dat files* contain the same information as .qdat files, but the variable
40 values can be anything (floats, ints, strings). **These files should
41 still contain quant_bounds!**
42
43 - *.qdat.pkl files* contain a pickled (binary) representation of
44 the data read in. They store, in order:
45
46 1) A python list of the variable names
47
48 2) A python list of lists with the quantization bounds
49
50 3) A python list of the point names
51
52 4) A python list of lists with the data points
53
54 """
55 from __future__ import print_function
56 import re,csv
57 import random
58
59 from rdkit import six
60 from rdkit.six.moves import cPickle
61 from rdkit.six.moves import xrange, map
62 from rdkit import RDConfig
63 from rdkit.utils import fileutils
64 from rdkit.ML.Data import MLData
65 from rdkit.Dbase.DbConnection import DbConnect
66 from rdkit.DataStructs import BitUtils
67
72
def WriteData(outFile, varNames, qBounds, examples):
    """ writes out a .qdat file

    **Arguments**

      - outFile: a file object (anything providing a _write()_ method)

      - varNames: a list of variable names

      - qBounds: the list of quantization bounds (should be the same length
         as _varNames_)

      - examples: the data to be written; every example is a sequence whose
         elements are converted with _str()_ and joined by single spaces

    **Notes**

      - an _IndexError_ is raised if _qBounds_ is shorter than _varNames_;
        extra entries in _qBounds_ are ignored

    """
    outFile.write('# Quantized data from DataUtils\n')
    outFile.write('# ----------\n')
    outFile.write('# Variable Table\n')
    for idx, name in enumerate(varNames):
        # index into qBounds (rather than zip) so a too-short list still fails loudly
        outFile.write('# %s %s\n' % (name, str(qBounds[idx])))
    # this marker terminates the variable table
    outFile.write('# ----------\n')
    for example in examples:
        outFile.write(' '.join([str(value) for value in example]) + '\n')
96
97
def ReadVars(inFile):
    """ reads the variables and quantization bounds from a .qdat or .dat file

    **Arguments**

      - inFile: a file object

    **Returns**

      a 2-tuple containing:

        1) varNames: a list of the variable names

        2) qbounds: the list of quantization bounds for each variable
           (a list of floats per variable, empty when no bounds are given)

    **Notes**

      - the file is advanced to the line matching 'Variable Table' and then
        read until a line containing '# ----' is found

      - an _IndexError_ is raised if a line of the table does not contain a
        '[' (this includes running off the end of the file before the
        terminating '# ----' line is seen)

    """
    fileutils.MoveToMatchingLine(inFile, 'Variable Table')
    varNames = []
    qBounds = []
    inLine = inFile.readline()
    while inLine.find('# ----') == -1:
        # drop the leading '# ', then separate the name from its '[...]' list
        splitLine = inLine[2:].split('[')
        varNames.append(splitLine[0].strip())
        # take the text up to the closing bracket instead of slicing off a
        # fixed two characters, so the line terminator does not matter
        boundsText = splitLine[1].split(']')[0].strip()
        if boundsText:
            qBounds.append([float(item) for item in boundsText.split(',')])
        else:
            qBounds.append([])
        inLine = inFile.readline()
    return varNames, qBounds
133
def ReadQuantExamples(inFile):
    """ reads the examples from a .qdat file

    **Arguments**

      - inFile: a file object

    **Returns**

      a 2-tuple containing:

        1) the names of the examples

        2) a list of lists containing the examples themselves

    **Note**

      because this is reading a .qdat file, it assumed that all variable values
      are integers (a _ValueError_ is raised by _int()_ otherwise)

    """
    # fields are separated by runs of spaces or runs of tabs.  The pattern
    # must not be able to match the empty string: from Python 3.7 on,
    # re.split() also splits on empty matches, which would break every line
    # into single characters.
    splitter = re.compile(r'[\ ]+|[\t]+')
    names = []
    examples = []
    inLine = inFile.readline()
    while inLine:
        # lines starting with '#' are comments / header lines
        if not inLine.startswith('#'):
            fields = splitter.split(inLine)
            # lines with fewer than two fields (e.g. blank lines) are skipped
            if len(fields) > 1:
                examples.append([int(field) for field in fields[1:]])
                names.append(fields[0])
        inLine = inFile.readline()
    return names, examples
168
def ReadGeneralExamples(inFile):
    """ reads the examples from a .dat file

    **Arguments**

      - inFile: a file object

    **Returns**

      a 2-tuple containing:

        1) the names of the examples

        2) a list of lists containing the examples themselves

    **Note**

      - this attempts to convert variable values to ints, then floats.
        if those both fail, they are left as strings

      - the last whitespace-delimited token of every line is discarded
        before conversion

    """
    # fields are separated by runs of spaces or runs of tabs.  The pattern
    # must not be able to match the empty string: from Python 3.7 on,
    # re.split() also splits on empty matches, which would break every line
    # into single characters.
    splitter = re.compile(r'[\ ]+|[\t]+')
    names = []
    examples = []
    inLine = inFile.readline()
    while inLine:
        # lines starting with '#' are comments / header lines
        if not inLine.startswith('#'):
            # NOTE(review): the final token is always dropped.  That removes
            # the bare newline token left by a line ending in whitespace, but
            # it also removes the last real value when the line has no
            # trailing whitespace -- confirm the expected .dat layout.
            fields = splitter.split(inLine)[:-1]
            if len(fields) > 1:
                values = []
                for text in fields[1:]:
                    try:
                        value = int(text)
                    except ValueError:
                        try:
                            value = float(text)
                        except ValueError:
                            # neither int nor float: keep the raw string
                            value = text
                    values.append(value)
                examples.append(values)
                names.append(fields[0])
        inLine = inFile.readline()
    return names, examples
212
214 """ builds a data set from a .qdat file
215
216 **Arguments**
217
218 - fileName: the name of the .qdat file
219
220 **Returns**
221
222 an _MLData.MLQuantDataSet_
223
224 """
225 with open(fileName,'r') as inFile:
226 varNames,qBounds = ReadVars(inFile)
227 ptNames,examples = ReadQuantExamples(inFile)
228 data = MLData.MLQuantDataSet(examples,qBounds=qBounds,varNames=varNames,
229 ptNames=ptNames)
230 return data
231
232
234 """ builds a data set from a .dat file
235
236 **Arguments**
237
238 - fileName: the name of the .dat file
239
240 **Returns**
241
242 an _MLData.MLDataSet_
243
244 """
245 with open(fileName,'r') as inFile:
246 varNames,qBounds = ReadVars(inFile)
247 ptNames,examples = ReadGeneralExamples(inFile)
248 data = MLData.MLDataSet(examples,qBounds=qBounds,varNames=varNames,
249 ptNames=ptNames)
250 return data
251
252
254 """ calculates the number of possible values for each variable in a data set
255
256 **Arguments**
257
258 - data: a list of examples
259
260 - order: the ordering map between the variables in _data_ and _qBounds_
261
262 - qBounds: the quantization bounds for the variables
263
264 **Returns**
265
266 a list with the number of possible values each variable takes on in the data set
267
268 **Notes**
269
270 - variables present in _qBounds_ will have their _nPossible_ number read
271 from _qbounds
272
273 - _nPossible_ for other numeric variables will be calculated
274
275 """
276 numericTypes = [int, float]
277 if six.PY2:
278 numericTypes.append(long)
279
280 print('order:',order, len(order))
281 print('qB:',qBounds)
282
283 assert (qBounds and len(order)==len(qBounds)) or (nQBounds and len(order)==len(nQBounds)),\
284 'order/qBounds mismatch'
285 nVars = len(order)
286 nPossible = [-1]*nVars
287 cols = range(nVars)
288 for i in xrange(nVars):
289 if nQBounds and nQBounds[i] != 0:
290 nPossible[i] = -1
291 cols.remove(i)
292 elif len(qBounds[i])>0:
293 nPossible[i] = len(qBounds[i])
294 cols.remove(i)
295
296 nPts = len(data)
297 for i in xrange(nPts):
298 for col in cols[:]:
299 d = data[i][order[col]]
300 if type(d) in numericTypes:
301 if int(d) == d:
302 nPossible[col] = max(int(d),nPossible[col])
303 else:
304 nPossible[col] = -1
305 cols.remove(col)
306 else:
307 print('bye bye col %d: %s'%(col,repr(d)))
308 nPossible[col] = -1
309 cols.remove(col)
310
311 return list(map(lambda x:int(x)+1,nPossible))
312
313
314
316 """ writes either a .qdat.pkl or a .dat.pkl file
317
318 **Arguments**
319
320 - outName: the name of the file to be used
321
322 - data: either an _MLData.MLDataSet_ or an _MLData.MLQuantDataSet_
323
324 """
325 varNames = data.GetVarNames()
326 qBounds = data.GetQuantBounds()
327 ptNames = data.GetPtNames()
328 examples = data.GetAllData()
329 with open(outName,'wb+') as outFile:
330 cPickle.dump(varNames,outFile)
331 cPickle.dump(qBounds,outFile)
332 cPickle.dump(ptNames,outFile)
333 cPickle.dump(examples,outFile)
334
def TakeEnsemble(vect, ensembleIds, isDataVect=False):
    """ picks the elements of _vect_ selected by _ensembleIds_

    **Arguments**

      - vect: an indexable sequence

      - ensembleIds: the indices of the elements to keep, in output order

      - isDataVect: if set, _vect_ is treated as a data vector: its first and
        last elements are always kept and _ensembleIds_ are offset by one so
        that they count from the element after the first

    **Returns**

      a new list with the selected elements

    >>> v = [10,20,30,40,50]
    >>> TakeEnsemble(v,(1,2,3))
    [20, 30, 40]
    >>> v = ['foo',10,20,30,40,50,1]
    >>> TakeEnsemble(v,(1,2,3),isDataVect=True)
    ['foo', 20, 30, 40, 1]

    """
    if not isDataVect:
        return [vect[idx] for idx in ensembleIds]
    # skip over the leading element when indexing; keep both end elements
    return [vect[0]] + [vect[idx + 1] for idx in ensembleIds] + [vect[-1]]
354
355
356
def DBToData(dbName, tableName, user='sysdba', password='masterkey', dupCol=-1,
             what='*', where='', join='', pickleCol=-1, pickleClass=None,
             ensembleIds=None):
    """ constructs an _MLData.MLDataSet_ from a database

    **Arguments**

      - dbName: the name of the database to be opened

      - tableName: the table name containing the data in the database

      - user: the user name to be used to connect to the database

      - password: the password to be used to connect to the database

      - dupCol: passed to the connection's _GetData()_ as _removeDups_;
        specifies which column should be used to recognize duplicates

      - what: passed to _GetData()_ as _fields_ and to _GetColumnNames()_

      - where: passed to _GetData()_ as _where_

      - join: passed to _GetData()_ and _GetColumnNames()_ as _join_

      - pickleCol: if >=0, the index (counted after the leading point-name
        column has been removed) of a column holding serialized objects

      - pickleClass: optional callable tried first to rebuild the object in
        _pickleCol_ from its string form; after its first failure unpickling
        is used for that row and all remaining rows

      - ensembleIds: optional sequence of indices.  With _pickleCol_ set, it
        is handed to _BitUtils.ConstructEnsembleBV()_ together with the
        rebuilt object; otherwise it selects columns via _TakeEnsemble()_

    **Returns**

       an _MLData.MLDataSet_

    **Notes**

      - this uses Dbase.DataUtils functionality

      - the first column of every row is used as the point name

      - NOTE(review): column contents are unpickled; only use this with
        databases whose contents are trusted.

      - NOTE(review): the default _user_/_password_ are hard-coded
        credentials; callers should pass their own.

    """
    conn = DbConnect(dbName, tableName, user, password)
    res = conn.GetData(fields=what, where=where, join=join, removeDups=dupCol,
                       forceList=1)
    nPts = len(res)
    vals = [None] * nPts
    ptNames = [None] * nPts
    # becomes False once pickleClass has failed, so it is not retried per row
    classWorks = True
    for i in range(nPts):
        tmp = list(res[i])
        ptNames[i] = tmp.pop(0)
        if pickleCol >= 0:
            # NOTE(review): str() of a bytes value is not the raw payload on
            # Python 3 -- confirm what the DB layer returns here.
            if not pickleClass or not classWorks:
                tmp[pickleCol] = cPickle.loads(str(tmp[pickleCol]))
            else:
                try:
                    tmp[pickleCol] = pickleClass(str(tmp[pickleCol]))
                except Exception:
                    # the constructor could not digest the data: fall back to
                    # unpickling (KeyboardInterrupt/SystemExit still propagate)
                    tmp[pickleCol] = cPickle.loads(str(tmp[pickleCol]))
                    classWorks = False
            if ensembleIds:
                tmp[pickleCol] = BitUtils.ConstructEnsembleBV(tmp[pickleCol], ensembleIds)
        elif ensembleIds:
            tmp = TakeEnsemble(tmp, ensembleIds, isDataVect=True)
        vals[i] = tmp
    varNames = conn.GetColumnNames(join=join, what=what)
    data = MLData.MLDataSet(vals, varNames=varNames, ptNames=ptNames)
    return data
412
def TextToData(reader, ignoreCols=None, onlyCols=None):
    """ constructs an _MLData.MLDataSet_ from a bunch of text

    **Arguments**

      - reader: needs to be iterable and return lists of elements
        (like a csv.reader).  The first row is taken as the column names.

      - ignoreCols: optional collection of column names to leave out
        (only used when _onlyCols_ is not provided)

      - onlyCols: optional sequence of column names to keep, in the order
        in which they should appear in the result

    **Returns**

       an _MLData.MLDataSet_

    **Notes**

      - the first kept column provides the point names; the remaining kept
        columns are converted to ints, then floats, and are left as strings
        if both conversions fail

      - empty rows are skipped; a _ValueError_ is raised for a non-empty row
        whose length differs from the header row, and for names in
        _onlyCols_ that do not occur in the header row

    """
    varNames = next(reader)
    if not onlyCols:
        ignored = ignoreCols if ignoreCols else ()
        keepCols = [i for i, name in enumerate(varNames) if name not in ignored]
    else:
        # -1 marks "not found yet"; slots are filled in onlyCols order
        keepCols = [-1] * len(onlyCols)
        for i, name in enumerate(varNames):
            if name in onlyCols:
                keepCols[onlyCols.index(name)] = i
        # a leftover -1 would silently select the last column of every row
        missing = [str(name) for name, idx in zip(onlyCols, keepCols) if idx < 0]
        if missing:
            raise ValueError('requested columns not found: %s' % ', '.join(missing))

    nCols = len(varNames)
    varNames = tuple([varNames[x] for x in keepCols])
    nVars = len(varNames)
    vals = []
    ptNames = []
    for splitLine in reader:
        if len(splitLine):
            if len(splitLine) != nCols:
                raise ValueError('unequal line lengths')
            tmp = [splitLine[x] for x in keepCols]
            ptNames.append(tmp[0])
            pt = [None] * (nVars - 1)
            for j in range(nVars - 1):
                raw = tmp[j + 1]
                try:
                    val = int(raw)
                except (ValueError, TypeError):
                    try:
                        val = float(raw)
                    except (ValueError, TypeError):
                        val = str(raw)
                pt[j] = val
            vals.append(pt)
    data = MLData.MLDataSet(vals, varNames=varNames, ptNames=ptNames)
    return data
462
def TextFileToData(fName, onlyCols=None):
    """ constructs a data set from a delimited text file

    **Arguments**

      - fName: the name of the file to read.  If the text after the last '.'
        is 'csv' (in any case) the file is parsed as comma-separated,
        otherwise it is parsed as tab-separated.

      - onlyCols: passed on to _TextToData()_

    **Returns**

       whatever _TextToData()_ returns for the file's rows

    """
    ext = fName.split('.')[-1]
    if ext.upper() == 'CSV':
        delimiter = ','
    else:
        delimiter = '\t'
    with open(fName, 'r') as inF:
        splitter = csv.reader(inF, delimiter=delimiter)
        # the rows must be consumed while the file is still open
        res = TextToData(splitter, onlyCols=onlyCols)
    return res
477
479 """ Seeds the random number generators
480
481 **Arguments**
482
483 - seed: a 2-tuple containing integers to be used as the random number seeds
484
485 **Notes**
486
487 this seeds both the RDRandom generator and the one in the standard
488 Python _random_ module
489
490 """
491 from rdkit import RDRandom
492 RDRandom.seed(seed[0])
493 import random
494 random.seed(seed[0])
495
def FilterData(inData, val, frac, col=-1, indicesToUse=None, indicesOnly=0):
    """ randomly discards points so that the points having value _val_ in
    column _col_ make up (approximately) the fraction _frac_ of what is kept

    **Arguments**

      - inData: an indexable collection of data points

      - val: the target value

      - frac: the desired fraction of target points, between 0 and 1

      - col: the column of each point which is compared with _val_

      - indicesToUse: optional list of indices into _inData_; if provided
        only those points are considered

      - indicesOnly: if set, indices into _inData_ are returned instead of
        the points themselves

    **Returns**

      a 2-tuple _(kept, rejected)_ of points (or indices), each in random
      order

    **Notes**

      - a _ValueError_ is raised if _frac_ is outside [0,1], if
        _inData[0][col]_ raises an _IndexError_ (which includes empty
        _inData_), or if no point has the target value

      - if the target is under-represented all target points are kept and
        other points are discarded; otherwise all other points are kept and
        target points are discarded

      - when every point considered has the target value the computation
        divides by zero (_ZeroDivisionError_)

      - the module-level _permutation()_ helper (not visible in this section
        of the file) supplies the random orderings

    """
    if frac < 0 or frac > 1:
        raise ValueError('filter fraction out of bounds')
    try:
        inData[0][col]
    except IndexError:
        raise ValueError('target column index out of range')

    # work on the requested subset of the data
    if indicesToUse:
        tmp = [inData[x] for x in indicesToUse]
    else:
        tmp = list(inData)
    nOrig = len(tmp)

    # sort on the target column so that the target points are contiguous,
    # remembering the ordering so that indices can be mapped back later
    sortOrder = sorted(range(nOrig), key=lambda x: tmp[x][col])
    tmp = [tmp[x] for x in sortOrder]

    # locate the first point with the target value ...
    start = 0
    while start < nOrig and tmp[start][col] != val:
        start += 1
    if start >= nOrig:
        # %s rather than %d: the value need not be numeric
        raise ValueError('target value (%s) not found in data' % (val,))

    # ... and one past the last one
    finish = start + 1
    while finish < nOrig and tmp[finish][col] == val:
        finish += 1

    nWithVal = finish - start
    nOthers = len(tmp) - nWithVal

    currFrac = float(nWithVal) / nOrig
    if currFrac < frac:
        # too few target points: keep them all and shrink the set of others
        nTgtFinal = nWithVal
        nFinal = int(round(nWithVal / frac))
        nOthersFinal = nFinal - nTgtFinal
        # rounding may have pushed the fraction above the request; drop
        # target points until it no longer exceeds it
        while float(nTgtFinal) / nFinal > frac:
            nTgtFinal -= 1
            nFinal -= 1
    else:
        # too many target points: keep all the others and shrink the targets
        nOthersFinal = nOthers
        nFinal = int(round(nOthers / (1 - frac)))
        nTgtFinal = nFinal - nOthersFinal
        # rounding may have left the fraction below the request; drop
        # other points until it is reached
        while float(nTgtFinal) / nFinal < frac:
            nOthersFinal -= 1
            nFinal -= 1

    # randomly choose which of the non-target points survive
    others = list(range(start)) + list(range(finish, nOrig))
    othersTake = permutation(nOthers)
    others = [others[x] for x in othersTake[:nOthersFinal]]

    # randomly choose which of the target points survive
    targets = list(range(start, finish))
    targetsTake = permutation(nWithVal)
    targets = [targets[x] for x in targetsTake[:nTgtFinal]]

    # a set makes the membership tests below constant time
    indicesToKeep = set(targets + others)

    res = []
    rej = []
    if not indicesOnly:
        for i in permutation(nOrig):
            if i in indicesToKeep:
                res.append(tmp[i])
            else:
                rej.append(tmp[i])
    else:
        # map positions in the sorted subset back to indices into inData
        for i in permutation(nOrig):
            if not indicesToUse:
                idx = sortOrder[i]
            else:
                idx = indicesToUse[sortOrder[i]]
            if i in indicesToKeep:
                res.append(idx)
            else:
                rej.append(idx)
    return res, rej
610
612 """ #DOC
613 """
614 counts = {}
615 for p in inData:
616 if not bounds:
617 r = p[col]
618 else:
619 act = p[col]
620 bound = 0
621 placed = 0
622 while not placed and bound < len(bounds):
623 if act < bounds[bound]:
624 r = bound
625 placed = 1
626 else:
627 bound += 1
628 if not placed:
629 r = bound
630
631 counts[r] = counts.get(r,0)+1
632 return counts
633
634
636 """ randomizes the activity values of a dataset
637
638 **Arguments**
639
640 - dataSet: a _ML.Data.MLQuantDataSet_, the activities here will be randomized
641
642 - shuffle: an optional toggle. If this is set, the activity values
643 will be shuffled (so the number in each class remains constant)
644
645 - runDetails: an optional CompositeRun object
646
647 **Note**
648
649 - _examples_ are randomized in place
650
651
652 """
653 nPossible = dataSet.GetNPossibleVals()[-1]
654 nPts = dataSet.GetNPts()
655 if shuffle:
656 if runDetails: runDetails.shuffled = 1
657 acts = dataSet.GetResults()[:]
658 random.shuffle(acts,random=random.random)
659 else:
660 if runDetails: runDetails.randomized = 1
661 acts = [random.randint(0,nPossible) for x in len(examples)]
662 for i in range(nPts):
663 tmp = dataSet[i]
664 tmp[-1] = acts[i]
665 dataSet[i] = tmp
666
667
668
669
670
671
672
673
674
def _test():
    """ runs the doctests found in the module registered as "__main__"

    **Returns**

      the result of _doctest.testmod()_, which unpacks as
      (number of failures, number of tests attempted)

    """
    import doctest
    import sys
    return doctest.testmod(sys.modules["__main__"])
678
if __name__ == '__main__':
    # run the doctests; the exit status is the number of failures
    import sys
    failed, tried = _test()
    sys.exit(failed)
683