1
2
3
4
5 """ classes to be used to help work with data sets
6
7 """
8 from __future__ import print_function
9 import numpy
10 import math
11 import copy,types
12 from rdkit import six
13 from rdkit.six.moves import xrange
14
# Types that are treated as numeric when inspecting data values.
# `long` is only evaluated when six reports a Python 2 interpreter.
numericTypes = [int, float] + ([long] if six.PY2 else [])
18
20 """ A data set for holding general data (floats, ints, and strings)
21
22 **Note**
23 this is intended to be a read-only data structure
24 (i.e. after calling the constructor you cannot touch it)
25 """
def __init__(self, data, nVars=None, nPts=None, nPossibleVals=None,
             qBounds=None, varNames=None, ptNames=None, nResults=1):
    """ Constructor

    **Arguments**

      - data: a list of lists containing the data. Each row is copied
        (one level deep), so the caller's row lists are not overwritten.

      - nVars: the number of variables. When omitted it is the length of
        the first row minus _nResults_ (0 for an empty data set).

      - nPts: the number of points. When omitted it is the number of rows.

      - nPossibleVals: a list containing the number of possible values
        for each variable (should contain 0 when not relevant)
        This is _nVars_ long. When omitted it is computed with
        _CalcNPossible().

      - qBounds: a list of lists containing quantization bounds for variables
        which are to be quantized (note, this class does not quantize
        the variables itself, it merely stores quantization bounds).
        An empty sublist indicates no quantization for a given variable.
        This is _nVars_ long. When omitted, one independent empty list is
        created per data column.

      - varNames: a list of the names of the variables.
        This is _nVars_ long

      - ptNames: the names (labels) of the individual data points
        This is _nPts_ long

      - nResults: the number of results columns in the data lists.  This is
        usually 1, but can be higher.
    """
    self.data = [x[:] for x in data]
    self.nResults = nResults

    # Row width; an empty data set has no first row to measure.
    hasData = len(self.data) > 0
    nCols = len(self.data[0]) if hasData else 0

    if nVars is None:
        nVars = nCols - self.nResults if hasData else 0
    self.nVars = nVars

    if nPts is None:
        nPts = len(self.data)
    self.nPts = nPts

    if qBounds is None:
        # Build a distinct list per column: [[]]*n would repeat ONE list
        # object, so changing the bounds of one column would change all.
        qBounds = [[] for _ in range(nCols)]
    self.qBounds = qBounds

    # Must come after nVars/qBounds are stored: _CalcNPossible reads them.
    if nPossibleVals is None:
        nPossibleVals = self._CalcNPossible(self.data)
    self.nPossibleVals = nPossibleVals

    if varNames is None:
        varNames = [''] * self.nVars
    self.varNames = varNames

    if ptNames is None:
        ptNames = [''] * self.nPts
    self.ptNames = ptNames
78
def _CalcNPossible(self, data):
    """calculates the number of possible values of each variable (where possible)

    **Arguments**

      - data: a list of examples to be used

    **Returns**

      a list with one entry per column (variables followed by results):

        - the number of quantization bounds plus one for columns that
          have a non-empty entry in self.qBounds,

        - the largest value seen plus one for columns holding only
          integral numbers,

        - 0 for every other column (non-numeric or non-integral values).

    """
    nCols = self.GetNVars() + self.nResults
    nPossible = [-1] * nCols
    # columns whose values still need to be scanned
    cols = list(range(nCols))
    for i, bounds in enumerate(self.qBounds):
        if len(bounds) > 0:
            # quantized column: the bounds fix the count, no scan needed
            nPossible[i] = len(bounds)
            cols.remove(i)

    for pt in data:
        # iterate over a copy, columns are dropped from cols as we go
        for col in cols[:]:
            d = pt[col]
            if type(d) in numericTypes and math.floor(d) == d:
                nPossible[col] = max(math.floor(d), nPossible[col])
            else:
                # not countable: reset and stop looking at this column
                nPossible[col] = -1
                cols.remove(col)
    return [int(x) + 1 for x in nPossible]
113
121 return self.nPossibleVals
124
126 res = [self.ptNames[idx]]+self.data[idx][:]
127 return res
129 if len(val) != self.GetNVars()+self.GetNResults()+1:
130 raise ValueError('bad value in assignment')
131 self.ptNames[idx] = val[0]
132 self.data[idx] = val[1:]
133 return val
134
136 """ returns a list of named examples
137
138 **Note**
139
140 a named example is the result of prepending the example
141 name to the data list
142
143 """
144 res = [None]*self.nPts
145 for i in xrange(self.nPts):
146 res[i] = [self.ptNames[i]]+self.data[i][:]
147 return res
148
150 """ returns a *copy* of the data
151
152 """
153 return copy.deepcopy(self.data)
165
167 """ Returns the result fields from each example
168
169 """
170 if self.GetNResults()>1:
171 v = self.GetNResults()
172 res = [x[-v:] for x in self.data]
173 else:
174 res = [x[-1] for x in self.data]
175 return res
176
181
183 self.data.append(pt[1:])
184 self.ptNames.append(pt[0])
185 self.nPts += 1
186
188 if len(pts)!=len(names):
189 raise ValueError("input length mismatch")
190 self.data += pts
191 self.ptNames += names
192 self.nPts = len(self.data)
193
195 """ a data set for holding quantized data
196
197
198 **Note**
199
200 this is intended to be a read-only data structure
201 (i.e. after calling the constructor you cannot touch it)
202
203 **Big differences to MLDataSet**
204
205 1) data are stored in a numpy array since they are homogeneous
206
207 2) results are assumed to be quantized (i.e. no qBounds entry is required)
208
209 """
def _CalcNPossible(self, data):
    """calculates the number of possible values of each variable

    **Arguments**

      - data: a list of examples to be used

    **Returns**

      a list of plain ints, one per column of _data_, each being the
      largest value found in that column plus one

    """
    # transpose so that each iteration step yields one column
    return [int(max(column)) + 1 for column in numpy.transpose(data)]
223
225 """ returns a list of named examples
226
227 **Note**
228
229 a named example is the result of prepending the example
230 name to the data list
231
232 """
233 res = [None]*self.nPts
234 for i in xrange(self.nPts):
235 res[i] = [self.ptNames[i]]+self.data[i].tolist()
236 return res
237
239 """ returns a *copy* of the data
240
241 """
242 return self.data.tolist()
254 """ Returns the result fields from each example
255
256 """
257 if self.GetNResults()>1:
258 v = self.GetNResults()
259 res = [x[-v:] for x in self.data]
260 else:
261 res = [x[-1] for x in self.data]
262 return res
263
264
def __init__(self, data, nVars=None, nPts=None, nPossibleVals=None,
             qBounds=None, varNames=None, ptNames=None, nResults=1):
    """ Constructor

    **Arguments**

      - data: a list of lists containing the data. The data are copied
        into a numpy array, so the caller's lists are not overwritten.

      - nVars: the number of variables. When omitted it is the length of
        the first row minus _nResults_ (0 for an empty data set).

      - nPts: the number of points. When omitted it is the number of rows.

      - nPossibleVals: a list containing the number of possible values
        for each variable (should contain 0 when not relevant)
        This is _nVars_ long. When omitted it is computed with
        _CalcNPossible().

      - qBounds: a list of lists containing quantization bounds for variables
        which are to be quantized (note, this class does not quantize
        the variables itself, it merely stores quantization bounds).
        An empty sublist indicates no quantization for a given variable.
        This is _nVars_ long. When omitted, one independent empty list is
        created per variable.

      - varNames: a list of the names of the variables.
        This is _nVars_ long

      - ptNames: the names (labels) of the individual data points
        This is _nPts_ long

      - nResults: the number of results columns in the data lists.  This is
        usually 1, but can be higher.
    """
    self.data = numpy.array(data)
    self.nResults = nResults

    # len() rather than truth-testing: data may itself be a numpy array
    hasData = len(data) > 0

    if nVars is None:
        # an empty data set has no first row to measure
        nVars = len(data[0]) - self.nResults if hasData else 0
    self.nVars = nVars

    if nPts is None:
        nPts = len(data)
    self.nPts = nPts

    if qBounds is None:
        # Build a distinct list per variable: [[]]*n would repeat ONE list
        # object, so changing the bounds of one variable would change all.
        qBounds = [[] for _ in range(self.nVars)]
    self.qBounds = qBounds

    if nPossibleVals is None:
        nPossibleVals = self._CalcNPossible(data)
    self.nPossibleVals = nPossibleVals

    if varNames is None:
        varNames = [''] * self.nVars
    self.varNames = varNames

    if ptNames is None:
        ptNames = [''] * self.nPts
    self.ptNames = ptNames
317
318
if __name__ == '__main__':
    import DataUtils

    def _report(dataSet):
        """ prints the contents of a data set, one accessor per line """
        print('nVars:', dataSet.GetNVars())
        print('nPts:', dataSet.GetNPts())
        print('nPoss:', dataSet.GetNPossibleVals())
        print('qBounds:', dataSet.GetQuantBounds())
        print('data:', dataSet.GetAllData())
        print('Input data:', dataSet.GetInputData())
        print('results:', dataSet.GetResults())

        print('nameddata:', dataSet.GetNamedData())

    # --- quantized data set: all-integer examples, named vars and points
    examples = [
        [0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0],
        [1, 0, 0, 0, 1],
        [2, 1, 0, 0, 1],
        [2, 2, 1, 0, 1],
    ]
    varNames = ['foo1', 'foo2', 'foo3', 'foo4', 'res']
    ptNames = ['p1', 'p2', 'p3', 'p4', 'p5']
    quantSet = MLQuantDataSet(examples, varNames=varNames, ptNames=ptNames)
    DataUtils.WritePickledData('test_data/test.qdat.pkl', quantSet)
    _report(quantSet)

    # --- general data set: mixed strings/ints/floats, only the last
    # column carries quantization bounds
    examples = [
        ['foo', 1, 1.0, 1, 1.1],
        ['foo', 2, 1.0, 1, 2.1],
        ['foo', 3, 1.2, 1.1, 3.1],
        ['foo', 4, 1.0, 1, 4.1],
        ['foo', 5, 1.1, 1, 5.1],
    ]
    qBounds = [[], [], [], [], [2, 4]]
    # NOTE: these two lists are assigned but not handed to the constructor
    varNames = ['foo1', 'foo2', 'foo3', 'foo4', 'res']
    ptNames = ['p1', 'p2', 'p3', 'p4', 'p5']
    generalSet = MLDataSet(examples, qBounds=qBounds)
    DataUtils.WritePickledData('test_data/test.dat.pkl', generalSet)
    _report(generalSet)
362