Package rdkit :: Package ML :: Module ScreenComposite
11 """ command line utility for screening composite models
12
13 **Usage**
14
15 _ScreenComposite [optional args] modelfile(s) datafile_
16
17 Unless indicated otherwise (via command line arguments), _modelfile_ is
18 a file containing a pickled composite model and _datafile_ is a QDAT file.
19
20 **Command Line Arguments**
21
22 - -t *threshold value(s)*: use high-confidence predictions for the final
23 analysis of the hold-out data. The threshold value can be either a single
24 float or a list/tuple of floats. All thresholds should be between
25 0.0 and 1.0
26
27 - -D: do a detailed screen.
28
29 - -d *database name*: instead of reading the data from a QDAT file,
30 pull it from a database. In this case, the _datafile_ argument
31 provides the name of the database table containing the data set.
32
33 - -N *note*: use all models from the database which have this note.
34 The modelfile argument should contain the name of the table
35 with the models.
36
37 - -H: screen only the hold out set (works only if a version of
38 BuildComposite more recent than 1.2.2 was used).
39
40 - -T: screen only the training set (works only if a version of
41 BuildComposite more recent than 1.2.2 was used).
42
43 - -E: do a detailed Error analysis. This shows each misclassified
44 point and the number of times it was missed across all screened
45 composites. If the --enrich argument is also provided, only compounds
46 that have true activity value equal to the enrichment value will be
47 used.
48
49 - --enrich *enrichVal*: target "active" value to be used in calculating
50 enrichments.
51
52 - -A: show All predictions.
53
54 - -S: shuffle activity values before screening
55
56 - -R: randomize activity values before screening
57
58 - -F *filter frac*: filters the data before training to change the
59 distribution of activity values in the training set. *filter frac*
60 is the fraction of the training set that should have the target value.
61 **See note in BuildComposite help about data filtering**
62
63 - -v *filter value*: filters the data before training to change the
64 distribution of activity values in the training set. *filter value*
65 is the target value to use in filtering.
66 **See note in BuildComposite help about data filtering**
67
68 - -V: be verbose when screening multiple models
69
70 - -h: show this message and exit
71
72 - --OOB: do an "out-of-bag" generalization error estimate. This only
73 makes sense when applied to the original data set.
74
75 - --pickleCol *colId*: index of the column containing a pickled value
76 (used primarily for cases where fingerprints are used as descriptors)
77
78 *** Options for making Prediction (Hanneke) Plots ***
79
80 - --predPlot=<fileName>: triggers the generation of a Hanneke plot and
81 sets the name of the .txt file which will hold the output data.
82 A Gnuplot control file, <fileName>.gnu, will also be generated.
83
84 - --predActTable=<name> (optional): name of the database table
85 containing activity values. If this is not provided, activities
86 will be read from the same table containing the screening data
87
88 - --predActCol=<name> (optional): name of the activity column. If not
89 provided, the name of the last column in the activity table will
90 be used.
91
92 - --predLogScale (optional): If provided, the x axis of the
93 prediction plot (the activity axis) will be plotted using a log
94 scale
95
96 - --predShow: launch a gnuplot instance and display the prediction
97 plot (the plot will still be written to disk).
98
99 *** The following options are likely obsolete ***
100
101 - -P: read pickled data. The datafile argument should contain
102 a pickled data set. *relevant only to qdat files*
103
104 - -q: data are not quantized (the composite should take care of
105 quantization itself if it requires quantized data). *relevant only to
106 qdat files*
107
108
109
110 """
111 from __future__ import print_function
112 import sys, copy
113 import numpy
114 from rdkit.six.moves import cPickle, input
115 from rdkit import RDConfig
116 from rdkit import DataStructs
117
118 try:
119 from PIL import Image,ImageDraw
120 except ImportError:
121 hasPil=0
122 else:
123 hasPil=1
124
125 from rdkit.ML.Data import DataUtils,SplitData
126 from rdkit.ML import CompositeRun
127 from rdkit.Dbase.DbConnection import DbConnect
128 from rdkit.Dbase import DbModule
129 _details = CompositeRun.CompositeRun()
130
131 __VERSION_STRING="3.3.0"
132
134 """ emits messages to _sys.stdout_
135 override this in modules which import this one to redirect output
136
137 **Arguments**
138
139 - msg: the string to be displayed
140
141 """
142 if noRet:
143 sys.stdout.write('%s '%(msg))
144 else:
145 sys.stdout.write('%s\n'%(msg))
147 """ emits messages to _sys.stderr_
148 override this in modules which import this one to redirect output
149
150 **Arguments**
151
152 - msg: the string to be displayed
153
154 """
155 sys.stderr.write('ERROR: %s\n'%(msg))
156
157 def CalcEnrichment(mat, tgt=0):
158 if tgt<0 or tgt>=mat.shape[0]: return 0
159 nPts = float(sum(sum(mat)))
160 nTgtPred = float(sum(mat[:,tgt]))
161 if nTgtPred:
162 pctCorrect = mat[tgt,tgt]/nTgtPred
163 nTgtReal = float(sum(mat[tgt,:]))
164 pctOverall = nTgtReal/nPts
165 else:
166 return 0.0
167 return pctCorrect/pctOverall
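# Illustrative (made-up) numbers for the enrichment calculation above: for the
# 2x2 vote table
#
#   mat = numpy.array([[40, 10],
#                      [ 5, 45]])
#
# with tgt=1: nTgtPred = 10+45 = 55, pctCorrect = 45/55 ~ 0.82,
# nTgtReal = 5+45 = 50, pctOverall = 50/100 = 0.50, so the function
# returns an enrichment of roughly 0.82/0.50 ~ 1.6.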
168
169
170 def CollectResults(indices,dataSet,composite,callback=None,appendExamples=0,
171 errorEstimate=0):
172 """ screens a set of examples through a composite and returns the
173 results
174 #DOC
175
176 **Arguments**
177
178 - examples: the examples to be screened (a sequence of sequences);
179 it's assumed that the last element in each example is its "value"
180
181 - composite: the composite model to be used
182
183 - callback: (optional) if provided, this should be a function
184 taking a single argument that is called after each example is
185 screened with the number of examples screened so far as the
186 argument.
187
188 - appendExamples: (optional) this value is passed on to the
189 composite's _ClassifyExample()_ method.
190
191 - errorEstimate: (optional) calculate the "out of bag" error
192 estimate for the composite using Breiman's definition. This
193 only makes sense when screening the original data set!
194 [L. Breiman "Out-of-bag Estimation", UC Berkeley Dept of
195 Statistics Technical Report (1996)]
196
197 **Returns**
198
199 a list of 3-tuples _nExamples_ long:
200
201 1) answer: the value from the example
202
203 2) pred: the composite model's prediction
204
205 3) conf: the confidence of the composite
206
207 """
208
209
210
211 for j in range(len(composite)):
212 tmp = composite.GetModel(j)
213 if hasattr(tmp,'_trainIndices') and type(tmp._trainIndices)!=dict:
214 tis = {}
215 if hasattr(tmp,'_trainIndices'):
216 for v in tmp._trainIndices: tis[v]=1
217 tmp._trainIndices=tis
218
219
220 nPts = len(indices)
221 res = [None]*nPts
222 for i in range(nPts):
223 idx = indices[i]
224 example = dataSet[idx]
225 if errorEstimate:
226 use = []
227 for j in range(len(composite)):
228 mdl = composite.GetModel(j)
229 if not mdl._trainIndices.get(idx,0):
230 use.append(j)
231 else:
232 use = None
233
234 pred,conf = composite.ClassifyExample(example,appendExample=appendExamples,
235 onlyModels=use)
236 if composite.GetActivityQuantBounds():
237 answer = composite.QuantizeActivity(example)[-1]
238 else:
239 answer = example[-1]
240 res[i] = answer,pred,conf
241 if callback: callback(i)
242 return res
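# Hedged usage sketch for CollectResults() (the file names are hypothetical;
# SetInputOrder() is called here the same way the __main__ block below does):
#
#   composite = cPickle.load(open('model.pkl', 'rb'))
#   data = DataUtils.BuildDataSet('data.qdat')
#   composite.SetInputOrder(data.GetVarNames())
#   res = CollectResults(range(data.GetNPts()), data, composite)
#   nCorrect = sum(1 for answer, pred, conf in res if pred == answer)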
243
244 def DetailedScreen(indices,data,composite,threshold=0,screenResults=None,
245 goodVotes=None,badVotes=None,noVotes=None,callback=None,
246 appendExamples=0,errorEstimate=0):
247 """ screens a set of examples cross a composite and breaks the
248 predictions into *correct*,*incorrect* and *unclassified* sets.
249 #DOC
250 **Arguments**
251
252 - examples: the examples to be screened (a sequence of sequences)
253 it's assumed that the last element in each example is its "value"
254
255 - composite: the composite model to be used
256
257 - threshold: (optional) the threshold to be used to decide whether
258 or not a given prediction should be kept
259
260 - screenResults: (optional) the results of screening the examples
261 (a sequence of 3-tuples in the format returned by
262 _CollectResults()_). If this is provided, the examples will not
263 be screened again.
264
265 - goodVotes,badVotes,noVotes: (optional) if provided these should
266 be lists (or anything supporting an _append()_ method) which
267 will be used to pass the screening results back.
268
269 - callback: (optional) if provided, this should be a function
270 taking a single argument that is called after each example is
271 screened with the number of examples screened so far as the
272 argument.
273
274 - appendExamples: (optional) this value is passed on to the
275 composite's _ClassifyExample()_ method.
276
277 - errorEstimate: (optional) calculate the "out of bag" error
278 estimate for the composite using Breiman's definition. This
279 only makes sense when screening the original data set!
280 [L. Breiman "Out-of-bag Estimation", UC Berkeley Dept of
281 Statistics Technical Report (1996)]
282
283 **Notes**
284
285 - since this function doesn't return anything, if one or more of
286 the arguments _goodVotes_, _badVotes_, and _noVotes_ is not
287 provided, there's not much reason to call it
288
289 """
290 if screenResults is None:
291 screenResults = CollectResults(indices,data,composite,callback=callback,
292 appendExamples=appendExamples,
293 errorEstimate=errorEstimate)
294 if goodVotes is None: goodVotes = []
295 if badVotes is None: badVotes = []
296 if noVotes is None: noVotes = []
297 for i in range(len(screenResults)):
298 answer,pred,conf = screenResults[i]
299 if conf > threshold:
300 if pred != answer:
301 badVotes.append((answer,pred,conf,i))
302 else:
303 goodVotes.append((answer,pred,conf,i))
304 else:
305 noVotes.append((answer,pred,conf,i))
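# Hedged usage sketch for DetailedScreen(), reusing the hypothetical
# `composite` and `data` objects from the sketch above: split the predictions
# at a 0.6 confidence threshold.
#
#   goodVotes, badVotes, noVotes = [], [], []
#   DetailedScreen(range(data.GetNPts()), data, composite, threshold=0.6,
#                  goodVotes=goodVotes, badVotes=badVotes, noVotes=noVotes)
#   print('%d correct, %d incorrect, %d below threshold' %
#         (len(goodVotes), len(badVotes), len(noVotes)))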
306
307 def ShowVoteResults(indices,data,composite,nResultCodes,threshold,verbose=1,
308 screenResults=None,callback=None,appendExamples=0,
309 goodVotes=None,badVotes=None,noVotes=None,
310 errorEstimate=0):
311 """ screens the results and shows a detailed workup
312
313 The work of doing the screening and processing the results is
314 handled by _DetailedScreen()_
315 #DOC
316
317 **Arguments**
318
319 - examples: the examples to be screened (a sequence of sequences)
320 it's assumed that the last element in each example is its "value"
321
322 - composite: the composite model to be used
323
324 - nResultCodes: the number of possible results the composite can
325 return
326
327 - threshold: the threshold to be used to decide whether or not a
328 given prediction should be kept
329
330 - screenResults: (optional) the results of screening the examples
331 (a sequence of 3-tuples in the format returned by
332 _CollectResults()_). If this is provided, the examples will not
333 be screened again.
334
335 - callback: (optional) if provided, this should be a function
336 taking a single argument that is called after each example is
337 screened with the number of examples screened so far as the
338 argument.
339
340 - appendExamples: (optional) this value is passed on to the
341 composite's _ClassifyExample()_ method.
342
343 - goodVotes,badVotes,noVotes: (optional) if provided these should
344 be lists (or anything supporting an _append()_ method) which
345 will be used to pass the screening results back.
346
347 - errorEstimate: (optional) calculate the "out of bag" error
348 estimate for the composite using Breiman's definition. This
349 only makes sense when screening the original data set!
350 [L. Breiman "Out-of-bag Estimation", UC Berkeley Dept of
351 Statistics Technical Report (1996)]
352
353 **Returns**
354
355 a 7-tuple:
356
357 1) the number of good (correct) predictions
358
359 2) the number of bad (incorrect) predictions
360
361 3) the number of predictions skipped due to the _threshold_
362
363 4) the average confidence in the good predictions
364
365 5) the average confidence in the bad predictions
366
367 6) the average confidence in the skipped predictions
368
369 7) the results table
370
371 """
372 nExamples = len(indices)
373 if goodVotes is None:
374 goodVotes = []
375 if badVotes is None:
376 badVotes = []
377 if noVotes is None:
378 noVotes = []
379 DetailedScreen(indices,data,composite,threshold,screenResults=screenResults,
380 goodVotes=goodVotes,badVotes=badVotes,noVotes=noVotes,callback=callback,
381 appendExamples=appendExamples,errorEstimate=errorEstimate)
382 nBad = len(badVotes)
383 nGood = len(goodVotes)
384 nClassified = nGood + nBad
385 if verbose:
386 print('\n\t*** Vote Results ***')
387 print('misclassified: %d/%d (%%%4.2f)\t%d/%d (%%%4.2f)' %
388 (nBad,nExamples,
389 100.*float(nBad)/nExamples,
390 nBad,nClassified,
391 100.*float(nBad)/nClassified))
392 nSkip = len(noVotes)
393 if nSkip > 0:
394 if verbose:
395 print('skipped: %d/%d (%%% 4.2f)'%(nSkip,nExamples,100.*float(nSkip)/nExamples))
396 noConf = numpy.array([x[2] for x in noVotes])
397 avgSkip = sum(noConf)/float(nSkip)
398 else:
399 avgSkip = 0.
400
401 if nBad > 0:
402 badConf = numpy.array([x[2] for x in badVotes])
403 avgBad = sum(badConf)/float(nBad)
404 else:
405 avgBad = 0.
406
407 if nGood > 0:
408 goodRes = [x[1] for x in goodVotes]
409 goodConf = numpy.array([x[2] for x in goodVotes])
410 avgGood = sum(goodConf)/float(nGood)
411 else:
412 goodRes = []
413 goodConf = []
414 avgGood = 0.
415
416 if verbose:
417 print()
418 print('average correct confidence: % 6.4f'%avgGood)
419 print('average incorrect confidence: % 6.4f'%avgBad)
420
421 voteTab = numpy.zeros((nResultCodes,nResultCodes),numpy.int)
422 for res in goodRes:
423 voteTab[res,res] += 1
424 for ans,res,conf,idx in badVotes:
425 voteTab[ans,res] += 1
426
427 if verbose:
428 print()
429 print('\tResults Table:')
430 vTab=voteTab.transpose()
431 colCounts = numpy.sum(vTab,0)
432 rowCounts = numpy.sum(vTab,1)
433 message('')
434 for i in range(nResultCodes):
435 if rowCounts[i]==0: rowCounts[i]=1
436 row = vTab[i]
437 message(' ',noRet=1)
438 for j in range(nResultCodes):
439 entry = row[j]
440 message(' % 6d'%entry,noRet=1)
441 message(' | % 4.2f'%(100.*vTab[i,i]/rowCounts[i]))
442 message(' ',noRet=1)
443 for i in range(nResultCodes):
444 message('-------',noRet=1)
445 message('')
446 message(' ',noRet=1)
447 for i in range(nResultCodes):
448 if colCounts[i]==0: colCounts[i]=1
449 message(' % 6.2f'%(100.*vTab[i,i]/colCounts[i]),noRet=1)
450 message('')
451
452
453
454 return nGood,nBad,nSkip,avgGood,avgBad,avgSkip,voteTab
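# Hedged sketch of calling ShowVoteResults() directly (hypothetical objects as
# above; nResultCodes is derived from the composite's quantization bounds, the
# same way the __main__ block below does):
#
#   nRes = composite.GetQuantBounds()[1][-1]
#   nGood, nBad, nSkip, avgGood, avgBad, avgSkip, table = ShowVoteResults(
#       range(data.GetNPts()), data, composite, nRes, threshold=0.0)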
455
456 def ScreenIt(composite,indices,data,partialVote=0,voteTol=0.0,verbose=1,screenResults=None,
457 goodVotes=None,badVotes=None,noVotes=None):
458 """ screens a set of data using a composite model and prints out
459 statistics about the screen.
460 #DOC
461 The work of doing the screening and processing the results is
462 handled by _DetailedScreen()_
463
464 **Arguments**
465
466 - composite: the composite model to be used
467
468 - data: the examples to be screened (a sequence of sequences)
469 it's assumed that the last element in each example is its "value"
470
471 - partialVote: (optional) toggles use of the threshold value in
472 the screening.
473
474 - voteTol: (optional) the threshold to be used to decide whether or not a
475 given prediction should be kept
476
477 - verbose: (optional) sets degree of verbosity of the screening
478
479 - screenResults: (optional) the results of screening the examples
480 (a sequence of 3-tuples in the format returned by
481 _CollectResults()_). If this is provided, the examples will not
482 be screened again.
483
484 - goodVotes,badVotes,noVotes: (optional) if provided these should
485 be lists (or anything supporting an _append()_ method) which
486 will be used to pass the screening results back.
487
488
489 **Returns**
490
491 a 7-tuple:
492
493 1) the number of good (correct) predictions
494
495 2) the number of bad (incorrect) predictions
496
497 3) the number of predictions skipped due to the _threshold_
498
499 4) the average confidence in the good predictions
500
501 5) the average confidence in the bad predictions
502
503 6) the average confidence in the skipped predictions
504
505 7) None
506
507 """
508 if goodVotes is None:
509 goodVotes = []
510 if badVotes is None:
511 badVotes = []
512 if noVotes is None:
513 noVotes = []
514
515 if not partialVote:
516 voteTol = 0.0
517
518 DetailedScreen(indices,data,composite,voteTol,screenResults=screenResults,
519 goodVotes=goodVotes,badVotes=badVotes,noVotes=noVotes)
520
521 nGood = len(goodVotes)
522 goodAccum = 0.
523 for res,pred,conf,idx in goodVotes:
524 goodAccum += conf
525
526 misCount = len(badVotes)
527 badAccum = 0.
528 for res,pred,conf,idx in badVotes:
529 badAccum += conf
530
531 nSkipped = len(noVotes)
532 goodSkipped = 0
533 badSkipped = 0
534 skipAccum = 0.
535 for ans,pred,conf,idx in noVotes:
536 skipAccum += conf
537 if ans != pred:
538 badSkipped += 1
539 else:
540 goodSkipped += 1
541
542 nData = nGood + misCount + nSkipped
543 if verbose:
544 print('Total N Points:',nData)
545 if partialVote:
546 nCounted = nData-nSkipped
547 if verbose:
548 print('Misclassifications: %d (%%%4.2f)'%(misCount,100.*float(misCount)/nCounted))
549 print('N Skipped: %d (%%%4.2f)'%(nSkipped,100.*float(nSkipped)/nData))
550 print('\tGood Votes Skipped: %d (%%%4.2f)'%(goodSkipped,100.*float(goodSkipped)/nSkipped))
551 print('\tBad Votes Skipped: %d (%%%4.2f)'%(badSkipped,100.*float(badSkipped)/nSkipped))
552 else:
553 if verbose:
554 print('Misclassifications: %d (%%%4.2f)'%(misCount,100.*float(misCount)/nData))
555 print('Average Correct Vote Confidence: % 6.4f'%(goodAccum/(nData-misCount)))
556 print('Average InCorrect Vote Confidence: % 6.4f'%(badAccum/misCount))
557
558 avgGood=0
559 avgBad=0
560 avgSkip=0
561 if nGood:
562 avgGood = goodAccum/nGood
563 if misCount:
564 avgBad = badAccum/misCount
565 if nSkipped:
566 avgSkip = skipAccum/nSkipped
567 return nGood,misCount,nSkipped,avgGood,avgBad,avgSkip,None
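# Hedged sketch for ScreenIt() (hypothetical objects as above): a partial-vote
# screen that only counts predictions whose confidence exceeds 0.7.
#
#   nGood, nBad, nSkip, avgGood, avgBad, avgSkip, _ = ScreenIt(
#       composite, range(data.GetNPts()), data, partialVote=1, voteTol=0.7)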
568
570 """ *Internal Use Only*
571
572 converts a list of 4 tuples: (answer,prediction,confidence,idx) into
573 an alternate list: (answer,prediction,confidence,data point)
574
575 **Arguments**
576
577 - votes: a list of 4 tuples: (answer, prediction, confidence,
578 index)
579
580 - data: a _DataUtils.MLData.MLDataSet_
581
582
583 **Note**: alterations are done in place in the _votes_ list
584
585 """
586 for i in range(len(votes)):
587 ans,pred,conf,idx = votes[i]
588 votes[i] = (ans,pred,conf,data[idx])
589
590 def PrepareDataFromDetails(model, details, data, verbose=0):
591 if (hasattr(details,'doHoldout') and details.doHoldout) or \
592 (hasattr(details,'doTraining') and details.doTraining):
593 try:
594 splitF = model._splitFrac
595 except AttributeError:
596 pass
597 else:
598 if verbose:
599 message('s',noRet=1)
600
601 if hasattr(details,'errorEstimate') and details.errorEstimate and \
602 hasattr(details,'doHoldout') and details.doHoldout:
603 message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
604 message('****** WARNING: OOB screening should not be combined with doHoldout option.')
605 message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
606 trainIdx,testIdx = SplitData.SplitIndices(data.GetNPts(),splitF,silent=1)
607
608 if hasattr(details,'filterFrac') and details.filterFrac != 0.0:
609 if verbose:
610 message('f',noRet=1)
611 trainFilt,temp = DataUtils.FilterData(data,details.filterVal,
612 details.filterFrac,-1,
613 indicesToUse=trainIdx,
614 indicesOnly=1)
615 testIdx += temp
616 trainIdx = trainFilt
617 elif hasattr(details,'errorEstimate') and details.errorEstimate:
618
619
620 if hasattr(details,'filterFrac') and details.filterFrac != 0.0:
621 if verbose:
622 message('f',noRet=1)
623 testIdx,trainIdx = DataUtils.FilterData(data,details.filterVal,
624 details.filterFrac,-1,
625 indicesToUse=range(data.GetNPts()),
626 indicesOnly=1)
627 testIdx.extend(trainIdx)
628 else:
629 testIdx = range(data.GetNPts())
630 trainIdx = []
631 else:
632 testIdx = range(data.GetNPts())
633 trainIdx = []
634 if hasattr(details,'doTraining') and details.doTraining:
635 testIdx,trainIdx = trainIdx,testIdx
636 return trainIdx,testIdx
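# Hedged sketch of driving PrepareDataFromDetails() (the attribute settings
# mirror the -H command-line flag handled in ParseArgs() below; the model is
# assumed to carry a _splitFrac, as composites from recent BuildComposite
# versions do):
#
#   details = CompositeRun.CompositeRun()
#   details.doHoldout = 1   # screen only the hold-out points
#   details.doTraining = 0
#   trainIdx, testIdx = PrepareDataFromDetails(model, details, data)
#   # with doTraining=1 instead, the two index lists are swapped on return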
637
638 def ScreenFromDetails(models,details,callback=None,setup=None,appendExamples=0,
639 goodVotes=None,badVotes=None,noVotes=None,data=None,
640 enrichments=None):
641 """ Screens a set of data using a a _CompositeRun.CompositeRun_
642 instance to provide parameters
643
644 # DOC
645
646 The actual data to be used are extracted from the database and
647 table specified in _details_
648
649 Aside from dataset construction, _ShowVoteResults()_ does most of
650 the heavy lifting here.
651
652 **Arguments**
653
654 - models: a composite model or a sequence of composite models
655
656 - details: a _CompositeRun.CompositeRun_ object containing details
657 (options, parameters, etc.) about the run
658
659 - callback: (optional) if provided, this should be a function
660 taking a single argument that is called after each example is
661 screened with the number of examples screened so far as the
662 argument.
663
664 - setup: (optional) a function taking a single argument which is
665 called at the start of screening with the number of points to
666 be screened as the argument.
667
668 - appendExamples: (optional) this value is passed on to the
669 composite's _ClassifyExample()_ method.
670
671 - goodVotes,badVotes,noVotes: (optional) if provided these should
672 be lists (or anything supporting an _append()_ method) which
673 will be used to pass the screening results back.
674
675
676 **Returns**
677
678 a 7-tuple:
679
680 1) the number of good (correct) predictions
681
682 2) the number of bad (incorrect) predictions
683
684 3) the number of predictions skipped due to the _threshold_
685
686 4) the average confidence in the good predictions
687
688 5) the average confidence in the bad predictions
689
690 6) the average confidence in the skipped predictions
691
692 7) the results table
693
694 """
695 if data is None:
696 if hasattr(details,'pickleCol'):
697 data = details.GetDataSet(pickleCol=details.pickleCol,
698 pickleClass=DataStructs.ExplicitBitVect)
699 else:
700 data = details.GetDataSet()
701 if details.threshold>0.0:
702 partialVote = 1
703 else:
704 partialVote = 0
705
706 if type(models) not in [list, tuple]:
707 models = (models,)
708
709 nModels = len(models)
710
711 if setup is not None:
712 setup(nModels*data.GetNPts())
713
714 nGood = numpy.zeros(nModels,numpy.float)
715 nBad = numpy.zeros(nModels,numpy.float)
716 nSkip = numpy.zeros(nModels,numpy.float)
717 confGood = numpy.zeros(nModels,numpy.float)
718 confBad = numpy.zeros(nModels,numpy.float)
719 confSkip = numpy.zeros(nModels,numpy.float)
720 voteTab = None
721 if goodVotes is None:
722 goodVotes = []
723 if badVotes is None:
724 badVotes = []
725 if noVotes is None:
726 noVotes = []
727 if enrichments is None:
728 enrichments = [0.0]*nModels
729 badVoteDict = {}
730 noVoteDict = {}
731
732 for i in range(nModels):
733 if nModels>1:
734 goodVotes = []
735 badVotes=[]
736 noVotes=[]
737 model = models[i]
738
739 try:
740 seed = model._randomSeed
741 except AttributeError:
742 pass
743 else:
744 DataUtils.InitRandomNumbers(seed)
745
746 if (hasattr(details,'shuffleActivities') and details.shuffleActivities) or \
747 (hasattr(details,'randomActivities') and details.randomActivities ):
748 if hasattr(details,'shuffleActivities') and details.shuffleActivities:
749 shuffle = True
750 else:
751 shuffle = False
752 randomize=True
753 DataUtils.RandomizeActivities(data,shuffle=shuffle,
754 runDetails=details)
755 else:
756 randomize=False
757 shuffle=False
758
759 if hasattr(model,'_shuffleActivities') and \
760 model._shuffleActivities and \
761 not shuffle:
762 message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
763 message('****** WARNING: Shuffled model being screened with unshuffled data.')
764 message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
765 if hasattr(model,'_randomizeActivities') and \
766 model._randomizeActivities and \
767 not randomize:
768 message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
769 message('****** WARNING: Random model being screened with non-random data.')
770 message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
771
772 trainIdx,testIdx = PrepareDataFromDetails(model,details,data)
773
774 nPossible = model.GetQuantBounds()[1]
775 if callback:
776 cb = lambda x,y=callback,z=i*data.GetNPts():y(x+z)
777 else:
778 cb = None
779 if not hasattr(details,'errorEstimate') or not details.errorEstimate:
780 errorEstimate = 0
781 else:
782 errorEstimate = 1
783 g,b,s,aG,aB,aS,vT = ShowVoteResults(testIdx,data,model,nPossible[-1],
784 details.threshold,verbose=0,
785 callback=cb,appendExamples=appendExamples,
786 goodVotes=goodVotes,badVotes=badVotes,
787 noVotes=noVotes,
788 errorEstimate=errorEstimate)
789 if voteTab is None:
790 voteTab = numpy.zeros(vT.shape,numpy.float)
791 if hasattr(details,'errorAnalysis') and details.errorAnalysis:
792 for a,p,c,idx in badVotes:
793 label = testIdx[idx]
794 if hasattr(details,'enrichTgt') and details.enrichTgt >=0:
795 if a==details.enrichTgt:
796 badVoteDict[label] = badVoteDict.get(label,0)+1
797 else:
798 badVoteDict[label] = badVoteDict.get(label,0)+1
799 for a,p,c,idx in noVotes:
800 label = testIdx[idx]
801 if hasattr(details,'enrichTgt') and details.enrichTgt >=0:
802 if a==details.enrichTgt:
803 noVoteDict[label] = noVoteDict.get(label,0)+1
804 else:
805 noVoteDict[label] = noVoteDict.get(label,0)+1
806
807 voteTab += vT
808 nGood[i] = g
809 nBad[i] = b
810 nSkip[i] = s
811 confGood[i] = aG
812 confBad[i] = aB
813 confSkip[i] = aS
814
815 if hasattr(details,'enrichTgt') and details.enrichTgt >=0:
816 enrichments[i] = CalcEnrichment(vT,tgt=details.enrichTgt)
817
818 if nModels == 1:
819 return g,b,s,aG,aB,aS,vT
820 else:
821 voteTab /= nModels
822
823 avgNBad = sum(nBad)/nModels
824 devNBad = numpy.sqrt(sum((nBad-avgNBad)**2)/(nModels-1))
825
826 bestIdx = numpy.argsort(nBad)[0]
827
828 avgNGood = sum(nGood)/nModels
829 devNGood = numpy.sqrt(sum((nGood-avgNGood)**2)/(nModels-1))
830
831 avgNSkip = sum(nSkip)/nModels
832 devNSkip = numpy.sqrt(sum((nSkip-avgNSkip)**2)/(nModels-1))
833
834 avgConfBad = sum(confBad)/nModels
835 devConfBad = numpy.sqrt(sum((confBad-avgConfBad)**2)/(nModels-1))
836
837 avgConfGood = sum(confGood)/nModels
838 devConfGood = numpy.sqrt(sum((confGood-avgConfGood)**2)/(nModels-1))
839
840 avgConfSkip = sum(confSkip)/nModels
841 devConfSkip = numpy.sqrt(sum((confSkip-avgConfSkip)**2)/(nModels-1))
842 return (avgNGood,devNGood),(avgNBad,devNBad),(avgNSkip,devNSkip),\
843 (avgConfGood,devConfGood),(avgConfBad,devConfBad),(avgConfSkip,devConfSkip),\
844 voteTab
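# Hedged usage sketch for ScreenFromDetails() (the database and table names are
# hypothetical; the __main__ block below builds the same kind of `details`
# object via SetDefaults() and ParseArgs()):
#
#   details = CompositeRun.CompositeRun()
#   details.dbName = 'compounds.db'      # assumed database holding the screen table
#   details.tableName = 'screen_data'    # assumed table of descriptors
#   details.dbUser = ''                  # connection details as appropriate
#   details.dbPassword = ''
#   details.threshold = 0.5
#   nGood, nBad, nSkip, avgGood, avgBad, avgSkip, table = \
#       ScreenFromDetails(composite, details)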
845
846
847 def GetScreenImage(nGood, nBad, nRej, size=None):
848 if not hasPil:
849 return None
850 try:
851 nTot = float(nGood)+float(nBad)+float(nRej)
852 except TypeError:
853 nGood = nGood[0]
854 nBad = nBad[0]
855 nRej = nRej[0]
856 nTot = float(nGood)+float(nBad)+float(nRej)
857
858 if not nTot:
859 return None
860 goodColor = (100,100,255)
861 badColor = (255,100,100)
862 rejColor = (255,255,100)
863
864 pctGood = float(nGood) / nTot
865 pctBad = float(nBad) / nTot
866 pctRej = float(nRej) / nTot
867
868 if size is None:
869 size = (100,100)
870 img = Image.new('RGB',size,(255,255,255))
871 draw = ImageDraw.Draw(img)
872 box = (0,0,size[0]-1,size[1]-1)
873
874 startP = -90
875 endP = int(startP + pctGood*360)
876 draw.pieslice(box,startP,endP,fill=goodColor)
877 startP = endP
878 endP = int(startP + pctBad*360)
879 draw.pieslice(box,startP,endP,fill=badColor)
880 startP = endP
881 endP = int(startP + pctRej*360)
882 draw.pieslice(box,startP,endP,fill=rejColor)
883
884 return img
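# Hedged sketch (requires PIL/Pillow; the counts are illustrative): build the
# good/bad/rejected pie chart and write it to disk, much as ScreenToHtml()
# below does for its 'votes.png'.
#
#   img = GetScreenImage(80, 15, 5, size=(200, 200))
#   if img is not None:
#     img.save('votes.png')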
885
886
887 def ScreenToHtml(nGood,nBad,nRej,avgGood,avgBad,avgSkip,voteTable,imgDir='.',
888 fullPage=1,skipImg=0,includeDefs=1):
889 """ returns the text of a web page showing the screening details
890 #DOC
891 **Arguments**
892
893 - nGood: number of correct predictions
894
895 - nBad: number of incorrect predictions
896
897 - nRej: number of rejected predictions
898
899 - avgGood: average correct confidence
900
901 - avgBad: average incorrect confidence
902
903 - avgSkip: average rejected confidence
904
905 - voteTable: vote table
906
907 - imgDir: (optional) the directory to be used to hold the vote
908 image (if constructed)
909
910 **Returns**
911
912 a string containing HTML
913
914 """
915 if type(nGood) == tuple:
916 multModels=1
917 else:
918 multModels=0
919
920 if fullPage:
921 outTxt = ["""<html><body>"""]
922 outTxt.append('<center><h2>VOTE DETAILS</h2></center>')
923 else:
924 outTxt = []
925
926 outTxt.append('<font>')
927
928
929 if not skipImg:
930 img = GetScreenImage(nGood,nBad,nRej)
931 if img:
932 if imgDir:
933 imgFileName = '/'.join((imgDir,'votes.png'))
934 else:
935 imgFileName = 'votes.png'
936 img.save(imgFileName)
937 outTxt.append('<center><img src="%s"></center>'%(imgFileName))
938
939 nPoss = len(voteTable)
940 pureCounts = numpy.sum(voteTable,1)
941 accCounts = numpy.sum(voteTable,0)
942 pureVect = numpy.zeros(nPoss,numpy.float)
943 accVect = numpy.zeros(nPoss,numpy.float)
944 for i in range(nPoss):
945 if pureCounts[i]:
946 pureVect[i] = float(voteTable[i,i])/pureCounts[i]
947 if accCounts[i]:
948 accVect[i] = float(voteTable[i,i])/accCounts[i]
949
950 outTxt.append('<center><table border=1>')
951 outTxt.append('<tr><td></td>')
952 for i in range(nPoss):
953 outTxt.append('<th>%d</th>'%i)
954 outTxt.append('<th>% Accurate</th>')
955 outTxt.append('</tr>')
956
957 for i in range(nPoss):
958 outTxt.append('<tr><th>%d</th>'%(i))
959 for j in range(nPoss):
960 if i == j:
961 if not multModels:
962 outTxt.append('<td bgcolor="#A0A0FF">%d</td>'%(voteTable[j,i]))
963 else:
964 outTxt.append('<td bgcolor="#A0A0FF">%.2f</td>'%(voteTable[j,i]))
965 else:
966 if not multModels:
967 outTxt.append('<td>%d</td>'%(voteTable[j,i]))
968 else:
969 outTxt.append('<td>%.2f</td>'%(voteTable[j,i]))
970 outTxt.append('<td>%4.2f</td></tr>'%(100.0*accVect[i]))
971 if i == 0:
972 outTxt.append('<th rowspan=%d>Predicted</th></tr>'%(nPoss))
973 else:
974 outTxt.append('</tr>')
975 outTxt.append('<tr><th>% Pure</th>')
976 for i in range(nPoss):
977 outTxt.append('<td>%4.2f</td>'%(100.0*pureVect[i]))
978 outTxt.append('</tr>')
979 outTxt.append('<tr><td></td><th colspan=%d>Original</th>'%(nPoss))
980 outTxt.append('</table></center>')
981
982
983 if not multModels:
984 nTotal = nBad+nGood+nRej
985 nClass = nBad+nGood
986 if nClass:
987 pctErr = 100.*float(nBad)/nClass
988 else:
989 pctErr = 0.0
990
991 outTxt.append('<p>%d of %d examples were misclassified (%%%4.2f)'%(nBad,nGood+nBad,pctErr))
992 if nRej > 0:
993 pctErr = 100.*float(nBad)/(nGood+nBad+nRej)
994 outTxt.append('<p> %d of %d overall: (%%%4.2f)'%(nBad,nTotal,pctErr))
995 pctRej = 100.*float(nRej)/nTotal
996 outTxt.append('<p>%d of %d examples were rejected (%%%4.2f)'%(nRej,nTotal,pctRej))
997 if nGood != 0:
998 outTxt.append('<p>The correctly classified examples had an average confidence of %6.4f'%avgGood)
999
1000 if nBad != 0:
1001 outTxt.append('<p>The incorrectly classified examples had an average confidence of %6.4f'%avgBad)
1002 if nRej != 0:
1003 outTxt.append('<p>The rejected examples had an average confidence of %6.4f'%avgSkip)
1004 else:
1005 nTotal = nBad[0]+nGood[0]+nRej[0]
1006 nClass = nBad[0]+nGood[0]
1007 devClass = nBad[1]+nGood[1]
1008 if nClass:
1009 pctErr = 100.*float(nBad[0])/nClass
1010 devPctErr = 100.*float(nBad[1])/nClass
1011 else:
1012 pctErr = 0.0
1013 devPctErr = 0.0
1014
1015 outTxt.append('<p>%.2f(%.2f) of %.2f(%.2f) examples were misclassified (%%%4.2f(%4.2f))'%\
1016 (nBad[0],nBad[1],nClass,devClass,pctErr,devPctErr))
1017 if nRej[0] > 0:
1018 pctErr = 100.*float(nBad[0])/nTotal
1019 devPctErr = 100.*float(nBad[1])/nTotal
1020 outTxt.append('<p> %.2f(%.2f) of %d overall: (%%%4.2f(%4.2f))'%\
1021 (nBad[0],nBad[1],nTotal,pctErr,devPctErr))
1022 pctRej = 100.*float(nRej[0])/nTotal
1023 devPctRej = 100.*float(nRej[1])/nTotal
1024 outTxt.append('<p>%.2f(%.2f) of %d examples were rejected (%%%4.2f(%4.2f))'%\
1025 (nRej[0],nRej[1],nTotal,pctRej,devPctRej))
1026 if nGood != 0:
1027 outTxt.append('<p>The correctly classified examples had an average confidence of %6.4f(%.4f)'%avgGood)
1028
1029 if nBad != 0:
1030 outTxt.append('<p>The incorrectly classified examples had an average confidence of %6.4f(%.4f)'%avgBad)
1031 if nRej != 0:
1032 outTxt.append('<p>The rejected examples had an average confidence of %6.4f(%.4f)'%avgSkip)
1033
1034
1035
1036 outTxt.append('</font>')
1037 if includeDefs:
1038 txt = """
1039 <p><b>Definitions:</b>
1040 <ul>
1041 <li> <i>% Pure:</i> The percentage of, for example, known positives predicted to be positive.
1042 <li> <i>% Accurate:</i> The percentage of, for example, predicted positives that actually
1043 are positive.
1044 </ul>
1045 """
1046 outTxt.append(txt)
1047
1048 if fullPage:
1049 outTxt.append("""</body></html>""")
1050 return '\n'.join(outTxt)
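# Hedged sketch for ScreenToHtml(), reusing the hypothetical results from the
# ShowVoteResults() sketch above: write the report to an HTML file next to the
# generated vote image.
#
#   html = ScreenToHtml(nGood, nBad, nSkip, avgGood, avgBad, avgSkip, table)
#   open('screen_report.html', 'w').write(html)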
1051
1052
1053 def MakePredPlot(details,indices,data,goodVotes,badVotes,nRes,idCol=0,verbose=0):
1054 """
1055
1056 **Arguments**
1057
1058 - details: a CompositeRun.RunDetails object
1059
1060 - indices: a sequence of integer indices into _data_
1061
1062 - data: the data set in question. We assume that the ids for
1063 the data points are in the _idCol_ column
1064
1065 - goodVotes/badVotes: predictions where the model was correct/incorrect.
1066 These are sequences of 4-tuples:
1067 (answer,prediction,confidence,index into _indices_)
1068
1069 """
1070 if not hasattr(details,'predPlot') or not details.predPlot:
1071 return
1072
1073 if verbose: message('\n-> Constructing Prediction (Hanneke) Plot')
1074 outF = open(details.predPlot,'w+')
1075 gnuF = open('%s.gnu'%details.predPlot,'w+')
1076
1077 ptIds = [data[x][idCol] for x in indices]
1078
1079
1080
1081 origConn = DbConnect(details.dbName,details.tableName,
1082 user=details.dbUser,password=details.dbPassword)
1083 colNames = origConn.GetColumnNames()
1084 idName = colNames[idCol]
1085 if not hasattr(details,'predActTable') or \
1086 not details.predActTable or \
1087 details.predActTable==details.tableName:
1088 actConn = origConn
1089 else:
1090 actConn = DbConnect(details.dbName,details.predActTable,
1091 user=details.dbUser,password=details.dbPassword)
1092 if verbose: message('\t-> Pulling Activity Data')
1093 pts = []
1094
1095 if type(ptIds[0]) not in [type(''),type(u'')]:
1096 ptIds = [str(x) for x in ptIds]
1097 whereL = [DbModule.placeHolder]*len(ptIds)
1098 if hasattr(details,'predActCol') and details.predActCol:
1099 actColName=details.predActCol
1100 else:
1101 actColName = actConn.GetColumnNames()[-1]
1102
1103 whereTxt = "%s in (%s)"%(idName,','.join(whereL))
1104 rawD = actConn.GetData(fields='%s,%s'%(idName,actColName),
1105 where=whereTxt,extras=ptIds)
1106
1107 if verbose: message('\t-> Creating Plot')
1108 acts = [None]*len(ptIds)
1109 for entry in rawD:
1110 id,act = entry
1111 idx = ptIds.index(id)
1112 acts[idx] = act
1113 outF.write('#ID Pred Conf %s\n'%(actColName))
1114 for ans,pred,conf,idx in goodVotes:
1115 act = acts[idx]
1116 if act!='None':
1117 act= float(act)
1118 else:
1119 act=0
1120 outF.write('%s %d %.4f %f\n'%(ptIds[idx],pred,conf,act))
1121 for ans,pred,conf,idx in badVotes:
1122 act = acts[idx]
1123 if act!='None':
1124 act= float(act)
1125 else:
1126 act=0
1127 outF.write('%s %d %.4f %f\n'%(ptIds[idx],pred,conf,act))
1128 outF.close()
1129 if not hasattr(details,'predLogScale') or not details.predLogScale:
1130 actLabel = actColName
1131 else:
1132 actLabel= 'log(%s)'%(actColName)
1133 actLabel = actLabel.replace('_',' ')
1134 gnuHdr="""# Generated by ScreenComposite.py version: %s
1135 set size square 0.7
1136 set yrange [:1]
1137 set data styl points
1138 set ylab 'confidence'
1139 set xlab '%s'
1140 set grid
1141 set nokey
1142 set term postscript enh color solid "Helvetica" 16
1143 set term X
1144 """%(__VERSION_STRING,actLabel)
1145 gnuF.write(gnuHdr)
1146 plots = []
1147 for i in range(nRes):
1148 if not hasattr(details,'predLogScale') or not details.predLogScale:
1149 plots.append("'%s' us 4:($2==%d?$3:0/0)"%(details.predPlot,i))
1150 else:
1151 plots.append("'%s' us (log10($4)):($2==%d?$3:0/0)"%(details.predPlot,i))
1152 gnuF.write("plot %s\n"%(','.join(plots)))
1153 gnuTail="""
1154 # EOF
1155 """
1156 gnuF.write(gnuTail)
1157 gnuF.close()
1158 if hasattr(details,'predShow') and details.predShow:
1159 try:
1160 import os
1161 from Gnuplot import Gnuplot
1162 p = Gnuplot()
1163 p('cd "%s"'%(os.getcwd()))
1164 p('load "%s.gnu"'%(details.predPlot))
1165 input('press return to continue...\n')
1166 except:
1167 import traceback
1168 traceback.print_exc()
1169
1170
1171
1174
1188
1190 """ prints a list of arguments for when this is used from the
1191 command line and then exits
1192
1193 """
1194 print(__doc__)
1195 sys.exit(-1)
1196
1198 """ prints the version number of the program
1199
1200 """
1201 print('This is ScreenComposite.py version %s'%(__VERSION_STRING))
1202 if includeArgs:
1203 import sys
1204 print('command line was:')
1205 print(' '.join(sys.argv))
1206
1207 def ParseArgs(details):
1208 import getopt
1209 try:
1210 args,extras = getopt.getopt(sys.argv[1:],'EDd:t:VN:HThSRF:v:AX',
1211 ['predPlot=','predActCol=','predActTable=',
1212 'predLogScale','predShow',
1213 'OOB','pickleCol=','enrich=',
1214 ])
1215 except:
1216 import traceback
1217 traceback.print_exc()
1218 Usage()
1219
1220 fName = ''
1221 details.predPlot=''
1222 details.predActCol=''
1223 details.predActTable=''
1224 details.predLogScale=''
1225 details.predShow=0
1226 details.errorEstimate=0
1227 details.pickleCol=-1
1228 details.enrichTgt=-1
1229 for arg,val in args:
1230 if arg == '-d':
1231 details.dbName = val
1232 elif arg == '-D':
1233 details.detailedScreen = 1
1234 elif arg == '-t':
1235 details.partialVote = 1
1236 voteTol = eval(val)
1237 if type(voteTol) not in [type([]),type((1,1))]:
1238 voteTol = [voteTol]
1239 for tol in voteTol:
1240 if tol > 1 or tol < 0:
1241 error('Voting threshold must be between 0 and 1')
1242 sys.exit(-2)
1243 details.screenVoteTol=voteTol
1244 elif arg == '-N':
1245 details.note=val
1246 elif arg == '-H':
1247 details.doTraining=0
1248 details.doHoldout=1
1249 elif arg == '-T':
1250 details.doHoldout=0
1251 details.doTraining=1
1252 elif arg == '-E':
1253 details.errorAnalysis=1
1254 details.detailedScreen=1
1255 elif arg == '-A':
1256 details.showAll=1
1257 details.detailedScreen=1
1258 elif arg == '-S':
1259 details.shuffleActivities=1
1260 elif arg == '-R':
1261 details.randomActivities=1
1262 elif arg == '-h':
1263 Usage()
1264 elif arg == '-F':
1265 details.filterFrac=float(val)
1266 elif arg == '-v':
1267 details.filterVal=float(val)
1268 elif arg == '-V':
1269 verbose=1
1270 elif arg == '--predPlot':
1271 details.detailedScreen=1
1272 details.predPlot=val
1273 elif arg == '--predActCol':
1274 details.predActCol=val
1275 elif arg == '--predActTable':
1276 details.predActTable=val
1277 elif arg == '--predLogScale':
1278 details.predLogScale=1
1279 elif arg == '--predShow':
1280 details.predShow=1
1283 elif arg == '--OOB':
1284 details.errorEstimate=1
1285 elif arg == '--pickleCol':
1286 details.pickleCol=int(val)-1
1287 elif arg == '--enrich':
1288 details.enrichTgt=int(val)
1289 else:
1290 Usage()
1291
1292 if len(extras) < 1:
1293 Usage()
1294 return extras
1295
1296
1297 if __name__ == '__main__':
1298 details = SetDefaults()
1299 extras = ParseArgs(details)
1300 ShowVersion(includeArgs=1)
1301
1302 models = []
1303 if details.note and details.dbName:
1304 tblName = extras[0]
1305 message('-> Retrieving models from database')
1306 conn = DbConnect(details.dbName,tblName)
1307 blobs = conn.GetData(fields='model',where="where note='%s'"%(details.note))
1308 for blob in blobs:
1309 blob = blob[0]
1310 try:
1311 models.append(cPickle.loads(str(blob)))
1312 except:
1313 import traceback
1314 traceback.print_exc()
1315 message('Model load failed')
1316
1317 else:
1318 message('-> Loading model')
1319 modelFile=open(extras[0],'rb')
1320 models.append(cPickle.load(modelFile))
1321 if not len(models):
1322 error('No composite models found')
1323 sys.exit(-1)
1324 else:
1325 message('-> Working with %d models.'%len(models))
1326
1327 extras = extras[1:]
1328
1329 for fName in extras:
1330 if details.dbName != '':
1331 details.tableName = fName
1332 data = details.GetDataSet(pickleCol=details.pickleCol,
1333 pickleClass=DataStructs.ExplicitBitVect)
1334 else:
1335 data = DataUtils.BuildDataSet(fName)
1336 descNames = data.GetVarNames()
1337 nModels = len(models)
1338 screenResults = [None]*nModels
1339 dataSets = [None]*nModels
1340 message('-> Constructing and screening data sets')
1341 testIdx = range(data.GetNPts())
1342 trainIdx = testIdx
1343
1344 for modelIdx in range(nModels):
1345
1346 tmpD = data
1347 model = models[modelIdx]
1348 message('.',noRet=1)
1349
1350 try:
1351 seed = model._randomSeed
1352 except AttributeError:
1353 pass
1354 else:
1355 DataUtils.InitRandomNumbers(seed)
1356
1357 if details.shuffleActivities or details.randomActivities:
1358 shuffle = details.shuffleActivities
1359 randomize = 1
1360 DataUtils.RandomizeActivities(tmpD,shuffle=details.shuffleActivities,
1361 runDetails=details)
1362 else:
1363 randomize = False
1364 shuffle = False
1365
1366 if hasattr(model,'_shuffleActivities') and \
1367 model._shuffleActivities and \
1368 not shuffle:
1369 message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
1370 message('****** WARNING: Shuffled model being screened with unshuffled data.')
1371 message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
1372 if hasattr(model,'_randomizeActivities') and \
1373 model._randomizeActivities and \
1374 not randomize:
1375 message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
1376 message('****** WARNING: Random model being screened with non-random data.')
1377 message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
1378
1379 trainIdx,testIdx = PrepareDataFromDetails(model,details,tmpD,verbose=1)
1380 screenResults[modelIdx] = CollectResults(testIdx,tmpD,model,
1381 errorEstimate=details.errorEstimate)
1382 dataSets[modelIdx] = testIdx
1383 for tol in details.screenVoteTol:
1384 if len(details.screenVoteTol)>1:
1385 message('\n-----*****-----*****-----*****-----*****-----*****-----*****-----\n')
1386 message('Tolerance: %f'%tol)
1387 nGood = numpy.zeros(nModels,numpy.float)
1388 nBad = numpy.zeros(nModels,numpy.float)
1389 nSkip = numpy.zeros(nModels,numpy.float)
1390 confGood = numpy.zeros(nModels,numpy.float)
1391 confBad = numpy.zeros(nModels,numpy.float)
1392 confSkip = numpy.zeros(nModels,numpy.float)
1393 if details.enrichTgt >= 0:
1394 enrichments = numpy.zeros(nModels,numpy.float)
1395 goodVoteDict = {}
1396 badVoteDict = {}
1397 noVoteDict = {}
1398 voteTab = None
1399 for modelIdx in range(nModels):
1400 model = models[modelIdx]
1401 model.SetInputOrder(descNames)
1402 testIdx = dataSets[modelIdx]
1403 screenRes = screenResults[modelIdx]
1404 if not details.detailedScreen:
1405 g,b,s,aG,aB,aS,vT = ScreenIt(model,testIdx,tmpD,details.partialVote,tol,
1406 verbose=details.verbose,screenResults=screenRes)
1407 else:
1408 if model.GetActivityQuantBounds():
1409 nRes = len(model.GetActivityQuantBounds())+1
1410 else:
1411 nRes = model.GetQuantBounds()[1][-1]
1412 badVotes = []
1413 noVotes = []
1414 if (hasattr(details,'showAll') and details.showAll) or \
1415 (hasattr(details,'predPlot') and details.predPlot):
1416 goodVotes = []
1417 else:
1418 goodVotes = None
1419 g,b,s,aG,aB,aS,vT = ShowVoteResults(testIdx,tmpD,model,nRes,tol,
1420 verbose=details.verbose,
1421 screenResults=screenRes,
1422 badVotes=badVotes,noVotes=noVotes,
1423 goodVotes=goodVotes,
1424 errorEstimate=details.errorEstimate)
1425 if voteTab is None:
1426 voteTab = numpy.zeros(vT.shape,numpy.float)
1427 if details.errorAnalysis:
1428 for a,p,c,idx in badVotes:
1429 label = testIdx[idx]
1430 if hasattr(details,'enrichTgt') and details.enrichTgt >=0:
1431 if a==details.enrichTgt:
1432 badVoteDict[label] = badVoteDict.get(label,0)+1
1433 else:
1434 badVoteDict[label] = badVoteDict.get(label,0)+1
1435 for a,p,c,idx in noVotes:
1436 label = testIdx[idx]
1437 if hasattr(details,'enrichTgt') and details.enrichTgt >=0:
1438 if a==details.enrichTgt:
1439 noVoteDict[label] = noVoteDict.get(label,0)+1
1440 else:
1441 noVoteDict[label] = noVoteDict.get(label,0)+1
1442
1443 if hasattr(details,'showAll') and details.showAll:
1444 for a,p,c,idx in goodVotes:
1445 label = testIdx[idx]
1446 if details.enrichTgt >=0:
1447 if a==details.enrichTgt:
1448 goodVoteDict[label] = goodVoteDict.get(label,0)+1
1449 else:
1450 goodVoteDict[label] = goodVoteDict.get(label,0)+1
1451
1452 if details.enrichTgt>-1:
1453 enrichments[modelIdx] = CalcEnrichment(vT,tgt=details.enrichTgt)
1454
1455 voteTab += vT
1456 if details.detailedScreen and hasattr(details,'predPlot') and details.predPlot:
1457 MakePredPlot(details,testIdx,tmpD,goodVotes,badVotes,nRes,verbose=1)
1458
1459 if hasattr(details,'showAll') and details.showAll:
1460 print('-v-v-v-v-v-v-v- All Votes -v-v-v-v-v-v-v-')
1461 print('id, prediction, confidence, flag(-1=skipped,0=wrong,1=correct)')
1462 for ans,pred,conf,idx in goodVotes:
1463 pt = tmpD[testIdx[idx]]
1464 assert model.GetActivityQuantBounds() or pt[-1]==ans,\
1465 'bad point?: %s != %s'%(str(pt[-1]),str(ans))
1466 print('%s, %d, %.4f, 1'%(str(pt[0]),pred,conf))
1467 for ans,pred,conf,idx in badVotes:
1468 pt = tmpD[testIdx[idx]]
1469 assert model.GetActivityQuantBounds() or pt[-1]==ans,\
1470 'bad point?: %s != %s'%(str(pt[-1]),str(ans))
1471 print('%s, %d, %.4f, 0'%(str(pt[0]),pred,conf))
1472 for ans,pred,conf,idx in noVotes:
1473 pt = tmpD[testIdx[idx]]
1474 assert model.GetActivityQuantBounds() or pt[-1]==ans,\
1475 'bad point?: %s != %s'%(str(pt[-1]),str(ans))
1476 print('%s, %d, %.4f, -1'%(str(pt[0]),pred,conf))
1477 print('-^-^-^-^-^-^-^- -^-^-^-^-^-^-^-')
1478
1479 nGood[modelIdx] = g
1480 nBad[modelIdx] = b
1481 nSkip[modelIdx] = s
1482 confGood[modelIdx] = aG
1483 confBad[modelIdx] = aB
1484 confSkip[modelIdx] = aS
1485 print()
1486
1487 if nModels > 1:
1488 print('-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*')
1489 print('AVERAGES:')
1490
1491 avgNBad = sum(nBad)/nModels
1492 devNBad = numpy.sqrt(sum((nBad-avgNBad)**2)/(nModels-1))
1493
1494 bestIdx = numpy.argsort(nBad)[0]
1495
1496 avgNGood = sum(nGood)/nModels
1497 devNGood = numpy.sqrt(sum((nGood-avgNGood)**2)/(nModels-1))
1498
1499 avgNSkip = sum(nSkip)/nModels
1500 devNSkip = numpy.sqrt(sum((nSkip-avgNSkip)**2)/(nModels-1))
1501
1502 avgConfBad = sum(confBad)/nModels
1503 devConfBad = numpy.sqrt(sum((confBad-avgConfBad)**2)/(nModels-1))
1504
1505 avgConfGood = sum(confGood)/nModels
1506 devConfGood = numpy.sqrt(sum((confGood-avgConfGood)**2)/(nModels-1))
1507
1508 avgConfSkip = sum(confSkip)/nModels
1509 devConfSkip = numpy.sqrt(sum((confSkip-avgConfSkip)**2)/(nModels-1))
1510
1511 nClassified = avgNGood + avgNBad
1512 nExamples = nClassified + avgNSkip
1513 print('Misclassifications: \t%%%5.2f(%%%5.2f) %4.1f(%4.1f) / %d'%(100*avgNBad/nExamples,
1514 100*devNBad/nExamples,
1515 avgNBad,devNBad,
1516 nExamples))
1517 if avgNSkip>0:
1518 print('\tthreshold: \t%%%5.2f(%%%5.2f) %4.1f(%4.1f) / %d'%(100*avgNBad/nClassified,
1519 100*devNBad/nClassified,
1520 avgNBad,devNBad,
1521 nClassified))
1522 print()
1523 print('Number Skipped: %%%4.2f(%%%4.2f) %4.2f(%4.2f)'%(100*avgNSkip/nExamples,
1524 100*devNSkip/nExamples,
1525 avgNSkip,devNSkip))
1526
1527
1528 print()
1529 print('Confidences:')
1530 print('\tCorrect: \t%4.2f(%4.2f)'%(100*avgConfGood,100*devConfGood))
1531 print('\tIncorrect: \t%4.2f(%4.2f)'%(100*avgConfBad,100*devConfBad))
1532 if avgNSkip>0:
1533 print('\tSkipped: \t%4.2f(%4.2f)'%(100*avgConfSkip,100*devConfSkip))
1534
1535 if details.detailedScreen:
1536 message('Results Table:')
1537 voteTab = numpy.transpose(voteTab)/nModels
1538 nResultCodes = len(voteTab)
1539 colCounts = numpy.sum(voteTab,0)
1540 rowCounts = numpy.sum(voteTab,1)
1541 print()
1542 for i in range(nResultCodes):
1543 if rowCounts[i]==0: rowCounts[i]=1
1544 row = voteTab[i]
1545 message(' ',noRet=1)
1546 for j in range(nResultCodes):
1547 entry = row[j]
1548 message(' % 6.2f'%entry,noRet=1)
1549 message(' | % 4.2f'%(100.*voteTab[i,i]/rowCounts[i]))
1550 message(' ',noRet=1)
1551 for i in range(nResultCodes):
1552 message('-------',noRet=1)
1553 message('')
1554 message(' ',noRet=1)
1555 for i in range(nResultCodes):
1556 if colCounts[i]==0: colCounts[i]=1
1557 message(' % 6.2f'%(100.*voteTab[i,i]/colCounts[i]),noRet=1)
1558 message('')
1559 if details.enrichTgt >-1:
1560 mean = sum(enrichments)/nModels
1561 enrichments -= mean
1562 dev = numpy.sqrt(sum(enrichments*enrichments)/(nModels-1))
1563 message(' Enrichment of value %d: %.4f (%.4f)'%(details.enrichTgt,mean,dev))
1564 else:
1565 bestIdx=0
1566 print('------------------------------------------------')
1567 print('Best Model: ',bestIdx+1)
1568 bestBad = nBad[bestIdx]
1569 bestGood = nGood[bestIdx]
1570 bestSkip = nSkip[bestIdx]
1571 nClassified = bestGood + bestBad
1572 nExamples = nClassified + bestSkip
1573 print('Misclassifications: \t%%%5.2f %d / %d'%(100*bestBad/nExamples,
1574 bestBad,nExamples))
1575 if bestSkip>0:
1576 print('\tthreshold: \t%%%5.2f %d / %d'%(100*bestBad/nClassified,
1577 bestBad,nClassified))
1578 print()
1579 print('Number Skipped: %%%4.2f %d'%(100*bestSkip/nExamples,
1580 bestSkip))
1581
1582 print()
1583 print('Confidences:')
1584 print('\tCorrect: \t%4.2f'%(100*confGood[bestIdx]))
1585 print('\tIncorrect: \t%4.2f'%(100*confBad[bestIdx]))
1586 if bestSkip>0:
1587 print('\tSkipped: \t%4.2f'%(100*confSkip[bestIdx]))
1588
1589 if nModels == 1 and details.detailedScreen:
1590 message('')
1591 message('Results Table:')
1592 voteTab = numpy.transpose(vT)
1593 nResultCodes = len(vT)
1594 colCounts = numpy.sum(voteTab,0)
1595 rowCounts = numpy.sum(voteTab,1)
1596 message('')
1597 for i in range(nResultCodes):
1598 if rowCounts[i]==0: rowCounts[i]=1
1599 row = voteTab[i]
1600 message(' ',noRet=1)
1601 for j in range(nResultCodes):
1602 entry = row[j]
1603 message(' % 6.2f'%entry,noRet=1)
1604 message(' | % 4.2f'%(100.*voteTab[i,i]/rowCounts[i]))
1605 message(' ',noRet=1)
1606 for i in range(nResultCodes):
1607 message('-------',noRet=1)
1608 message('')
1609 message(' ',noRet=1)
1610 for i in range(nResultCodes):
1611 if colCounts[i]==0: colCounts[i]=1
1612 message(' % 6.2f'%(100.*voteTab[i,i]/colCounts[i]),noRet=1)
1613 message('')
1614 if details.errorAnalysis:
1615 message('\n*-*-*-*-*-*-*-*- ERROR ANALYSIS -*-*-*-*-*-*-*-*\n')
1616 ks = badVoteDict.keys()
1617 if len(ks):
1618 message(' ---> Bad Vote Counts')
1619 ks = noVoteDict.keys()
1620 if len(ks):
1621 message(' ---> Skipped Compound Counts')
1622 for k in ks:
1623 pt = data[k]
1624 message('%s,%d'%(str(pt[0]),noVoteDict[k]))
1625
1626 if hasattr(details,'showAll') and details.showAll:
1627 ks = goodVoteDict.keys()
1628 if len(ks):
1629 message(' ---> Good Vote Counts')
1630 for k in ks:
1631 pt = data[k]
1632 message('%s,%d'%(str(pt[0]),goodVoteDict[k]))
1633