Package rdkit :: Package ML :: Package NaiveBayes :: Module ClassificationModel
[hide private]
[frames] | no frames]

Source Code for Module rdkit.ML.NaiveBayes.ClassificationModel

  1  # $Id$ 
  2  # 
  3  #  Copyright (C) 2004-2008 Greg Landrum and Rational Discovery LLC 
  4  #      All Rights Reserved 
  5  # 
  6  """ Defines Naive Bayesian classification model 
  7     Based on development in: Chapter 6 of "Machine Learning" by Tom Mitchell 
  8   
  9  """ 
 10  import numpy 
 11  from rdkit.ML.Data import Quantize 
 12  from rdkit.six import iteritems 
13 -def _getBinId(val, qBounds) :
14 bid = 0 15 for bnd in qBounds: 16 if (val > bnd) : 17 bid += 1 18 return bid
# FIX: this class has not been updated to new-style classes
# (RD Issue380) because that would break all of our legacy pickled
# data. Until a solution is found for this breakage, an update is
# impossible.
class NaiveBayesClassifier:
    """ Naive Bayes classifier working either on (optionally quantized)
    descriptor values or on the bits of a signature/fingerprint.

    _NaiveBayesClassifier_s keep the following pieces of internal state:

    1) _Examples_: a list of examples which have been predicted

    2) _TrainingExamples_: list of training examples - the descriptor values
       of these examples are quantized based on info gain using
       ML/Data/Quantize.py if necessary

    3) _TestExamples_: the list of examples used to test the model

    4) _BadExamples_: list of examples that were incorrectly classified

    5) _QBoundVals_: quantization bound values for each variable - a dict
       mapping the attribute index to its list of bounds

    6) _QBounds_: number of bounds for each variable

    Items 1-4 are accessible via the Get/Set methods below.

    """

    def __init__(self, attrs, nPossibleVals, nQuantBounds, mEstimateVal=-1.0, useSigs=False):
        """ Constructor

          **Arguments**

            - attrs: indices of the descriptors used by the model (when
              _useSigs_ is set: ids of the bits that are used)

            - nPossibleVals: number of possible values of each variable, the
              last entry is the number of classes

            - nQuantBounds: number of quantization bounds for each variable,
              a value <= 0 means the descriptor value is used directly as
              the bin id

            - mEstimateVal: if > 0, conditional probabilities are smoothed
              with an m-estimate using this equivalent sample size

            - useSigs: if true the examples are expected to carry, in
              position 1, an object with a GetBit() method

        """
        self._attrs = attrs
        self._mEstimateVal = mEstimateVal
        self._useSigs = useSigs

        self._classProbs = {}

        self._examples = []
        self._trainingExamples = []
        self._testExamples = []
        self._badExamples = []
        self._QBoundVals = {}
        self._nClasses = nPossibleVals[-1]
        self._qBounds = nQuantBounds
        self._nPosVals = nPossibleVals
        self._needsQuant = 1

        self._name = ""
        self.mprob = -1.0

        # The conditional probabilities are indexed on the activity class,
        # the descriptor id and the descriptor bin id. In descriptor mode
        # nested lists are used, in signature mode a dict per class (bit ids
        # may be sparse) holding a two-element list per bit.
        self._condProbs = [None] * self._nClasses
        for i in range(self._nClasses):
            if not self._useSigs:
                nA = max(self._attrs) + 1
                self._condProbs[i] = [None] * nA
                for j in range(nA):
                    nV = self._nPosVals[j]
                    if self._qBounds[j]:
                        # n bounds split the value range into n+1 bins
                        nV = max(nV, self._qBounds[j] + 1)
                    self._condProbs[i][j] = [0.0] * nV
            else:
                self._condProbs[i] = {}
                for idx in self._attrs:
                    self._condProbs[i][idx] = [0.0] * 2

    def GetName(self):
        """ returns the name of the model """
        return self._name

    def SetName(self, name):
        """ sets the name of the model """
        self._name = name

    def NameModel(self, varNames):
        """ assigns the default name to the model, _varNames_ is ignored """
        self.SetName('NaiveBayesClassifier')

    def GetExamples(self):
        """ returns the list of examples that have been predicted """
        return self._examples

    def SetExamples(self, examples):
        """ replaces the list of examples that have been predicted """
        self._examples = examples

    def GetTrainingExamples(self):
        """ returns the list of training examples """
        return self._trainingExamples

    def SetTrainingExamples(self, examples):
        """ sets the examples used by trainModel() """
        self._trainingExamples = examples

    def GetTestExamples(self):
        """ returns the list of test examples """
        return self._testExamples

    def SetTestExamples(self, examples):
        """ sets the list of test examples """
        self._testExamples = examples

    def SetBadExamples(self, examples):
        """ sets the list of misclassified examples """
        self._badExamples = examples

    def GetBadExamples(self):
        """ returns the list of misclassified examples """
        return self._badExamples

    def _computeQuantBounds(self):
        """ Finds the quantization bounds of every descriptor that needs them

        For each attribute with a positive number of bounds, every bound
        count from 1 up to that number is tried and the set of bounds giving
        the largest information gain is stored in self._QBoundVals.

        """
        neg = len(self._trainingExamples)
        natr = len(self._attrs)

        # collect the descriptor values (one column per attribute) and the
        # list of outcomes
        allVals = numpy.zeros((neg, natr), 'd')
        res = []
        for i, eg in enumerate(self._trainingExamples):
            res.append(eg[-1])
            for j, ai in enumerate(self._attrs):
                allVals[i, j] = eg[ai]

        # now loop over each of the columns and compute the bounds; the
        # number of bounds is determined by the maximum info gain
        for i, ai in enumerate(self._attrs):
            nbnds = self._qBounds[ai]
            if nbnds > 0:
                mbnds = []
                mgain = -1.0
                for j in range(1, nbnds + 1):
                    bnds, igain = Quantize.FindVarMultQuantBounds(allVals[:, i], j, res,
                                                                  self._nClasses)
                    if igain > mgain:
                        mbnds = bnds
                        mgain = igain
                self._QBoundVals[ai] = mbnds

    def _numBins(self, ai):
        """ returns the number of values attribute _ai_ can take in the model """
        if self._useSigs:
            # a bit is either off or on
            return 2
        if self._qBounds[ai] > 0:
            # n quantization bounds produce n+1 bins
            return 1 + len(self._QBoundVals[ai])
        return self._nPosVals[ai]

    def trainModel(self):
        """ We will assume at this point that the training examples have been set

        We have to estimate the conditional probabilities for each of the
        (binned) descriptor components given an outcome (or class). The
        probability of each class is estimated as well.

        If mEstimateVal is <= 0 the conditional probabilities are plain
        frequencies, otherwise the m-estimate from section 6.9.1.1 of
        "Machine Learning" by Tom Mitchell is used with a uniform prior over
        the possible values of the descriptor.

        Raises a ValueError if the m-estimate is requested for an attribute
        that has no possible values.

        NOTE: the counts are accumulated into the existing tables, which are
        not reset here.

        """
        # first estimate the class probabilities
        n = len(self._trainingExamples)
        for i in range(self._nClasses):
            self._classProbs[i] = 0.0

        # find the bounds for each descriptor value if necessary
        if not self._useSigs and max(self._qBounds) > 0:
            self._computeQuantBounds()

        # accumulate the class frequencies and the per-class bin counts
        ncls = {}
        incr = 1.0 / n
        for eg in self._trainingExamples:
            cls = eg[-1]
            self._classProbs[cls] += incr
            ncls[cls] = ncls.get(cls, 0) + 1
            tmp = self._condProbs[cls]
            if not self._useSigs:
                for ai in self._attrs:
                    bid = eg[ai]
                    if self._qBounds[ai] > 0:
                        bid = _getBinId(bid, self._QBoundVals[ai])
                    tmp[ai][bid] += 1.0
            else:
                for ai in self._attrs:
                    if eg[1].GetBit(ai):
                        tmp[ai][1] += 1.0
                    else:
                        tmp[ai][0] += 1.0

        # convert the counts into probabilities; classes without training
        # examples are left untouched
        mEst = self._mEstimateVal
        for cls in range(self._nClasses):
            if cls not in ncls:
                continue
            tmp = self._condProbs[cls]
            for ai in self._attrs:
                # every bin that can be hit has to be normalized, including
                # the one above the last quantization bound
                nbins = self._numBins(ai)
                if mEst <= 0.0:
                    # this is simply the fraction of time this descriptor
                    # component assumes this value for the examples that
                    # belong to a specific class
                    for bid in range(nbins):
                        tmp[ai][bid] /= ncls[cls]
                else:
                    # more appropriate for unbalanced data. pdesc is the
                    # probability that this descriptor component takes a
                    # specific value in the lack of any other information:
                    # the inverse of the number of possible values.
                    if nbins <= 0:
                        raise ValueError('Neither Bounds set nor data pre-quantized for attribute ' +
                                         str(ai))
                    pdesc = 1.0 / nbins
                    for bid in range(nbins):
                        tmp[ai][bid] += mEst * pdesc
                        tmp[ai][bid] /= (ncls[cls] + mEst)

    def ClassifyExamples(self, examples, appendExamples=0):
        """ classifies a sequence of examples, returns the list of predicted classes (as ints) """
        return [int(self.ClassifyExample(eg, appendExamples)) for eg in examples]

    def GetClassificationDetails(self):
        """ returns the probability of the last prediction """
        return self.mprob

    def ClassifyExample(self, example, appendExamples=0):
        """ Classify an example by multiplying the class probability with the
        conditional probabilities of the example's descriptor bins.
        The most likely class is the one with the largest product.

        If _appendExamples_ is true the example is added to the list of
        predicted examples. Returns the key of the winning class, or -1 if
        the model has no class probabilities; the winning (unnormalized)
        probability is stored in self.mprob.

        """
        if appendExamples:
            self._examples.append(example)

        # legacy pickled models may not carry the _useSigs attribute
        useSigs = getattr(self, '_useSigs', False)

        clsProb = {}
        for key, prob in self._classProbs.items():
            tmp = self._condProbs[key]
            for ai in self._attrs:
                if not useSigs:
                    bid = example[ai]
                    if self._qBounds[ai] > 0:
                        bid = _getBinId(bid, self._QBoundVals[ai])
                else:
                    bid = 1 if example[1].GetBit(ai) else 0
                prob *= tmp[ai][bid]
            clsProb[key] = prob

        mkey = -1
        self.mprob = -1.0
        for key, prob in clsProb.items():
            if prob > self.mprob:
                mkey = key
                self.mprob = prob

        return mkey
278