
Source Code for Module rdkit.ML.InfoTheory.entropy

#
#  Copyright (C) 2000-2008  greg Landrum and Rational Discovery LLC
#

""" Informational Entropy functions

  The definitions used are the same as those in Tom Mitchell's
  book "Machine Learning"

"""
import numpy
import math

from rdkit.six.moves import xrange

# try to get the C versions of these routines
try:
  import rdkit.ML.InfoTheory.rdInfoTheory as cEntropy
except ImportError:
  hascEntropy = 0
else:
  hascEntropy = 1

# it's pretty obvious what this is for ;-)
_log2 = math.log(2)
def PyInfoEntropy(results):
  """ Calculates the informational entropy of a set of results.

  **Arguments**

    results is a 1D Numeric array containing the number of times a
    given set hits each possible result.
    For example, if a function has 3 possible results, and the
    variable in question hits them 5, 6 and 1 times each,
    results would be [5,6,1]

  **Returns**

    the informational entropy

  """
  nInstances = float(sum(results))
  if nInstances == 0:
    # to return zero or one... that is the question
    return 0
  probs = results / nInstances

  #-------
  #  NOTE: this is a little hack to allow the use of Numeric
  #   functionality to calculate the informational entropy.
  #   The problem is that the system log function pitches a fit
  #   when you call log(0.0).  We are perfectly happy with that
  #   returning *anything* because we're going to multiply by 0 anyway.

  # Here's the risky (but marginally faster) way to do it:
  #  add a small number to probs and hope it doesn't screw
  #  things up too much.
  # t = probs + 1e-10

  # Here's a perfectly safe approach that's a little bit more obfuscated
  #  and a tiny bit slower
  t = numpy.choose(numpy.greater(probs, 0.0), (1, probs))
  return sum(-probs * numpy.log(t) / _log2)
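A minimal usage sketch for PyInfoEntropy (an illustration, not part of the module source): the counts follow the docstring's own [5,6,1] example, and the value of roughly 1.32 bits is the expected result of the formula above rather than captured program output.

    import numpy
    from rdkit.ML.InfoTheory.entropy import PyInfoEntropy

    # three possible results, hit 5, 6 and 1 times respectively
    counts = numpy.array([5, 6, 1])
    print(PyInfoEntropy(counts))  # roughly 1.32 bits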
def PyInfoGain(varMat):
  """ calculates the information gain for a variable

  **Arguments**

    varMat is a Numeric array with the number of occurrences
      of each result for each possible value of the given variable.

    So, for a variable which adopts 4 possible values and a result which
      has 3 possible values, varMat would be 4x3

  **Returns**

    The expected information gain
  """
  variableRes = numpy.sum(varMat, 1)  # indexed by variable, Sv in Mitchell's notation
  overallRes = numpy.sum(varMat, 0)  # indexed by result, S in Mitchell's notation

  term2 = 0
  for i in xrange(len(variableRes)):
    term2 = term2 + variableRes[i] * InfoEntropy(varMat[i])
  tSum = sum(overallRes)
  if tSum != 0.0:
    term2 = 1. / tSum * term2
    gain = InfoEntropy(overallRes) - term2
  else:
    gain = 0
  return gain
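A comparable sketch for PyInfoGain (again illustrative, not part of the module): a 2x2 varMat for a two-valued variable and a two-valued result. The overall entropy is 1 bit and the expected gain works out to about 0.19 bits; float counts are used on the assumption that the module-level InfoEntropy may resolve to the C implementation.

    import numpy
    from rdkit.ML.InfoTheory.entropy import PyInfoGain

    # rows: the values the variable can adopt; columns: counts of each result
    varMat = numpy.array([[3., 1.],
                          [1., 3.]])
    print(PyInfoGain(varMat))  # about 0.19 bits of expected information gain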
# if we have the C versions, use them, otherwise use the python stuff
if hascEntropy:
  InfoEntropy = cEntropy.InfoEntropy
  InfoGain = cEntropy.InfoGain
else:
  InfoEntropy = PyInfoEntropy
  InfoGain = PyInfoGain
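Downstream code is normally expected to go through the module-level InfoEntropy/InfoGain names, which resolve to the C routines when rdInfoTheory is available and fall back to the Py* versions otherwise. A hedged sketch of that usage (whether the C versions accept integer arrays is an assumption here, so floats are used):

    import numpy
    from rdkit.ML.InfoTheory import entropy

    counts = numpy.array([5.0, 6.0, 1.0])
    # cEntropy.InfoEntropy if the C extension imported, PyInfoEntropy otherwise
    print(entropy.InfoEntropy(counts))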