RDKit
Open-source cheminformatics and machine learning.
InfoGainFuncs.h
Go to the documentation of this file.
1 // $Id$
2 //
3 // Copyright (C) 2003 Rational Discovery LLC
4 //
5 
6 #ifndef INFOGAINFUNC_H
7 #define INFOGAINFUNC_H
8 
#include <cmath>
#include <vector>

#include <RDGeneral/types.h>
10 
11 namespace RDInfoTheory {
12 
// Computes the chi-square statistic of a contingency table.
//
// dMat points to dim1*dim2 values stored row by row, i.e. element (i,j) is
// dMat[i*dim2 + j]. Each column corresponds to a class and each row to a
// descriptor (or variable) state; for a 3x3 problem the table looks like:
//
//              1     2     3   | Totals
//        1 |  N11   N12   N13  |   R1
//        2 |  N21   N22   N23  |   R2
//        3 |  N31   N32   N33  |   R3
//   Totals |   C1    C2    C3  |   N
//
// The chi-square formula is
//   chi = sum_i( (N/Ri) * sum_j( Nij^2 / Cj ) ) - N
//
// Returns the statistic as a double. Returns 0.0 when either dimension is
// not positive. Rows and columns whose total is zero are skipped, so they
// contribute nothing instead of turning the result into NaN.
// NOTE(review): entries are presumed to be non-negative counts/weights.
template <class T>
double ChiSquare(T *dMat, long int dim1, long int dim2) {
  if (dim1 <= 0 || dim2 <= 0) {
    return 0.0;
  }

  typedef std::vector<double>::size_type size_type;

  // Marginals are accumulated in double: this keeps fractional weights
  // intact and avoids wrap-around when T is a narrow integer type.
  std::vector<double> rowSums(static_cast<size_type>(dim1), 0.0);
  std::vector<double> colSums(static_cast<size_type>(dim2), 0.0);
  double total = 0.0;
  for (long int i = 0; i < dim1; ++i) {
    const T *row = dMat + i * dim2;
    for (long int j = 0; j < dim2; ++j) {
      const double cell = static_cast<double>(row[j]);
      rowSums[static_cast<size_type>(i)] += cell;
      colSums[static_cast<size_type>(j)] += cell;
    }
    total += rowSums[static_cast<size_type>(i)];
  }

  double chi = 0.0;
  for (long int i = 0; i < dim1; ++i) {
    const double rowTotal = rowSums[static_cast<size_type>(i)];
    if (rowTotal == 0.0) {
      continue;  // empty row: would otherwise divide by zero
    }
    const T *row = dMat + i * dim2;
    double rchi = 0.0;
    for (long int j = 0; j < dim2; ++j) {
      const double colTotal = colSums[static_cast<size_type>(j)];
      if (colTotal == 0.0) {
        continue;  // empty column: would otherwise be 0/0
      }
      const double cell = static_cast<double>(row[j]);
      rchi += (cell * cell) / colTotal;
    }
    chi += (total / rowTotal) * rchi;
  }
  chi -= total;

  return chi;
}
62 
// Computes the Shannon entropy, in bits, of the distribution obtained by
// normalizing the dim values starting at tPtr by their sum.
//
// tPtr : pointer to dim counts (or weights)
// dim  : number of entries to read
//
// Returns sum_i( -p_i * log2(p_i) ) with p_i = tPtr[i] / sum(tPtr); entries
// with p_i == 0 are skipped. Returns 0.0 when the entries sum to zero (which
// includes dim <= 0).
template <class T>
double InfoEntropy(T *tPtr, long int dim) {
  // The total is kept in double so that narrow integer T cannot wrap around
  // and yield probabilities greater than one.
  double nInstances = 0.0;
  for (long int i = 0; i < dim; ++i) {
    nInstances += static_cast<double>(tPtr[i]);
  }
  if (nInstances == 0.0) {
    return 0.0;
  }

  double accum = 0.0;
  for (long int i = 0; i < dim; ++i) {
    const double d = static_cast<double>(tPtr[i]) / nInstances;
    if (d != 0.0) {
      accum -= d * std::log(d);
    }
  }
  // convert from nats to bits
  return accum / std::log(2.0);
}
82 
// Computes the information gain of a contingency table.
//
// dMat points to dim1*dim2 values stored row by row, i.e. element (i,j) is
// dMat[i*dim2 + j].
//
// The result is
//   gain = InfoEntropy(column totals) - sum_i( Ri * InfoEntropy(row i) ) / N
// where Ri is the total of row i and N is the grand total of the table.
//
// Returns 0.0 when the grand total is zero or when either dimension is not
// positive.
template <class T>
double InfoEntropyGain(T *dMat, long int dim1, long int dim2) {
  if (dim1 <= 0 || dim2 <= 0) {
    return 0.0;
  }

  typedef typename std::vector<T>::size_type size_type;

  // Column totals stay in T because they are handed to InfoEntropy, which
  // takes a T*; T therefore has to be wide enough to hold a column total.
  std::vector<T> overallRes(static_cast<size_type>(dim2), static_cast<T>(0));

  // Weighted sum of the per-row entropies; the row totals are only needed as
  // weights, so they are accumulated on the fly in double.
  double term2 = 0.0;
  for (long int i = 0; i < dim1; ++i) {
    T *row = dMat + i * dim2;
    double rowTotal = 0.0;
    for (long int j = 0; j < dim2; ++j) {
      rowTotal += static_cast<double>(row[j]);
      overallRes[static_cast<size_type>(j)] += row[j];
    }
    term2 += rowTotal * InfoEntropy(row, dim2);
  }

  // The grand total is kept in double so fractional weights are not
  // truncated.
  double total = 0.0;
  for (long int j = 0; j < dim2; ++j) {
    total += static_cast<double>(overallRes[static_cast<size_type>(j)]);
  }
  if (total == 0.0) {
    return 0.0;
  }

  return InfoEntropy(&overallRes[0], dim2) - term2 / total;
}
133 
134 
135 }
136 #endif
137 
138 
Class used to rank bits based on a specified measure of information.
double InfoEntropyGain(T *dMat, long int dim1, long int dim2)
Definition: InfoGainFuncs.h:83
double ChiSquare(T *dMat, long int dim1, long int dim2)
Definition: InfoGainFuncs.h:13
double InfoEntropy(T *tPtr, long int dim)
Definition: InfoGainFuncs.h:63