RDKit
Open-source cheminformatics and machine learning.
InfoGainFuncs.h
Go to the documentation of this file.
1 // $Id$
2 //
3 // Copyright (C) 2003 Rational Discovery LLC
4 //
5 
6 #include <RDGeneral/export.h>
7 #ifndef INFOGAINFUNC_H
8 #define INFOGAINFUNC_H
9 
#include <RDGeneral/types.h>

#include <cmath>
#include <vector>
11 
12 namespace RDInfoTheory {
13 
// Computes the Pearson chi-square statistic of a contingency table.
//
// The table is stored row-major in dMat (element (i, j) lives at
// dMat[i * dim2 + j]). Each column corresponds to a class and each row to a
// state of the descriptor (or variable). For a 3x3 problem:
//
//                1    2    3   Totals
//           1 | N11  N12  N13    R1
//           2 | N21  N22  N23    R2
//           3 | N31  N32  N33    R3
//      Totals |  C1   C2   C3    N
//
// The chi-square formula is
//      chi = sum_i( (N / Ri) * sum_j( Nij^2 / Cj ) ) - N
//
// Parameters:
//   dMat - pointer to dim1 * dim2 counts (assumed non-negative)
//   dim1 - number of rows
//   dim2 - number of columns
//
// Returns the chi-square value. Rows and columns whose marginal total is zero
// contribute nothing (instead of producing NaN through a 0/0 division), and
// an empty or all-zero table yields 0.
//
// All marginals are accumulated in double so that fractional counts are not
// truncated and narrow integer types (e.g. unsigned short) cannot overflow.
template <class T>
double ChiSquare(T *dMat, long int dim1, long int dim2) {
  if (dim1 <= 0 || dim2 <= 0) {
    return 0.0;
  }

  std::vector<double> rowSums(static_cast<std::vector<double>::size_type>(dim1),
                              0.0);
  std::vector<double> colSums(static_cast<std::vector<double>::size_type>(dim2),
                              0.0);

  // single pass over the table to collect row, column and grand totals
  double total = 0.0;
  for (long int i = 0; i < dim1; ++i) {
    const T *row = dMat + i * dim2;
    for (long int j = 0; j < dim2; ++j) {
      const double val = static_cast<double>(row[j]);
      rowSums[i] += val;
      colSums[j] += val;
    }
    total += rowSums[i];
  }

  double chi = 0.0;
  for (long int i = 0; i < dim1; ++i) {
    if (rowSums[i] == 0.0) {
      // an empty row has only zero cells; skipping it avoids inf * 0
      continue;
    }
    const T *row = dMat + i * dim2;
    double rchi = 0.0;
    for (long int j = 0; j < dim2; ++j) {
      if (colSums[j] == 0.0) {
        // an empty column has only zero cells; skipping it avoids 0 / 0
        continue;
      }
      const double val = static_cast<double>(row[j]);
      rchi += (val * val) / colSums[j];
    }
    chi += (total / rowSums[i]) * rchi;
  }
  chi -= total;

  return chi;
}
66 
// Computes the Shannon entropy, in bits, of the distribution obtained by
// normalizing the dim values in tPtr by their sum:
//      H = -sum_i( p_i * log2(p_i) ),  p_i = tPtr[i] / sum(tPtr)
//
// Parameters:
//   tPtr - pointer to dim counts
//   dim  - number of entries in tPtr
//
// Returns the entropy; 0 when the counts sum to zero (including dim <= 0).
// Entries equal to zero are skipped, i.e. 0 * log(0) is taken to be 0.
//
// The total is accumulated in double rather than in T so that narrow integer
// types (e.g. unsigned short) cannot wrap around while summing.
template <class T>
double InfoEntropy(T *tPtr, long int dim) {
  double nInstances = 0.0;
  for (long int i = 0; i < dim; ++i) {
    nInstances += static_cast<double>(tPtr[i]);
  }

  double accum = 0.0;
  if (nInstances != 0) {
    for (long int i = 0; i < dim; ++i) {
      const double d = static_cast<double>(tPtr[i]) / nInstances;
      if (d != 0) {
        accum += -d * std::log(d);
      }
    }
  }
  // convert from nats to bits
  return accum / std::log(2.0);
}
87 
// Computes the information (entropy) gain of a contingency table.
//
// The table is stored row-major in dMat (element (i, j) lives at
// dMat[i * dim2 + j]). With Ri the total of row i, N the grand total and
// H() the entropy returned by InfoEntropy():
//      gain = H(column totals) - sum_i( (Ri / N) * H(row i) )
//
// Parameters:
//   dMat - pointer to dim1 * dim2 counts
//   dim1 - number of rows
//   dim2 - number of columns
//
// Returns the gain in bits; 0 when the table is empty or sums to zero.
//
// Row totals and the grand total are accumulated in double so fractional
// counts are not truncated to int. The column totals are kept in T because
// they are handed to InfoEntropy() as a T array.
template <class T>
double InfoEntropyGain(T *dMat, long int dim1, long int dim2) {
  if (dim1 <= 0 || dim2 <= 0) {
    return 0.0;
  }

  std::vector<T> overallRes(
      static_cast<typename std::vector<T>::size_type>(dim2),
      static_cast<T>(0));

  double total = 0.0;
  double term2 = 0.0;
  for (long int i = 0; i < dim1; ++i) {
    T *row = dMat + i * dim2;
    double rowTotal = 0.0;
    for (long int j = 0; j < dim2; ++j) {
      rowTotal += static_cast<double>(row[j]);
      overallRes[j] += row[j];
    }
    // entropy of this row, weighted by the number of instances in it
    term2 += rowTotal * InfoEntropy(row, dim2);
    total += rowTotal;
  }

  if (total == 0.0) {
    return 0.0;
  }
  term2 /= total;
  return InfoEntropy(&overallRes[0], dim2) - term2;
}
138 }
139 #endif
Class used to rank bits based on a specified measure of information.
double InfoEntropyGain(T *dMat, long int dim1, long int dim2)
Definition: InfoGainFuncs.h:89
double ChiSquare(T *dMat, long int dim1, long int dim2)
Definition: InfoGainFuncs.h:15
double InfoEntropy(T *tPtr, long int dim)
Definition: InfoGainFuncs.h:68