RDKit
Open-source cheminformatics and machine learning.
InfoBitRanker.h
Go to the documentation of this file.
1 // $Id$
2 //
3 // Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 
11 #ifndef _RD_INFORANKER_H_
12 #define _RD_INFORANKER_H_
13 
14 #include <RDGeneral/types.h>
15 #include <DataStructs/BitVects.h>
16 #include <iostream>
17 
18 
19 /*! \brief Class used to rank bits based on a specified measure of infomation
20  *
21  * Basically a primitive mimic of the CombiChem "signal" functionality
22  * To use:
23  * - create an instance of this class
24  * - loop over the fingerprints in the dataset by calling accumulateVotes method
25  * - call getTopN to get the top n ranked bits
26  *
27  * Sample usage and results from the python wrapper:
28  * Here's a small set of vectors:
29  * >>> for i,bv in enumerate(bvs): print bv.ToBitString(),acts[i]
30  * ...
31  * 0001 0
32  * 0101 0
33  * 0010 1
34  * 1110 1
35  *
36  * Default ranker, using infogain:
37  * >>> ranker = InfoBitRanker(4,2)
38  * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
39  * ...
40  * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)
41  * ...
42  * 3 1.000 2 0
43  * 2 1.000 0 2
44  * 0 0.311 0 1
45  *
46  * Using the biased infogain:
47  * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASENTROPY)
48  * >>> ranker.SetBiasList((1,))
49  * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
50  * ...
51  * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)
52  * ...
53  * 2 1.000 0 2
54  * 0 0.311 0 1
55  * 1 0.000 1 1
56  *
57  * A chi squared ranker is also available:
58  * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.CHISQUARE)
59  * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
60  * ...
61  * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)
62  * ...
63  * 3 4.000 2 0
64  * 2 4.000 0 2
65  * 0 1.333 0 1
66  *
67  * As is a biased chi squared:
68  * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASCHISQUARE)
69  * >>> ranker.SetBiasList((1,))
70  * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
71  * ...
72  * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)
73  * ...
74  * 2 4.000 0 2
75  * 0 1.333 0 1
76  * 1 0.000 1 1
77  */
78 namespace RDInfoTheory {
79  typedef std::vector<RDKit::USHORT> USHORT_VECT;
80  typedef std::vector<USHORT_VECT> VECT_USHORT_VECT;
81 
82  class InfoBitRanker {
83  public:
84 
85  /*! \brief the type of measure for information
86  *
87  */
88  typedef enum {
93  } InfoType;
94 
95  /*! \brief Constructor
96  *
97  * ARGUMENTS:
98  *
99  * - nBits: the dimension of the bit vectors or the fingerprint length
100  * - nClasses: the number of classes used in the classification problem (e.g. active,
101  * moderately active, inactive etc.). It is assumed that the classes are
102  * numbered from 0 to (nClasses - 1)
103  * - infoType: the type of information metric
104  */
105  InfoBitRanker(unsigned int nBits, unsigned int nClasses, InfoType infoType=InfoBitRanker::ENTROPY) :
106  d_dims(nBits), d_classes(nClasses), d_type(infoType) {
107  d_counts.resize(0);
108  for (unsigned int i = 0; i < nClasses; i++) {
109  USHORT_VECT cCount;
110  cCount.resize(d_dims, 0);
111  d_counts.push_back(cCount);
112  }
113  d_clsCount.resize(d_classes, 0);
114  d_nInst = 0;
115  d_top = 0;
116  dp_topBits=0;
117  d_biasList.resize(0);
118  dp_maskBits=0;
119  }
120 
122  if(dp_topBits)
123  delete [] dp_topBits;
124  if(dp_maskBits)
125  delete dp_maskBits;
126  }
127 
128  /*! \brief Accumulate the votes for all the bits turned on in a bit vector
129  *
130  * ARGUMENTS:
131  *
132  * - bv : bit vector that supports [] operator
133  * - label : the class label for the bit vector. It is assumed that 0 <= class < nClasses
134  */
135  void accumulateVotes(const ExplicitBitVect &bv, unsigned int label);
136  void accumulateVotes(const SparseBitVect &bv, unsigned int label);
137 
138  /*! \brief Returns the top n bits ranked by the information metric
139  *
140  * This is actually the function where most of the work of ranking is happening
141  *
142  * \param num the number of top ranked bits that are required
143  *
144  * \return a pointer to an information array. The client should *not*
145  * delete this
146  */
147  double *getTopN(unsigned int num);
148 
149  /*! \brief return the number of labelled instances(examples) or fingerprints seen so far
150  *
151  */
152  unsigned int getNumInstances() const {
153  return d_nInst;
154  }
155 
156  /*! \brief return the number of classes
157  *
158  */
159  unsigned int getNumClasses() const {
160  return d_classes;
161  }
162 
163  /*! \brief Set the classes to which the entropy calculation should be biased
164  *
165  * This list contains a set of class ids used when in the BIASENTROPY mode of ranking bits.
166  * In this mode, a bit must be correllated higher with one of the biased classes than all the
167  * other classes. For example, in a two class problem with actives and inactives, the fraction of
168  * actives that hit the bit has to be greater than the fraction of inactives that hit the bit
169  *
170  * ARGUMENTS:
171  * classList - list of class ids that we want a bias towards
172  */
173  void setBiasList(RDKit::INT_VECT &classList);
174 
175 
176  /*! \brief Set the bits to be used as a mask
177  *
178  * If this function is called, only the bits which are present in the
179  * maskBits list will be used.
180  *
181  * ARGUMENTS:
182  * maskBits - the bits to be considered
183  */
184  void setMaskBits(RDKit::INT_VECT &maskBits);
185 
186  /*! \brief Write the top N bits to a stream
187  *
188  */
189  void writeTopBitsToStream(std::ostream *outStream) const;
190 
191  /*! \brief Write the top bits to a file
192  *
193  */
194  void writeTopBitsToFile(std::string fileName) const;
195 
196  private:
197  /*! \brief check if we want to compute the info content for a bit based on the bias list
198  *
199  * This what happens here:
200  * - the fraction of items in each class that hit a particular bit are computed
201  * - the maximum of these fractions for classes that are not in the biasList are computed
202  * - If this maximum is less than the fraction for atleast one of classes in the biaslist
203  * the bit is considered good
204  * ARGUMENTS:
205  * - resMat : the result matrix, one dimensional matrix of dimension (2*(num of classes))
206  * a 2D structure is assumed with the first row containing number of items of each class
207  * with the bit set and the second row to entires of each class with the bit turned off
208  */
209  bool BiasCheckBit(RDKit::USHORT *resMat) const;
210 
211  /*! \brief Compute the biased info entropy gain based on the bias list
212  *
213  * This what happens here:
214  * - we call BiasCheckBit to see if the bit qualifies to compute the infocontent
215  * - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0
216  *
217  * ARGUMENTS:
218  * - resMat : the result matrix, one dimensional matrix of dimension (2*(num of classes))
219  * a 2D structure is assumed with the first row containing number of items of each class
220  * with the bit set and the second row to entires of each class with the bit turned off
221  */
222  double BiasInfoEntropyGain(RDKit::USHORT *resMat) const;
223 
224  /*! \brief Compute the biased chi qsure value based on the bias list
225  *
226  * This what happens here:
227  * - we call BiasCheckBit to see if the bit qualifies to compute the infocontent
228  * - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0
229  *
230  * ARGUMENTS:
231  * - resMat : the result matrix, one dimensional matrix of dimension (2*(num of classes))
232  * a 2D structure is assumed with the first row containing number of items of each class
233  * with the bit set and the second row to entires of each class with the bit turned off
234  */
235  double BiasChiSquareGain(RDKit::USHORT *resMat) const;
236 
237  unsigned int d_dims; // the number of bits in the fingerprints
238  unsigned int d_classes; // the number of classes (active, inactive, moderately active etc.)
239  InfoType d_type; // the type of information meassure - currently we support only entropy
240  VECT_USHORT_VECT d_counts; // place holder of counting the number of hits for each bit for each class
241  USHORT_VECT d_clsCount; // counter for the number of instances of each class
242  double *dp_topBits; // storage for the top ranked bits and the corresponding statistics
243  unsigned int d_top; // the number of bits that have been ranked
244  unsigned int d_nInst; // total number of instances or fingerprints used accumulate votes
245  RDKit::INT_VECT d_biasList; // if we want a bias towards certain classes in ranking bits
246  ExplicitBitVect *dp_maskBits; // allows only certain bits to be considered
247 
248  };
249 }
250 #endif
unsigned int getNumClasses() const
return the number of classes
unsigned short USHORT
Definition: types.h:143
Pulls in all the BitVect classes.
void setMaskBits(RDKit::INT_VECT &maskBits)
Set the bits to be used as a mask.
unsigned int getNumInstances() const
return the number of labelled instances(examples) or fingerprints seen so far
void writeTopBitsToStream(std::ostream *outStream) const
Write the top N bits to a stream.
a class for bit vectors that are sparsely occupied.
Definition: SparseBitVect.h:35
Class used to rank bits based on a specified measure of infomation.
void accumulateVotes(const ExplicitBitVect &bv, unsigned int label)
Accumulate the votes for all the bits turned on in a bit vector.
std::vector< int > INT_VECT
Definition: types.h:146
void writeTopBitsToFile(std::string fileName) const
Write the top bits to a file.
std::vector< USHORT_VECT > VECT_USHORT_VECT
Definition: InfoBitRanker.h:80
InfoType
the type of measure for information
Definition: InfoBitRanker.h:88
double * getTopN(unsigned int num)
Returns the top n bits ranked by the information metric.
a class for bit vectors that are densely occupied
InfoBitRanker(unsigned int nBits, unsigned int nClasses, InfoType infoType=InfoBitRanker::ENTROPY)
Constructor.
void setBiasList(RDKit::INT_VECT &classList)
Set the classes to which the entropy calculation should be biased.
std::vector< RDKit::USHORT > USHORT_VECT
Definition: InfoBitRanker.h:79