RDKit
Open-source cheminformatics and machine learning.
InfoBitRanker.h
Go to the documentation of this file.
1 // $Id$
2 //
3 // Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 
11 #include <RDGeneral/export.h>
12 #ifndef _RD_INFORANKER_H_
13 #define _RD_INFORANKER_H_
14 
15 #include <RDGeneral/types.h>
16 #include <DataStructs/BitVects.h>
17 #include <iostream>
18 
19 /*! \brief Class used to rank bits based on a specified measure of infomation
20  *
21  * Basically a primitive mimic of the CombiChem "signal" functionality
22  * To use:
23  * - create an instance of this class
24  * - loop over the fingerprints in the dataset by calling accumulateVotes
25  *method
26  * - call getTopN to get the top n ranked bits
27  *
28  * Sample usage and results from the python wrapper:
29  * Here's a small set of vectors:
30  * >>> for i,bv in enumerate(bvs): print bv.ToBitString(),acts[i]
31  * ...
32  * 0001 0
33  * 0101 0
34  * 0010 1
35  * 1110 1
36  *
37  * Default ranker, using infogain:
38  * >>> ranker = InfoBitRanker(4,2)
39  * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
40  * ...
41  * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
42  *int(bit),'%.3f'%gain,int(n0),int(n1)
43  * ...
44  * 3 1.000 2 0
45  * 2 1.000 0 2
46  * 0 0.311 0 1
47  *
48  * Using the biased infogain:
49  * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASENTROPY)
50  * >>> ranker.SetBiasList((1,))
51  * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
52  * ...
53  * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
54  *int(bit),'%.3f'%gain,int(n0),int(n1)
55  * ...
56  * 2 1.000 0 2
57  * 0 0.311 0 1
58  * 1 0.000 1 1
59  *
60  * A chi squared ranker is also available:
61  * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.CHISQUARE)
62  * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
63  * ...
64  * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
65  *int(bit),'%.3f'%gain,int(n0),int(n1)
66  * ...
67  * 3 4.000 2 0
68  * 2 4.000 0 2
69  * 0 1.333 0 1
70  *
71  * As is a biased chi squared:
72  * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASCHISQUARE)
73  * >>> ranker.SetBiasList((1,))
74  * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
75  * ...
76  * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
77  *int(bit),'%.3f'%gain,int(n0),int(n1)
78  * ...
79  * 2 4.000 0 2
80  * 0 1.333 0 1
81  * 1 0.000 1 1
82  */
83 namespace RDInfoTheory {
84 typedef std::vector<RDKit::USHORT> USHORT_VECT;
85 typedef std::vector<USHORT_VECT> VECT_USHORT_VECT;
86 
88  public:
89  /*! \brief the type of measure for information
90  *
91  */
92  typedef enum {
93  ENTROPY = 1,
94  BIASENTROPY = 2,
95  CHISQUARE = 3,
96  BIASCHISQUARE = 4
97  } InfoType;
98 
99  /*! \brief Constructor
100  *
101  * ARGUMENTS:
102  *
103  * - nBits: the dimension of the bit vectors or the fingerprint length
104  * - nClasses: the number of classes used in the classification problem
105  *(e.g. active,
106  * moderately active, inactive etc.). It is assumed that the
107  *classes are
108  * numbered from 0 to (nClasses - 1)
109  * - infoType: the type of information metric
110  */
111  InfoBitRanker(unsigned int nBits, unsigned int nClasses,
112  InfoType infoType = InfoBitRanker::ENTROPY)
113  : d_dims(nBits), d_classes(nClasses), d_type(infoType) {
114  d_counts.resize(0);
115  for (unsigned int i = 0; i < nClasses; i++) {
116  USHORT_VECT cCount;
117  cCount.resize(d_dims, 0);
118  d_counts.push_back(cCount);
119  }
120  d_clsCount.resize(d_classes, 0);
121  d_nInst = 0;
122  d_top = 0;
123  dp_topBits = 0;
124  d_biasList.resize(0);
125  dp_maskBits = 0;
126  }
127 
129  if (dp_topBits) delete[] dp_topBits;
130  if (dp_maskBits) delete dp_maskBits;
131  }
132 
133  /*! \brief Accumulate the votes for all the bits turned on in a bit vector
134  *
135  * ARGUMENTS:
136  *
137  * - bv : bit vector that supports [] operator
138  * - label : the class label for the bit vector. It is assumed that 0 <=
139  *class < nClasses
140  */
141  void accumulateVotes(const ExplicitBitVect &bv, unsigned int label);
142  void accumulateVotes(const SparseBitVect &bv, unsigned int label);
143 
144  /*! \brief Returns the top n bits ranked by the information metric
145  *
146  * This is actually the function where most of the work of ranking is
147  *happening
148  *
149  * \param num the number of top ranked bits that are required
150  *
151  * \return a pointer to an information array. The client should *not*
152  * delete this
153  */
154  double *getTopN(unsigned int num);
155 
156  /*! \brief return the number of labelled instances(examples) or fingerprints
157  *seen so far
158  *
159  */
160  unsigned int getNumInstances() const { return d_nInst; }
161 
162  /*! \brief return the number of classes
163  *
164  */
165  unsigned int getNumClasses() const { return d_classes; }
166 
167  /*! \brief Set the classes to which the entropy calculation should be biased
168  *
169  * This list contains a set of class ids used when in the BIASENTROPY mode of
170  *ranking bits.
171  * In this mode, a bit must be correllated higher with one of the biased
172  *classes than all the
173  * other classes. For example, in a two class problem with actives and
174  *inactives, the fraction of
175  * actives that hit the bit has to be greater than the fraction of inactives
176  *that hit the bit
177  *
178  * ARGUMENTS:
179  * classList - list of class ids that we want a bias towards
180  */
181  void setBiasList(RDKit::INT_VECT &classList);
182 
183  /*! \brief Set the bits to be used as a mask
184  *
185  * If this function is called, only the bits which are present in the
186  * maskBits list will be used.
187  *
188  * ARGUMENTS:
189  * maskBits - the bits to be considered
190  */
191  void setMaskBits(RDKit::INT_VECT &maskBits);
192 
193  /*! \brief Write the top N bits to a stream
194  *
195  */
196  void writeTopBitsToStream(std::ostream *outStream) const;
197 
198  /*! \brief Write the top bits to a file
199  *
200  */
201  void writeTopBitsToFile(const std::string &fileName) const;
202 
203  private:
204  /*! \brief check if we want to compute the info content for a bit based on the
205  *bias list
206  *
207  * This what happens here:
208  * - the fraction of items in each class that hit a particular bit are
209  *computed
210  * - the maximum of these fractions for classes that are not in the
211  *biasList are computed
212  * - If this maximum is less than the fraction for atleast one of classes
213  *in the biaslist
214  * the bit is considered good
215  * ARGUMENTS:
216  * - resMat : the result matrix, one dimensional matrix of dimension (2*(num
217  *of classes))
218  * a 2D structure is assumed with the first row containing number
219  *of items of each class
220  * with the bit set and the second row to entires of each class
221  *with the bit turned off
222  */
223  bool BiasCheckBit(RDKit::USHORT *resMat) const;
224 
225  /*! \brief Compute the biased info entropy gain based on the bias list
226  *
227  * This what happens here:
228  * - we call BiasCheckBit to see if the bit qualifies to compute the
229  *infocontent
230  * - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0
231  *
232  * ARGUMENTS:
233  * - resMat : the result matrix, one dimensional matrix of dimension (2*(num
234  *of classes))
235  * a 2D structure is assumed with the first row containing number
236  *of items of each class
237  * with the bit set and the second row to entires of each class
238  *with the bit turned off
239  */
240  double BiasInfoEntropyGain(RDKit::USHORT *resMat) const;
241 
242  /*! \brief Compute the biased chi qsure value based on the bias list
243  *
244  * This what happens here:
245  * - we call BiasCheckBit to see if the bit qualifies to compute the
246  *infocontent
247  * - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0
248  *
249  * ARGUMENTS:
250  * - resMat : the result matrix, one dimensional matrix of dimension (2*(num
251  *of classes))
252  * a 2D structure is assumed with the first row containing number
253  *of items of each class
254  * with the bit set and the second row to entires of each class
255  *with the bit turned off
256  */
257  double BiasChiSquareGain(RDKit::USHORT *resMat) const;
258 
259  unsigned int d_dims; // the number of bits in the fingerprints
260  unsigned int d_classes; // the number of classes (active, inactive,
261  // moderately active etc.)
262  InfoType d_type; // the type of information meassure - currently we support
263  // only entropy
264  VECT_USHORT_VECT d_counts; // place holder of counting the number of hits for
265  // each bit for each class
266  USHORT_VECT d_clsCount; // counter for the number of instances of each class
267  double *dp_topBits; // storage for the top ranked bits and the corresponding
268  // statistics
269  unsigned int d_top; // the number of bits that have been ranked
270  unsigned int d_nInst; // total number of instances or fingerprints used
271  // accumulate votes
273  d_biasList; // if we want a bias towards certain classes in ranking bits
274  ExplicitBitVect *dp_maskBits; // allows only certain bits to be considered
275 };
276 }
277 #endif
unsigned short USHORT
Definition: types.h:244
Pulls in all the BitVect classes.
a class for bit vectors that are sparsely occupied.
Definition: SparseBitVect.h:34
#define RDKIT_INFOTHEORY_EXPORT
Definition: export.h:320
Class used to rank bits based on a specified measure of infomation.
std::vector< int > INT_VECT
Definition: types.h:247
std::vector< USHORT_VECT > VECT_USHORT_VECT
Definition: InfoBitRanker.h:85
InfoType
the type of measure for information
Definition: InfoBitRanker.h:92
a class for bit vectors that are densely occupied
InfoBitRanker(unsigned int nBits, unsigned int nClasses, InfoType infoType=InfoBitRanker::ENTROPY)
Constructor.
unsigned int getNumInstances() const
return the number of labelled instances(examples) or fingerprints seen so far
unsigned int getNumClasses() const
return the number of classes
std::vector< RDKit::USHORT > USHORT_VECT
Definition: InfoBitRanker.h:84