RDKit
Open-source cheminformatics and machine learning.
MaxMinPicker.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #ifndef __RD_MAXMINPICKER_H__
11 #define __RD_MAXMINPICKER_H__
12 
13 #include <RDGeneral/types.h>
14 #include <RDGeneral/utils.h>
15 #include <RDGeneral/Invariant.h>
16 #include <RDGeneral/RDLog.h>
17 #include <RDGeneral/Exceptions.h>
18 #include <cstdlib>
19 #include "DistPicker.h"
20 #include <boost/random.hpp>
21 
22 namespace RDPickers {
23 
24  namespace {
25  class distmatFunctor{
26  public:
27  distmatFunctor(const double *distMat) : dp_distMat(distMat) {};
28  double operator()(unsigned int i,unsigned int j) {
29  return getDistFromLTM(this->dp_distMat,i,j);
30  }
31  private:
32  const double *dp_distMat;
33  };
34  }
35 
36  /*! \brief Implements the MaxMin algorithm for picking a subset of item from a pool
37  *
38  * This class inherits from the DistPicker and implements a specific picking strategy
39  * aimed at diversity. See documentation for "pick()" member function for the algorithm details
40  */
41  class MaxMinPicker : public DistPicker {
42  public:
43  /*! \brief Default Constructor
44  *
45  */
47 
48  /*! \brief Contains the implementation for a lazy MaxMin diversity picker
49  *
50  * See the documentation for the pick() method for details about the algorithm
51  *
52  * \param func - a function (or functor) taking two unsigned ints as arguments
53  * and returning the distance (as a double) between those two elements.
54  * \param poolSize - the size of the pool to pick the items from. It is assumed that the
55  * distance matrix above contains the right number of elements; i.e.
56  * poolSize*(poolSize-1)
57  * \param pickSize - the number items to pick from pool (<= poolSize)
58  * \param firstPicks - (optional)the first items in the pick list
59  * \param seed - (optional) seed for the random number generator
60  */
61  template <typename T>
62  RDKit::INT_VECT lazyPick(T &func,
63  unsigned int poolSize, unsigned int pickSize,
64  RDKit::INT_VECT firstPicks=RDKit::INT_VECT(),
65  int seed=-1) const;
66 
67  /*! \brief Contains the implementation for the MaxMin diversity picker
68  *
69  * Here is how the picking algorithm works, refer to
70  * Ashton, M. et. al., Quant. Struct.-Act. Relat., 21 (2002), 598-604
71  * for more detail:
72  *
73  * A subset of k items is to be selected from a pool containing N molecules.
74  * Then the MaxMin method is as follows:
75  * -# Initialise Subset with some appropriately chosen seed
76  * compound and set x = 1.
77  * -# For each of the N-x remaining compounds in Dataset
78  * calculate its dissimilarity with each of the x compounds in
79  * Subset and retain the smallest of these x dissimilarities for
80  * each compound in Dataset.
81  * -# Select the molecule from Dataset with the largest value
82  * for the smallest dissimilarity calculated in Step 2 and
83  * transfer it to Subset.
84  * -# Set x = x + 1 and return to Step 2 if x < k.
85  *
86  *
87  *
88  * \param distMat - distance matrix - a vector of double. It is assumed that only the
89  * lower triangle element of the matrix are supplied in a 1D array\n
90  * \param poolSize - the size of the pool to pick the items from. It is assumed that the
91  * distance matrix above contains the right number of elements; i.e.
92  * poolSize*(poolSize-1) \n
93  * \param pickSize - the number items to pick from pool (<= poolSize)
94  * \param firstPicks - indices of the items used to seed the pick set.
95  * \param seed - (optional) seed for the random number generator
96  */
97  RDKit::INT_VECT pick(const double *distMat,
98  unsigned int poolSize, unsigned int pickSize,
99  RDKit::INT_VECT firstPicks,
100  int seed=-1) const {
101  CHECK_INVARIANT(distMat, "Invalid Distance Matrix");
102  if(poolSize<pickSize)
103  throw ValueErrorException("pickSize cannot be larger than the poolSize");
104  distmatFunctor functor(distMat);
105  return this->lazyPick(functor,poolSize,pickSize,firstPicks,seed);
106  }
107 
108  /*! \overload */
109  RDKit::INT_VECT pick(const double *distMat,
110  unsigned int poolSize, unsigned int pickSize) const {
111  RDKit::INT_VECT iv;
112  return pick(distMat,poolSize,pickSize,iv);
113  }
114 
115 
116 
117  };
118  // we implement this here in order to allow arbitrary functors without link errors
119  template <typename T>
121  unsigned int poolSize, unsigned int pickSize,
122  RDKit::INT_VECT firstPicks,
123  int seed) const {
124  if(poolSize<pickSize)
125  throw ValueErrorException("pickSize cannot be larger than the poolSize");
126 
127  RDKit::INT_LIST pool;
128 
129  RDKit::INT_VECT picks;
130  picks.reserve(pickSize);
131  unsigned int pick=0;
132 
133  // enter the pool into a list so that we can pick out of it easily
134  for (unsigned int i = 0; i < poolSize; i++) {
135  pool.push_back(i);
136  }
137 
138 
139  // get a seeded random number generator:
140  typedef boost::mt19937 rng_type;
141  typedef boost::uniform_int<> distrib_type;
142  typedef boost::variate_generator<rng_type &,distrib_type> source_type;
143  rng_type generator(42u);
144  distrib_type dist(0,poolSize);
145  source_type randomSource(generator,dist);
146  if(seed>0) generator.seed(static_cast<rng_type::result_type>(seed));
147 
148  // pick the first entry
149  if(!firstPicks.size()){
150  pick = randomSource();
151  // add the pick to the picks
152  picks.push_back(pick);
153  // and remove it from the pool
154  pool.remove(pick);
155  } else{
156  for(RDKit::INT_VECT::const_iterator pIdx=firstPicks.begin();
157  pIdx!=firstPicks.end();++pIdx){
158  pick = static_cast<unsigned int>(*pIdx);
159  if(pick>=poolSize)
160  throw ValueErrorException("pick index was larger than the poolSize");
161  picks.push_back(pick);
162  pool.remove(pick);
163  }
164  }
165  // now pick 1 compound at a time
166  while (picks.size() < pickSize) {
167  double maxOFmin = -1.0;
168  RDKit::INT_LIST_I plri=pool.end();
169  for(RDKit::INT_LIST_I pli=pool.begin();
170  pli!=pool.end(); ++pli){
171  unsigned int poolIdx = (*pli);
172  double minTOi = RDKit::MAX_DOUBLE;
173  for (RDKit::INT_VECT_CI pi = picks.begin();
174  pi != picks.end(); ++pi) {
175  unsigned int pickIdx = (*pi);
176  CHECK_INVARIANT(poolIdx!=pickIdx,"");
177  double dist = func(poolIdx,pickIdx);
178  if (dist <= minTOi) {
179  minTOi = dist;
180  }
181  }
182  if (minTOi > maxOFmin || (RDKit::feq(minTOi,maxOFmin) && poolIdx<pick) ) {
183  maxOFmin = minTOi;
184  pick = poolIdx;
185  plri = pli;
186  }
187  }
188 
189  // now add the new pick to picks and remove it from the pool
190  picks.push_back(pick);
191  CHECK_INVARIANT(plri!=pool.end(),"");
192  pool.erase(plri);
193  }
194  return picks;
195  }
196 
197 
198 };
199 
200 #endif
std::list< int > INT_LIST
Definition: types.h:152
boost::minstd_rand rng_type
Definition: utils.h:32
#define CHECK_INVARIANT(expr, mess)
Definition: Invariant.h:114
const double MAX_DOUBLE
Definition: types.h:136
INT_LIST::iterator INT_LIST_I
Definition: types.h:153
double getDistFromLTM(const double *distMat, unsigned int i, unsigned int j)
function to lookup distance from 1D lower triangular distance matrix
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize, RDKit::INT_VECT firstPicks, int seed=-1) const
Contains the implementation for the MaxMin diversity picker.
Definition: MaxMinPicker.h:97
std::vector< int > INT_VECT
Definition: types.h:146
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize) const
Definition: MaxMinPicker.h:109
Implements the MaxMin algorithm for picking a subset of item from a pool.
Definition: MaxMinPicker.h:41
INT_VECT::const_iterator INT_VECT_CI
Definition: types.h:148
MaxMinPicker()
Default Constructor.
Definition: MaxMinPicker.h:46
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition: Exceptions.h:31
Abstract base class to do perform item picking (typically molecules) using a distance matrix...
Definition: DistPicker.h:39
RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize, unsigned int pickSize, RDKit::INT_VECT firstPicks=RDKit::INT_VECT(), int seed=-1) const
Contains the implementation for a lazy MaxMin diversity picker.
Definition: MaxMinPicker.h:120
bool feq(double v1, double v2, double tol=1e-4)
floating point comparison with a tolerance