RDKit
Open-source cheminformatics and machine learning.
MaxMinPicker.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
3 // Copyright (C) 2017 Greg Landrum and NextMove Software
4 //
5 // @@ All Rights Reserved @@
6 // This file is part of the RDKit.
7 // The contents are covered by the terms of the BSD license
8 // which is included in the file license.txt, found at the root
9 // of the RDKit source tree.
10 //
11 #include <RDGeneral/export.h>
12 #ifndef RD_MAXMINPICKER_H
13 #define RD_MAXMINPICKER_H
14 
15 #include <RDGeneral/types.h>
16 #include <RDGeneral/utils.h>
17 #include <RDGeneral/Invariant.h>
18 #include <RDGeneral/RDLog.h>
19 #include <RDGeneral/Exceptions.h>
20 #include <cstdlib>
21 #include "DistPicker.h"
22 #include <boost/random.hpp>
23 
24 namespace RDPickers {
25 
26 namespace {
27 class RDKIT_SIMDIVPICKERS_EXPORT distmatFunctor {
28  public:
29  distmatFunctor(const double *distMat) : dp_distMat(distMat){};
30  double operator()(unsigned int i, unsigned int j) {
31  return getDistFromLTM(this->dp_distMat, i, j);
32  }
33 
34  private:
35  const double *dp_distMat;
36 };
37 }
38 
39 /*! \brief Implements the MaxMin algorithm for picking a subset of item from a
40  *pool
41  *
42  * This class inherits from the DistPicker and implements a specific picking
43  *strategy
44  * aimed at diversity. See documentation for "pick()" member function for the
45  *algorithm details
46  */
48  public:
49  /*! \brief Default Constructor
50  *
51  */
53 
54  /*! \brief Contains the implementation for a lazy MaxMin diversity picker
55  *
56  * See the documentation for the pick() method for details about the algorithm
57  *
58  * \param func - a function (or functor) taking two unsigned ints as
59  *arguments
60  * and returning the distance (as a double) between those two
61  *elements.
62  * \param poolSize - the size of the pool to pick the items from. It is
63  *assumed that the
64  * distance matrix above contains the right number of elements;
65  *i.e.
66  * poolSize*(poolSize-1)
67  * \param pickSize - the number items to pick from pool (<= poolSize)
68  * \param firstPicks - (optional)the first items in the pick list
69  * \param seed - (optional) seed for the random number generator
70  */
71  template <typename T>
72  RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
73  unsigned int pickSize) const;
74 
75  template <typename T>
76  RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
77  unsigned int pickSize,
78  const RDKit::INT_VECT &firstPicks,
79  int seed = -1) const;
80 
81  template <typename T>
82  RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
83  unsigned int pickSize,
84  const RDKit::INT_VECT &firstPicks, int seed,
85  double &threshold) const;
86 
87  /*! \brief Contains the implementation for the MaxMin diversity picker
88  *
89  * Here is how the picking algorithm works, refer to
90  * Ashton, M. et. al., Quant. Struct.-Act. Relat., 21 (2002), 598-604
91  * for more detail:
92  *
93  * A subset of k items is to be selected from a pool containing N molecules.
94  * Then the MaxMin method is as follows:
95  * -# Initialise Subset with some appropriately chosen seed
96  * compound and set x = 1.
97  * -# For each of the N-x remaining compounds in Dataset
98  * calculate its dissimilarity with each of the x compounds in
99  * Subset and retain the smallest of these x dissimilarities for
100  * each compound in Dataset.
101  * -# Select the molecule from Dataset with the largest value
102  * for the smallest dissimilarity calculated in Step 2 and
103  * transfer it to Subset.
104  * -# Set x = x + 1 and return to Step 2 if x < k.
105  *
106  *
107  *
108  * \param distMat - distance matrix - a vector of double. It is assumed that
109  *only the
110  * lower triangle element of the matrix are supplied in a 1D
111  *array\n
112  * \param poolSize - the size of the pool to pick the items from. It is
113  *assumed that the
114  * distance matrix above contains the right number of elements;
115  *i.e.
116  * poolSize*(poolSize-1) \n
117  * \param pickSize - the number items to pick from pool (<= poolSize)
118  * \param firstPicks - indices of the items used to seed the pick set.
119  * \param seed - (optional) seed for the random number generator
120  */
121  RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
122  unsigned int pickSize, RDKit::INT_VECT firstPicks,
123  int seed = -1) const {
124  CHECK_INVARIANT(distMat, "Invalid Distance Matrix");
125  if (!poolSize) throw ValueErrorException("empty pool to pick from");
126  if (poolSize < pickSize)
127  throw ValueErrorException("pickSize cannot be larger than the poolSize");
128  distmatFunctor functor(distMat);
129  return this->lazyPick(functor, poolSize, pickSize, firstPicks, seed);
130  }
131 
132  /*! \overload */
133  RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
134  unsigned int pickSize) const {
135  RDKit::INT_VECT iv;
136  return pick(distMat, poolSize, pickSize, iv);
137  }
138 };
139 
141  double dist_bound; // distance to closest reference
142  unsigned int picks; // number of references considered
143  unsigned int next; // singly linked list of candidates
144 };
145 
146 // we implement this here in order to allow arbitrary functors without link
147 // errors
148 template <typename T>
149 RDKit::INT_VECT MaxMinPicker::lazyPick(T &func, unsigned int poolSize,
150  unsigned int pickSize,
151  const RDKit::INT_VECT &firstPicks,
152  int seed, double &threshold) const {
153  if (!poolSize) throw ValueErrorException("empty pool to pick from");
154 
155  if (poolSize < pickSize)
156  throw ValueErrorException("pickSize cannot be larger than the poolSize");
157 
158  RDKit::INT_VECT picks;
159 
160  unsigned int memsize = (unsigned int)(poolSize * sizeof(MaxMinPickInfo));
161  MaxMinPickInfo *pinfo = new MaxMinPickInfo[memsize];
162  if (!pinfo) {
163  threshold = -1.0;
164  return picks;
165  }
166  memset(pinfo, 0, memsize);
167 
168  picks.reserve(pickSize);
169  unsigned int picked = 0; // picks.size()
170  unsigned int pick = 0;
171 
172  // pick the first entry
173  if (firstPicks.empty()) {
174  // get a seeded random number generator:
175  typedef boost::mt19937 rng_type;
176  typedef boost::uniform_int<> distrib_type;
177  typedef boost::variate_generator<rng_type &, distrib_type> source_type;
178  rng_type generator(42u);
179  distrib_type dist(0, poolSize - 1);
180  source_type randomSource(generator, dist);
181  if (seed > 0) generator.seed(static_cast<rng_type::result_type>(seed));
182 
183  pick = randomSource();
184  // add the pick to the picks
185  picks.push_back(pick);
186  // and remove it from the pool
187  pinfo[pick].picks = 1;
188  picked = 1;
189 
190  } else {
191  for (RDKit::INT_VECT::const_iterator pIdx = firstPicks.begin();
192  pIdx != firstPicks.end(); ++pIdx) {
193  pick = static_cast<unsigned int>(*pIdx);
194  if (pick >= poolSize) {
195  delete[] pinfo;
196  throw ValueErrorException("pick index was larger than the poolSize");
197  }
198  picks.push_back(pick);
199  pinfo[pick].picks = 1;
200  picked++;
201  }
202  }
203 
204  if (picked >= pickSize) {
205  threshold = -1.0;
206  delete[] pinfo;
207  return picks;
208  }
209 
210  unsigned int pool_list = 0;
211  unsigned int *prev = &pool_list;
212  // enter the pool into a list so that we can pick out of it easily
213  for (unsigned int i = 0; i < poolSize; i++)
214  if (pinfo[i].picks == 0) {
215  *prev = i;
216  prev = &pinfo[i].next;
217  }
218  *prev = 0;
219 
220  unsigned int poolIdx;
221  unsigned int pickIdx;
222 
223  // First pass initialize dist_bound
224  prev = &pool_list;
225  pickIdx = picks[0];
226  do {
227  poolIdx = *prev;
228  pinfo[poolIdx].dist_bound = func(poolIdx, pickIdx);
229  pinfo[poolIdx].picks = 1;
230  prev = &pinfo[poolIdx].next;
231  } while (*prev != 0);
232 
233  // now pick 1 compound at a time
234  double maxOFmin = -1.0;
235  double tmpThreshold = -1.0;
236  while (picked < pickSize) {
237  unsigned int *pick_prev = 0;
238  maxOFmin = -1.0;
239  prev = &pool_list;
240  do {
241  poolIdx = *prev;
242  double minTOi = pinfo[poolIdx].dist_bound;
243  if (minTOi > maxOFmin) {
244  unsigned int pi = pinfo[poolIdx].picks;
245  while (pi < picked) {
246  unsigned int picki = picks[pi];
247  CHECK_INVARIANT(poolIdx != picki, "pool index != pick index");
248  double dist = func(poolIdx, picki);
249  pi++;
250  if (dist <= minTOi) {
251  minTOi = dist;
252  if (minTOi <= maxOFmin) break;
253  }
254  }
255  pinfo[poolIdx].dist_bound = minTOi;
256  pinfo[poolIdx].picks = pi;
257  if (minTOi > maxOFmin) {
258  maxOFmin = minTOi;
259  pick_prev = prev;
260  pick = poolIdx;
261  }
262  }
263  prev = &pinfo[poolIdx].next;
264  } while (*prev != 0);
265 
266  // if the current distance is closer then threshold, we're done
267  if (threshold >= 0.0 && maxOFmin < threshold) break;
268  tmpThreshold = maxOFmin;
269  // now add the new pick to picks and remove it from the pool
270  *pick_prev = pinfo[pick].next;
271  picks.push_back(pick);
272  picked++;
273  }
274 
275  threshold = tmpThreshold;
276  delete[] pinfo;
277  return picks;
278 }
279 
280 template <typename T>
281 RDKit::INT_VECT MaxMinPicker::lazyPick(T &func, unsigned int poolSize,
282  unsigned int pickSize,
283  const RDKit::INT_VECT &firstPicks,
284  int seed) const {
285  double threshold = -1.0;
286  return MaxMinPicker::lazyPick(func, poolSize, pickSize, firstPicks, seed,
287  threshold);
288 }
289 
290 template <typename T>
291 RDKit::INT_VECT MaxMinPicker::lazyPick(T &func, unsigned int poolSize,
292  unsigned int pickSize) const {
293  RDKit::INT_LIST firstPicks;
294  double threshold = -1.0;
295  return MaxMinPicker::lazyPick(func, poolSize, pickSize, firstPicks, -1,
296  threshold);
297 }
298 };
299 
300 #endif
std::list< int > INT_LIST
Definition: types.h:253
boost::minstd_rand rng_type
Definition: utils.h:36
#define CHECK_INVARIANT(expr, mess)
Definition: Invariant.h:100
#define RDKIT_SIMDIVPICKERS_EXPORT
Definition: export.h:580
RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize, unsigned int pickSize) const
Contains the implementation for a lazy MaxMin diversity picker.
Definition: MaxMinPicker.h:291
std::vector< int > INT_VECT
Definition: types.h:247
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize) const
Definition: MaxMinPicker.h:133
Implements the MaxMin algorithm for picking a subset of item from a pool.
Definition: MaxMinPicker.h:47
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize, RDKit::INT_VECT firstPicks, int seed=-1) const
Contains the implementation for the MaxMin diversity picker.
Definition: MaxMinPicker.h:121
MaxMinPicker()
Default Constructor.
Definition: MaxMinPicker.h:52
RDKIT_SIMDIVPICKERS_EXPORT double getDistFromLTM(const double *distMat, unsigned int i, unsigned int j)
function to lookup distance from 1D lower triangular distance matrix
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition: Exceptions.h:33
Abstract base class to do perform item picking (typically molecules) using a distance matrix...
Definition: DistPicker.h:44