RDKit
Open-source cheminformatics and machine learning.
HierarchicalClusterPicker.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2003-2006 Rational Discovery LLC
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #ifndef _HIERARCHCLUSTERPICKER_H
11 #define _HIERARCHCLUSTERPICKER_H
12 
13 #include <RDGeneral/types.h>
14 #include "DistPicker.h"
15 
16 namespace RDPickers {
17 
18  /*! \brief Diversity picker based on hierarchical clustering
19  *
20  * This class inherits from DistPicker since it uses the distance matrix
21  * for diversity picking. The clustering itself is done using the Murtagh
22  * code in $RDBASE/Code/ML/Cluster/Murtagh/
23  */
25  public:
26 
27  /*! \brief The type of hierarchical clustering algorithm to use
28  */
29  typedef enum {
30  WARD=1,
31  SLINK=2,
32  CLINK=3,
33  UPGMA=4,
35  GOWER=6,
37 
38  /*! \brief Constructor - takes a ClusterMethod as an argument
39  *
40  * Sets the hierarchical clustering method; the value of clusterMethod is stored in d_method
41  */
42  explicit HierarchicalClusterPicker(ClusterMethod clusterMethod) : d_method(clusterMethod) {;};
43 
44  /*! \brief This is the function that does the picking
45  *
46  * Here is how the algorithm works \n
47  * FIX: Supply reference
48  *
49  * - The entire pool is clustered from the distance matrix using one of the
50  * hierarchical clustering methods (specified via the constructor). \n
51  * - Starting with the individual items in the pool, clusters are merged based
52  * on the output from the clustering method. \n
53  * - The merging is stopped when the number of clusters is the same as
54  * the number of picks.
55  * - For each item in a cluster the sum of the squares of the distances to the rest
56  * of the items (in the cluster) is computed. The item with the smallest value is
57  * picked as the representative of the cluster. Basically trying to pick the item closest
58  * to the centroid of the cluster.
59  *
60  *
61  * \param distMat - distance matrix - a vector of double. It is assumed that only the
62  * lower triangle elements of the matrix are supplied in a 1D array\n
63  * NOTE: this matrix WILL BE ALTERED during the picking (NOTE(review): the parameter is declared const double * - confirm which is intended)\n
64  * \param poolSize - the size of the pool to pick the items from. It is assumed that the
65  * distance matrix above contains the right number of elements; i.e.
66  * poolSize*(poolSize-1) (NOTE(review): a strict lower triangle holds poolSize*(poolSize-1)/2 values - confirm) \n
67  * \param pickSize - the number of items to pick from the pool (<= poolSize)
68  */
69  RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize) const ;
70 
71  /*! \brief This is the function that does the clustering of the items - used by the picker
72  *
73  * ARGUMENTS:
74  *
75  * \param distMat - distance matrix - a vector of double. It is assumed that only the
76  * lower triangle elements of the matrix are supplied in a 1D array\n
77  * NOTE: this matrix WILL BE ALTERED during the picking (NOTE(review): the parameter is declared const double * - confirm which is intended)\n
78  * \param poolSize - the size of the pool to pick the items from. It is assumed that the
79  * distance matrix above contains the right number of elements; i.e.
80  * poolSize*(poolSize-1) (NOTE(review): a strict lower triangle holds poolSize*(poolSize-1)/2 values - confirm) \n
81  * \param pickSize - the number of clusters to divide the pool into (<= poolSize)
82  */
83  RDKit::VECT_INT_VECT cluster(const double *distMat, unsigned int poolSize, unsigned int pickSize) const;
84 
85  private:
86  ClusterMethod d_method;
87  };
88 };
89 
90 #endif
Diversity picker based on hierarchical clustering.
std::vector< INT_VECT > VECT_INT_VECT
Definition: types.h:160
std::vector< int > INT_VECT
Definition: types.h:146
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize) const
This is the function that does the picking.
Abstract base class to perform item picking (typically molecules) using a distance matrix...
Definition: DistPicker.h:39
RDKit::VECT_INT_VECT cluster(const double *distMat, unsigned int poolSize, unsigned int pickSize) const
This is the function that does the clustering of the items - used by the picker.
ClusterMethod
The type of hierarchical clustering algorithm to use.
HierarchicalClusterPicker(ClusterMethod clusterMethod)
Constructor - takes a ClusterMethod as an argument.