RDKit
Open-source cheminformatics and machine learning.
Catalog.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2003-2006 Rational Discovery LLC
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 
11 #include <RDGeneral/export.h>
12 #ifndef __RD_CATALOG_H__
13 #define __RD_CATALOG_H__
14 
15 // Boost graph stuff
17 #include <boost/graph/graph_traits.hpp>
18 #include <boost/graph/adjacency_list.hpp>
19 #include <boost/version.hpp>
20 #if BOOST_VERSION >= 104000
21 #include <boost/property_map/property_map.hpp>
22 #else
23 #include <boost/property_map.hpp>
24 #endif
26 
27 // for some typedefs
28 #include <RDGeneral/types.h>
29 #include <RDGeneral/StreamOps.h>
30 
31 namespace RDCatalog {
32 const int versionMajor = 1;
33 const int versionMinor = 0;
34 const int versionPatch = 0;
35 const int endianId = 0xDEADBEEF;
36 
37 //-----------------------------------------------------------------------------
38 //! abstract base class for a catalog object
39 template <class entryType, class paramType>
40 class Catalog {
41  public:
42  typedef entryType entryType_t;
43  typedef paramType paramType_t;
44 
45  //------------------------------------
47 
48  //------------------------------------
49  virtual ~Catalog() { delete dp_cParams; }
50 
51  //------------------------------------
52  //! return a serialized form of the Catalog as an std::string
53  virtual std::string Serialize() const = 0;
54 
55  //------------------------------------
56  //! adds an entry to the catalog
57  /*!
58 
59  \param entry the entry to be added
60  \param updateFPLength (optional) if this is true, our internal
61  fingerprint length will also be updated.
62 
63  */
64  virtual unsigned int addEntry(entryType *entry,
65  bool updateFPLength = true) = 0;
66 
67  //------------------------------------
68  //! returns a particular entry in the Catalog
69  virtual const entryType *getEntryWithIdx(unsigned int idx) const = 0;
70 
71  //------------------------------------
72  //! returns the number of entries
73  virtual unsigned int getNumEntries() const = 0;
74 
75  //------------------------------------
76  //! returns the length of our fingerprint
77  unsigned int getFPLength() const { return d_fpLength; }
78 
79  //------------------------------------
80  //! sets our fingerprint length
81  void setFPLength(unsigned int val) { d_fpLength = val; }
82 
83  //------------------------------------
84  //! sets our parameters by copying the \c params argument
85  virtual void setCatalogParams(paramType *params) {
86  PRECONDITION(params, "bad parameter object");
87  // if we already have a paramter object throw an exception
89  "A parameter object already exists on the catalog");
90  /*
91  if (dp_cParams) {
92  // we already have parameter object on the catalog
93  // can't overwrite it
94  PRECONDITION(0, "A parameter object already exist on the catalog");
95  }*/
96  dp_cParams = new paramType(*params);
97  }
98 
99  //------------------------------------
100  //! returns a pointer to our parameters
101  const paramType *getCatalogParams() const { return dp_cParams; }
102 
103  protected:
104  // this is the ID that will be assigned to the next entry
105  // added to the catalog - need not be same as the number of entries
106  // in the catalog and does not correspond with the
107  // id of the entry in the catalog.
108  // this is more along the lines of bitId
109  unsigned int d_fpLength; //!< the length of our fingerprint
110  paramType *dp_cParams; //!< our params object
111 };
112 
113 //-----------------------------------------------------------------------------
114 //! A Catalog with a hierarchical structure
115 /*!
116 
117  The entries of a HierarchCatalog are arranged in a directed graph
118 
119  <b>The difference between <i>Indices</i> and <i>Bit Ids</i></b>
120 
121  A HierarchCatalog may contain more entries than the user is actually
122  interested in. For example a HierarchCatalog constructed to contain
123  orders 5 through 8 may well contain information about orders 1-5,
124  in order to facilitate some search optimizations.
125 
126  - <i>Bit Ids</i> refer to the "interesting" bits.
127  So, in the above example, Bit Id \c 0 will be the first entry
128  with order 5.
129  - <i>Indices</i> refer to the underlying structure of the catalog.
130  So, in the above example, the entry with index \c 0 will be
131  the first entry with order 1.
132 
133 */
134 template <class entryType, class paramType, class orderType>
135 class HierarchCatalog : public Catalog<entryType, paramType> {
136  // the entries in the catalog can be traversed using the edges
137  // in a desired order
138  public:
139  //! used by the BGL to set up the node properties in our graph
140  struct vertex_entry_t {
141  enum { num = 1003 };
142  typedef boost::vertex_property_tag kind;
143  };
144  typedef boost::property<vertex_entry_t, entryType *> EntryProperty;
145 
146  //! the type of the graph itself:
147  typedef boost::adjacency_list<
148  boost::vecS,
149  boost::vecS, // FIX: should be using setS for edges so that parallel
150  // edges are never added (page 225 BGL book)
151  // but that seems result in compile errors
152  boost::bidirectionalS, EntryProperty> CatalogGraph;
153 
154  typedef boost::graph_traits<CatalogGraph> CAT_GRAPH_TRAITS;
155  typedef typename CAT_GRAPH_TRAITS::vertex_iterator VER_ITER;
156  typedef std::pair<VER_ITER, VER_ITER> ENT_ITER_PAIR;
157  typedef typename CAT_GRAPH_TRAITS::adjacency_iterator DOWN_ENT_ITER;
158  typedef std::pair<DOWN_ENT_ITER, DOWN_ENT_ITER> DOWN_ENT_ITER_PAIR;
159 
160  //------------------------------------
162 
163  //------------------------------------
164  //! Construct by making a copy of the input \c params object
167  this->setCatalogParams(params);
168  }
169 
170  //------------------------------------
171  //! Construct from a \c pickle (a serialized form of the HierarchCatalog)
173  this->initFromString(pickle);
174  }
175 
176  //------------------------------------
177  ~HierarchCatalog() { destroy(); }
178 
179  //------------------------------------
180  //! serializes this object to a stream
181  void toStream(std::ostream &ss) const {
182  PRECONDITION(this->getCatalogParams(), "NULL parameter object");
183 
184  // the i/o header:
185  RDKit::streamWrite(ss, endianId);
186  RDKit::streamWrite(ss, versionMajor);
187  RDKit::streamWrite(ss, versionMinor);
188  RDKit::streamWrite(ss, versionPatch);
189 
190  // information about the catalog itself:
191  int tmpUInt;
192  tmpUInt = this->getFPLength();
193  RDKit::streamWrite(ss, tmpUInt);
194  tmpUInt = this->getNumEntries();
195  RDKit::streamWrite(ss, tmpUInt);
196 
197  // std::cout << ">>>>-------------------------------" << std::endl;
198  // std::cout << "\tlength: " << getFPLength() << " " << getNumEntries() <<
199  // std::endl;
200 
201  // add the params object:
202  this->getCatalogParams()->toStream(ss);
203  // std::cout << "\tparams: " << getCatalogParams()->getLowerFragLength();
204  // std::cout << " " << getCatalogParams()->getUpperFragLength();
205  // std::cout << " " << getCatalogParams()->getNumFuncGroups();
206  // std::cout << std::endl;
207 
208  // write the entries in order:
209  for (unsigned int i = 0; i < getNumEntries(); i++) {
210  this->getEntryWithIdx(i)->toStream(ss);
211  }
212 
213  // finally the adjacency list:
214  for (unsigned int i = 0; i < getNumEntries(); i++) {
215  RDKit::INT_VECT children = this->getDownEntryList(i);
216  tmpUInt = static_cast<unsigned int>(children.size());
217  RDKit::streamWrite(ss, tmpUInt);
218  for (RDKit::INT_VECT::const_iterator ivci = children.begin();
219  ivci != children.end(); ivci++) {
220  RDKit::streamWrite(ss, *ivci);
221  }
222  }
223  }
224 
225  //------------------------------------
226  //! serializes this object and returns the resulting \c pickle
227  std::string Serialize() const {
228  std::stringstream ss(std::ios_base::binary | std::ios_base::out |
229  std::ios_base::in);
230  this->toStream(ss);
231  return ss.str();
232  }
233 
234  //------------------------------------
235  //! fills the contents of this object from a stream containing a \c pickle
236  void initFromStream(std::istream &ss) {
237  int tmpInt;
238  // FIX: at the moment we ignore the header info:
239  RDKit::streamRead(ss, tmpInt);
240  RDKit::streamRead(ss, tmpInt);
241  RDKit::streamRead(ss, tmpInt);
242  RDKit::streamRead(ss, tmpInt);
243 
244  unsigned int tmpUInt;
245  RDKit::streamRead(ss, tmpUInt); // fp length
246  this->setFPLength(tmpUInt);
247 
248  unsigned int numEntries;
249  RDKit::streamRead(ss, numEntries);
250  // std::cout << "<<<-------------------------------" << std::endl;
251  // std::cout << "\tlength: " << getFPLength() << " " << numEntries <<
252  // std::endl;
253 
254  // grab the params:
255  paramType *params = new paramType();
256  params->initFromStream(ss);
257  this->setCatalogParams(params);
258 
259  // std::cout << "\tparams: " << getCatalogParams()->getLowerFragLength();
260  // std::cout << " " << getCatalogParams()->getUpperFragLength();
261  // std::cout << " " << getCatalogParams()->getNumFuncGroups();
262  // std::cout << std::endl;
263 
264  // now all of the entries:
265  for (unsigned int i = 0; i < numEntries; i++) {
266  entryType *entry = new entryType();
267  entry->initFromStream(ss);
268  this->addEntry(entry, false);
269  }
270 
271  // and, finally, the adjacency list:
272  for (unsigned int i = 0; i < numEntries; i++) {
273  unsigned int nNeighbors;
274  RDKit::streamRead(ss, nNeighbors);
275  for (unsigned int j = 0; j < nNeighbors; j++) {
276  RDKit::streamRead(ss, tmpInt);
277  this->addEdge(i, tmpInt);
278  }
279  }
280  }
281 
282  //------------------------------------
283  unsigned int getNumEntries() const { return static_cast<unsigned int>(boost::num_vertices(d_graph)); }
284 
285  //------------------------------------
286  //! fills the contents of this object from a string containing a \c pickle
287  void initFromString(const std::string &text) {
288  std::stringstream ss(std::ios_base::binary | std::ios_base::out |
289  std::ios_base::in);
290  // initialize the stream:
291  ss.write(text.c_str(), text.length());
292  // now start reading out values:
293  this->initFromStream(ss);
294  }
295 
296  //------------------------------------
297  //! add a new entry to the catalog
298  /*!
299 
300  \param entry the entry to be added
301  \param updateFPLength (optional) if this is true, our internal
302  fingerprint length will also be updated.
303 
304  */
305  unsigned int addEntry(entryType *entry, bool updateFPLength = true) {
306  PRECONDITION(entry, "bad arguments");
307  if (updateFPLength) {
308  unsigned int fpl = this->getFPLength();
309  entry->setBitId(fpl);
310  fpl++;
311  this->setFPLength(fpl);
312  }
313  unsigned int eid = static_cast<unsigned int>(boost::add_vertex(EntryProperty(entry), d_graph));
314  orderType etype = entry->getOrder();
315  // REVIEW: this initialization is not required: the STL map, in
316  // theory, will create a new object when operator[] is called
317  // for a new item
318  if (d_orderMap.find(etype) == d_orderMap.end()) {
319  RDKit::INT_VECT nets;
320  d_orderMap[etype] = nets;
321  }
322  d_orderMap[etype].push_back(eid);
323  return eid;
324  }
325 
326  //------------------------------------
327  //! adds an edge between two entries in the catalog
328  /*!
329  Since we are using a bidirectional graph - the order in
330  which the ids are supplied here makes a difference
331 
332  \param id1 index of the edge's beginning
333  \param id2 index of the edge's end
334 
335  */
336  void addEdge(unsigned int id1, unsigned int id2) {
337  unsigned int nents = getNumEntries();
338  URANGE_CHECK(id1, nents);
339  URANGE_CHECK(id2, nents);
340  // FIX: if we boost::setS for the edgeList BGL will
341  // do the checking for duplicity (parallel edges)
342  // But for reasons unknown setS results in compile
343  // errors while using adjacent_vertices.
344  typename CAT_GRAPH_TRAITS::edge_descriptor edge;
345  bool found;
346  boost::tie(edge, found) = boost::edge(boost::vertex(id1, d_graph),
347  boost::vertex(id2, d_graph), d_graph);
348  if (!found) {
349  boost::add_edge(id1, id2, d_graph);
350  }
351  }
352 
353  //------------------------------------
354  //! returns a pointer to our entry with a particular index
355  const entryType *getEntryWithIdx(unsigned int idx) const {
356  URANGE_CHECK(idx, getNumEntries());
357  int vd = static_cast<int>(boost::vertex(idx, d_graph));
358  typename boost::property_map<CatalogGraph, vertex_entry_t>::const_type
359  pMap = boost::get(vertex_entry_t(), d_graph);
360  return pMap[vd];
361  }
362 
363  //------------------------------------
364  //! returns a pointer to our entry with a particular bit ID
365  const entryType *getEntryWithBitId(unsigned int idx) const {
366  URANGE_CHECK(idx, this->getFPLength());
367  typename boost::property_map<CatalogGraph, vertex_entry_t>::const_type
368  pMap = boost::get(vertex_entry_t(), d_graph);
369  const entryType *res = NULL;
370  for (unsigned int i = idx; i < this->getNumEntries(); i++) {
371  const entryType *e = pMap[i];
372  if (e->getBitId() == static_cast<int>(idx)) {
373  res = e;
374  break;
375  }
376  }
377  return res;
378  }
379 
380  //------------------------------------
381  //! returns the index of the entry with a particular bit ID
382  int getIdOfEntryWithBitId(unsigned int idx) const {
383  URANGE_CHECK(idx, this->getFPLength());
384  typename boost::property_map<CatalogGraph, vertex_entry_t>::const_type
385  pMap = boost::get(vertex_entry_t(), d_graph);
386  int res = -1;
387  for (unsigned int i = idx; i < this->getNumEntries(); i++) {
388  const entryType *e = pMap[i];
389  if (static_cast<unsigned int>(e->getBitId()) == idx) {
390  res = i;
391  break;
392  }
393  }
394  return res;
395  }
396 
397  //------------------------------------
398  //! returns a list of the indices of entries below the one passed in
399  RDKit::INT_VECT getDownEntryList(unsigned int idx) const {
400  RDKit::INT_VECT res;
401  DOWN_ENT_ITER nbrIdx, endIdx;
402  boost::tie(nbrIdx, endIdx) = boost::adjacent_vertices(idx, d_graph);
403  while (nbrIdx != endIdx) {
404  res.push_back(static_cast<int>(*nbrIdx));
405  nbrIdx++;
406  }
407  // std::cout << res.size() << "\n";
408  return res;
409  }
410 
411  //------------------------------------
412  //! returns a list of the indices that have a particular order
413  const RDKit::INT_VECT &getEntriesOfOrder(orderType ord) {
414  return d_orderMap[ord];
415  }
416 
417  //------------------------------------
418  //! returns a list of the indices that have a particular order
419  /*!
420  \overload
421  */
422  const RDKit::INT_VECT &getEntriesOfOrder(orderType ord) const {
423  typename std::map<orderType, RDKit::INT_VECT>::const_iterator elem;
424  elem = d_orderMap.find(ord);
426  elem != d_orderMap.end(),
427  " catalog does not contain any entries of the order specified");
428  return elem->second;
429  }
430 
431  private:
432  // graphs that store the entries in the catalog in a hierachical manner
433  CatalogGraph d_graph;
434  // a map that maps the order type of entries in the catalog to
435  // a vector of vertex indices in the graphs above
436  // e.g. for a catalog with molecular fragments, the order of a fragment can
437  // simply be the number of bond in it. The list this oder maps to is all the
438  // vertex ids of these fragment in the catalog that have this many bonds in
439  // them
440  std::map<orderType, RDKit::INT_VECT> d_orderMap;
441 
442  //------------------------------------
443  //! clear any memory that we've used
444  void destroy() {
445  ENT_ITER_PAIR entItP = boost::vertices(d_graph);
446  typename boost::property_map<CatalogGraph, vertex_entry_t>::type pMap =
447  boost::get(vertex_entry_t(), d_graph);
448  while (entItP.first != entItP.second) {
449  delete pMap[*(entItP.first++)];
450  }
451  }
452 };
453 
454 //-----------------------------------------------------------------------------
455 //! a linear Catalog (analogous to an std::vector)
456 /*!
457  Here there is no particular hierarchy, simply a
458  collection of entries.
459 */
460 template <class entryType, class orderType>
461 class LinearCatalog : public Catalog<entryType, orderType> {
462  // here there is no particular hierarchy of entries
463  // we simply model it as a vector of entries
464  // FIX: for retrieval purposes a better model map be std::map
465 
466  public:
467  std::string Serialize();
468 
469  unsigned int addEntry(entryType *entry, bool updateFPLength = true);
470 
471  const entryType *getEntryWithIdx(unsigned int idx) const;
472 
473  private:
474  std::vector<entryType *> d_vector;
475 };
476 }
477 
478 #endif
virtual const entryType * getEntryWithIdx(unsigned int idx) const =0
returns a particular entry in the Catalog
const int versionMinor
Definition: Catalog.h:33
void initFromString(const std::string &text)
fills the contents of this object from a string containing a pickle
Definition: Catalog.h:287
CAT_GRAPH_TRAITS::adjacency_iterator DOWN_ENT_ITER
Definition: Catalog.h:157
used by the BGL to set up the node properties in our graph
Definition: Catalog.h:140
paramType paramType_t
Definition: Catalog.h:43
int getIdOfEntryWithBitId(unsigned int idx) const
returns the index of the entry with a particular bit ID
Definition: Catalog.h:382
virtual std::string Serialize() const =0
return a serialized form of the Catalog as an std::string
#define CHECK_INVARIANT(expr, mess)
Definition: Invariant.h:100
unsigned int getNumEntries() const
returns the number of entries
Definition: Catalog.h:283
boost::vertex_property_tag kind
Definition: Catalog.h:142
virtual void setCatalogParams(paramType *params)
sets our parameters by copying the params argument
Definition: Catalog.h:85
virtual unsigned int addEntry(entryType *entry, bool updateFPLength=true)=0
adds an entry to the catalog
void streamRead(std::istream &ss, T &loc)
does a binary read of an object from a stream
Definition: StreamOps.h:245
boost::graph_traits< CatalogGraph > CAT_GRAPH_TRAITS
Definition: Catalog.h:154
RDKIT_CHEMREACTIONS_EXPORT void pickle(const boost::shared_ptr< EnumerationStrategyBase > &enumerator, std::ostream &ss)
pickles a EnumerationStrategy and adds the results to a stream ss
const int endianId
Definition: Catalog.h:35
virtual unsigned int getNumEntries() const =0
returns the number of entries
std::pair< DOWN_ENT_ITER, DOWN_ENT_ITER > DOWN_ENT_ITER_PAIR
Definition: Catalog.h:158
a linear Catalog (analogous to an std::vector)
Definition: Catalog.h:461
A Catalog with a hierarchical structure.
Definition: Catalog.h:135
std::pair< VER_ITER, VER_ITER > ENT_ITER_PAIR
Definition: Catalog.h:156
void initFromStream(std::istream &ss)
fills the contents of this object from a stream containing a pickle
Definition: Catalog.h:236
const RDKit::INT_VECT & getEntriesOfOrder(orderType ord) const
returns a list of the indices that have a particular order
Definition: Catalog.h:422
abstract base class for a catalog object
Definition: Catalog.h:40
std::string Serialize() const
serializes this object and returns the resulting pickle
Definition: Catalog.h:227
const paramType * getCatalogParams() const
returns a pointer to our parameters
Definition: Catalog.h:101
std::vector< int > INT_VECT
Definition: types.h:247
CAT_GRAPH_TRAITS::vertex_iterator VER_ITER
Definition: Catalog.h:155
RDKit::INT_VECT getDownEntryList(unsigned int idx) const
returns a list of the indices of entries below the one passed in
Definition: Catalog.h:399
virtual ~Catalog()
Definition: Catalog.h:49
const int versionPatch
Definition: Catalog.h:34
boost::property< vertex_entry_t, entryType * > EntryProperty
Definition: Catalog.h:144
const entryType * getEntryWithIdx(unsigned int idx) const
returns a pointer to our entry with a particular index
Definition: Catalog.h:355
#define URANGE_CHECK(x, hi)
Definition: Invariant.h:141
boost::adjacency_list< boost::vecS, boost::vecS, boost::bidirectionalS, EntryProperty > CatalogGraph
the type of the graph itself:
Definition: Catalog.h:152
const int versionMajor
Definition: Catalog.h:32
void streamWrite(std::ostream &ss, const T &val)
does a binary write of an object to a stream
Definition: StreamOps.h:225
#define PRECONDITION(expr, mess)
Definition: Invariant.h:108
RDKIT_RDGENERAL_EXPORT std::ostream & toStream(std::ostream &)
unsigned int d_fpLength
the length of our fingerprint
Definition: Catalog.h:109
const entryType * getEntryWithBitId(unsigned int idx) const
returns a pointer to our entry with a particular bit ID
Definition: Catalog.h:365
paramType * dp_cParams
our params object
Definition: Catalog.h:110
const RDKit::INT_VECT & getEntriesOfOrder(orderType ord)
returns a list of the indices that have a particular order
Definition: Catalog.h:413
void addEdge(unsigned int id1, unsigned int id2)
adds an edge between two entries in the catalog
Definition: Catalog.h:336
unsigned int getFPLength() const
returns the length of our fingerprint
Definition: Catalog.h:77
unsigned int addEntry(entryType *entry, bool updateFPLength=true)
add a new entry to the catalog
Definition: Catalog.h:305
void toStream(std::ostream &ss) const
serializes this object to a stream
Definition: Catalog.h:181
void setFPLength(unsigned int val)
sets our fingerprint length
Definition: Catalog.h:81
entryType entryType_t
Definition: Catalog.h:42