RDKit
Open-source cheminformatics and machine learning.
Catalog.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2003-2006 Rational Discovery LLC
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 
11 #ifndef __RD_CATALOG_H__
12 #define __RD_CATALOG_H__
13 
14 // Boost graph stuff
15 #include <boost/graph/graph_traits.hpp>
16 #include <boost/graph/adjacency_list.hpp>
17 #include <boost/version.hpp>
18 #if BOOST_VERSION >= 104000
19 #include <boost/property_map/property_map.hpp>
20 #else
21 #include <boost/property_map.hpp>
22 #endif
23 
24 
25 // for some typedefs
26 #include <RDGeneral/types.h>
27 #include <RDGeneral/StreamOps.h>
28 
29 namespace RDCatalog {
30  const int versionMajor=1;
31  const int versionMinor=0;
32  const int versionPatch=0;
33  const int endianId=0xDEADBEEF;
34 
35  //-----------------------------------------------------------------------------
36  //! abstract base class for a catalog object
37  template <class entryType, class paramType>
38  class Catalog {
39  public:
40  //------------------------------------
41  Catalog() : d_fpLength(0), dp_cParams(0) {};
42 
43  //------------------------------------
44  virtual ~Catalog(){
45  delete dp_cParams;
46  }
47 
48  //------------------------------------
49  //! return a serialized form of the Catalog as an std::string
50  virtual std::string Serialize() const = 0;
51 
52  //------------------------------------
53  //! adds an entry to the catalog
54  /*!
55 
56  \param entry the entry to be added
57  \param updateFPLength (optional) if this is true, our internal
58  fingerprint length will also be updated.
59 
60  */
61  virtual unsigned int addEntry(entryType *entry, bool updateFPLength = true) = 0;
62 
63  //------------------------------------
64  //! returns a particular entry in the Catalog
65  virtual const entryType* getEntryWithIdx(unsigned int idx) const = 0;
66 
67  //------------------------------------
68  //! returns the number of entries
69  virtual unsigned int getNumEntries() const = 0;
70 
71  //------------------------------------
72  //! returns the length of our fingerprint
73  unsigned int getFPLength() const {return d_fpLength;}
74 
75  //------------------------------------
76  //! sets our fingerprint length
77  void setFPLength(unsigned int val) {d_fpLength = val;}
78 
79  //------------------------------------
80  //! sets our parameters by copying the \c params argument
81  void setCatalogParams(paramType *params) {
82  PRECONDITION(params,"bad parameter object");
83  //if we already have a paramter object throw an exception
84  PRECONDITION(!dp_cParams,"A parameter object already exists on the catalog" );
85  /*
86  if (dp_cParams) {
87  // we already have parameter object on the catalog
88  // can't overwrite it
89  PRECONDITION(0, "A parameter object already exist on the catalog");
90  }*/
91  dp_cParams = new paramType(*params);
92  }
93 
94  //------------------------------------
95  //! returns a pointer to our parameters
96  const paramType *getCatalogParams() const { return dp_cParams;}
97 
98  protected:
99  // this is the ID that will be assigned to the next entry
100  // added to the catalog - need not be same as the number of entries
101  // in the catalog and does not correspond with the
102  // id of the entry in the catalog.
103  // this is more along the lines of bitId
104  unsigned int d_fpLength; //!< the length of our fingerprint
105  paramType *dp_cParams; //!< our params object
106 
107  };
108 
109 
110 
111  //-----------------------------------------------------------------------------
112  //! A Catalog with a hierarchical structure
113  /*!
114 
115  The entries of a HierarchCatalog are arranged in a directed graph
116 
117  <b>The difference between <i>Indices</i> and <i>Bit Ids</i></b>
118 
119  A HierarchCatalog may contain more entries than the user is actually
120  interested in. For example a HierarchCatalog constructed to contain
121  orders 5 through 8 may well contain information about orders 1-5,
122  in order to facilitate some search optimizations.
123 
124  - <i>Bit Ids</i> refer to the "interesting" bits.
125  So, in the above example, Bit Id \c 0 will be the first entry
126  with order 5.
127  - <i>Indices</i> refer to the underlying structure of the catalog.
128  So, in the above example, the entry with index \c 0 will be
129  the first entry with order 1.
130 
131  */
132  template <class entryType, class paramType, class orderType>
133  class HierarchCatalog : public Catalog <entryType, paramType> {
134  // the entries in the catalog can be traversed using the edges
135  // in a desired order
136  public:
137  //! used by the BGL to set up the node properties in our graph
138  struct vertex_entry_t {
139  enum { num=1003 };
140  typedef boost::vertex_property_tag kind;
141  };
142  typedef boost::property<vertex_entry_t, entryType *> EntryProperty;
143 
144  //! the type of the graph itself:
145  typedef boost::adjacency_list<boost::vecS,
146  boost::vecS, // FIX: should be using setS for edges so that parallel edges are never added (page 225 BGL book)
147  // but that seems result in compile errors
148  boost::bidirectionalS,
149  EntryProperty> CatalogGraph;
150 
151  typedef boost::graph_traits<CatalogGraph> CAT_GRAPH_TRAITS;
152  typedef typename CAT_GRAPH_TRAITS::vertex_iterator VER_ITER;
153  typedef std::pair<VER_ITER, VER_ITER> ENT_ITER_PAIR;
154  typedef typename CAT_GRAPH_TRAITS::adjacency_iterator DOWN_ENT_ITER;
155  typedef std::pair<DOWN_ENT_ITER, DOWN_ENT_ITER> DOWN_ENT_ITER_PAIR;
156 
157  //------------------------------------
159 
160  //------------------------------------
161  //! Construct by making a copy of the input \c params object
163  this->setCatalogParams(params);
164  }
165 
166  //------------------------------------
167  //! Construct from a \c pickle (a serialized form of the HierarchCatalog)
169  this->initFromString(pickle);
170  }
171 
172  //------------------------------------
174  destroy();
175  }
176 
177  //------------------------------------
178  //! serializes this object to a stream
179  void toStream(std::ostream &ss) const {
180  PRECONDITION(this->getCatalogParams(),"NULL parameter object");
181 
182  // the i/o header:
183  RDKit::streamWrite(ss,endianId);
184  RDKit::streamWrite(ss,versionMajor);
185  RDKit::streamWrite(ss,versionMinor);
186  RDKit::streamWrite(ss,versionPatch);
187 
188  // information about the catalog itself:
189  int tmpUInt;
190  tmpUInt = this->getFPLength();
191  RDKit::streamWrite(ss,tmpUInt);
192  tmpUInt = this->getNumEntries();
193  RDKit::streamWrite(ss,tmpUInt);
194 
195  //std::cout << ">>>>-------------------------------" << std::endl;
196  //std::cout << "\tlength: " << getFPLength() << " " << getNumEntries() << std::endl;
197 
198  // add the params object:
199  this->getCatalogParams()->toStream(ss);
200  //std::cout << "\tparams: " << getCatalogParams()->getLowerFragLength();
201  //std::cout << " " << getCatalogParams()->getUpperFragLength();
202  //std::cout << " " << getCatalogParams()->getNumFuncGroups();
203  //std::cout << std::endl;
204 
205  // write the entries in order:
206  for(unsigned int i=0;i<getNumEntries();i++){
207  this->getEntryWithIdx(i)->toStream(ss);
208  }
209 
210  // finally the adjacency list:
211  for(unsigned int i=0;i<getNumEntries();i++){
212  RDKit::INT_VECT children=this->getDownEntryList(i);
213  tmpUInt = children.size();
214  RDKit::streamWrite(ss,tmpUInt);
215  for(RDKit::INT_VECT::const_iterator ivci=children.begin();
216  ivci!=children.end();
217  ivci++){
218  RDKit::streamWrite(ss,*ivci);
219  }
220  }
221  }
222 
223  //------------------------------------
224  //! serializes this object and returns the resulting \c pickle
225  std::string Serialize() const {
226  std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
227  this->toStream(ss);
228  return ss.str();
229  }
230 
231  //------------------------------------
232  //! fills the contents of this object from a stream containing a \c pickle
233  void initFromStream(std::istream &ss) {
234  int tmpInt;
235  // FIX: at the moment we ignore the header info:
236  RDKit::streamRead(ss,tmpInt);
237  RDKit::streamRead(ss,tmpInt);
238  RDKit::streamRead(ss,tmpInt);
239  RDKit::streamRead(ss,tmpInt);
240 
241  unsigned int tmpUInt;
242  RDKit::streamRead(ss,tmpUInt);// fp length
243  this->setFPLength(tmpUInt);
244 
245  unsigned int numEntries;
246  RDKit::streamRead(ss,numEntries);
247  //std::cout << "<<<-------------------------------" << std::endl;
248  //std::cout << "\tlength: " << getFPLength() << " " << numEntries << std::endl;
249 
250 
251  // grab the params:
252  paramType *params = new paramType();
253  params->initFromStream(ss);
254  this->setCatalogParams(params);
255 
256  //std::cout << "\tparams: " << getCatalogParams()->getLowerFragLength();
257  //std::cout << " " << getCatalogParams()->getUpperFragLength();
258  //std::cout << " " << getCatalogParams()->getNumFuncGroups();
259  //std::cout << std::endl;
260 
261  // now all of the entries:
262  for(unsigned int i=0;i<numEntries;i++){
263  entryType *entry = new entryType();
264  entry->initFromStream(ss);
265  this->addEntry(entry,false);
266  }
267 
268  // and, finally, the adjacency list:
269  for(unsigned int i=0;i<numEntries;i++){
270  unsigned int nNeighbors;
271  RDKit::streamRead(ss,nNeighbors);
272  for(unsigned int j=0;j<nNeighbors;j++){
273  RDKit::streamRead(ss,tmpInt);
274  this->addEdge(i,tmpInt);
275  }
276  }
277  }
278 
279  //------------------------------------
280  unsigned int getNumEntries() const {
281  return boost::num_vertices(d_graph);
282  }
283 
284  //------------------------------------
285  //! fills the contents of this object from a string containing a \c pickle
286  void initFromString(const std::string &text){
287  std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
288  // initialize the stream:
289  ss.write(text.c_str(),text.length());
290  // now start reading out values:
291  this->initFromStream(ss);
292  }
293 
294  //------------------------------------
295  //! add a new entry to the catalog
296  /*!
297 
298  \param entry the entry to be added
299  \param updateFPLength (optional) if this is true, our internal
300  fingerprint length will also be updated.
301 
302  */
303  unsigned int addEntry(entryType *entry, bool updateFPLength = true){
304  PRECONDITION(entry,"bad arguments");
305  if (updateFPLength) {
306  unsigned int fpl = this->getFPLength();
307  entry->setBitId(fpl);
308  fpl++;
309  this->setFPLength(fpl);
310  }
311  unsigned int eid = boost::add_vertex(EntryProperty(entry), d_graph);
312  orderType etype = entry->getOrder();
313  // REVIEW: this initialization is not required: the STL map, in
314  // theory, will create a new object when operator[] is called
315  // for a new item
316  if (d_orderMap.find(etype) == d_orderMap.end()) {
317  RDKit::INT_VECT nets;
318  d_orderMap[etype] = nets;
319  }
320  d_orderMap[etype].push_back(eid);
321  return eid;
322  }
323 
324  //------------------------------------
325  //! adds an edge between two entries in the catalog
326  /*!
327  Since we are using a bidirectional graph - the order in
328  which the ids are supplied here makes a difference
329 
330  \param id1 index of the edge's beginning
331  \param id2 index of the edge's end
332 
333  */
334  void addEdge(unsigned int id1, unsigned int id2) {
335  unsigned int nents = getNumEntries();
336  RANGE_CHECK(0, id1, nents-1);
337  RANGE_CHECK(0, id2, nents-1);
338  // FIX: if we boost::setS for the edgeList BGL will
339  // do the checking for duplicity (parallel edges)
340  // But for reasons unknown setS results in compile
341  // errors while using adjacent_vertices.
342  typename CAT_GRAPH_TRAITS::edge_descriptor edge;
343  bool found;
344  boost::tie(edge,found) = boost::edge(boost::vertex(id1,d_graph),
345  boost::vertex(id2,d_graph),
346  d_graph);
347  if (!found) {
348  boost::add_edge(id1, id2, d_graph);
349  }
350  }
351 
352  //------------------------------------
353  //! returns a pointer to our entry with a particular index
354  const entryType *getEntryWithIdx(unsigned int idx) const {
355  RANGE_CHECK(0,idx,getNumEntries()-1);
356  int vd = boost::vertex(idx, d_graph);
357  typename boost::property_map < CatalogGraph, vertex_entry_t>::const_type
358  pMap = boost::get(vertex_entry_t(), d_graph);
359  return pMap[vd];
360  }
361 
362  //------------------------------------
363  //! returns a pointer to our entry with a particular bit ID
364  const entryType *getEntryWithBitId(unsigned int idx) const {
365  RANGE_CHECK(0,idx,this->getFPLength()-1);
366  typename boost::property_map < CatalogGraph, vertex_entry_t>::const_type
367  pMap = boost::get(vertex_entry_t(), d_graph);
368  const entryType *res=NULL;
369  for(unsigned int i=idx;i<this->getNumEntries();i++){
370  const entryType *e=pMap[i];
371  if(e->getBitId()==static_cast<int>(idx)){
372  res=e;
373  break;
374  }
375  }
376  return res;
377  }
378 
379  //------------------------------------
380  //! returns the index of the entry with a particular bit ID
381  int getIdOfEntryWithBitId(unsigned int idx) const {
382  RANGE_CHECK(0,idx,this->getFPLength()-1);
383  typename boost::property_map < CatalogGraph, vertex_entry_t>::const_type
384  pMap = boost::get(vertex_entry_t(), d_graph);
385  int res=-1;
386  for(unsigned int i=idx;i<this->getNumEntries();i++){
387  const entryType *e=pMap[i];
388  if(static_cast<unsigned int>(e->getBitId())==idx){
389  res=i;
390  break;
391  }
392  }
393  return res;
394  }
395 
396  //------------------------------------
397  //! returns a list of the indices of entries below the one passed in
398  RDKit::INT_VECT getDownEntryList(unsigned int idx) const {
399  RDKit::INT_VECT res;
400  DOWN_ENT_ITER nbrIdx, endIdx;
401  boost::tie(nbrIdx, endIdx) = boost::adjacent_vertices(idx, d_graph);
402  while (nbrIdx != endIdx) {
403  res.push_back(*nbrIdx);
404  nbrIdx++;
405  }
406  //std::cout << res.size() << "\n";
407  return res;
408  }
409 
410  //------------------------------------
411  //! returns a list of the indices that have a particular order
412  const RDKit::INT_VECT &getEntriesOfOrder(orderType ord) {
413  return d_orderMap[ord];
414  }
415 
416  //------------------------------------
417  //! returns a list of the indices that have a particular order
418  /*!
419  \overload
420  */
421  const RDKit::INT_VECT &getEntriesOfOrder(orderType ord) const {
422  typename std::map<orderType, RDKit::INT_VECT>::const_iterator elem;
423  elem = d_orderMap.find(ord);
424  CHECK_INVARIANT(elem!=d_orderMap.end()," catalog does not contain any entries of the order specified");
425  return elem->second;
426  }
427 
428 
429  private:
430  // graphs that store the entries in the catalog in a hierachical manner
431  CatalogGraph d_graph;
432  // a map that maps the order type of entries in the catalog to
433  // a vector of vertex indices in the graphs above
434  // e.g. for a catalog with molecular fragments, the order of a fragment can
435  // simply be the number of bond in it. The list this oder maps to is all the
436  // vertex ids of these fragment in the catalog that have this many bonds in them
437  std::map<orderType, RDKit::INT_VECT> d_orderMap;
438 
439  //------------------------------------
440  //! clear any memory that we've used
441  void destroy() {
442  ENT_ITER_PAIR entItP = boost::vertices(d_graph);
443  typename boost::property_map < CatalogGraph, vertex_entry_t>::type
444  pMap = boost::get(vertex_entry_t(), d_graph);
445  while (entItP.first != entItP.second) {
446  delete pMap[*(entItP.first++)];
447  }
448  }
449 
450 
451 
452  };
453 
454 
455  //-----------------------------------------------------------------------------
456  //! a linear Catalog (analogous to an std::vector)
457  /*!
458  Here there is no particular hierarchy, simply a
459  collection of entries.
460  */
461  template <class entryType, class orderType>
462  class LinearCatalog : public Catalog <entryType, orderType> {
463  // here there is no particular hierarchy of entries
464  // we simply model it as a vector of entries
465  // FIX: for retrieval purposes a better model map be std::map
466 
467  public:
468  std::string Serialize();
469 
470  unsigned int addEntry(entryType *entry, bool updateFPLength = true);
471 
472  const entryType *getEntryWithIdx(unsigned int idx) const;
473 
474  private:
475  std::vector<entryType*> d_vector;
476  };
477 }
478 
479 #endif
virtual const entryType * getEntryWithIdx(unsigned int idx) const =0
returns a particular entry in the Catalog
const int versionMinor
Definition: Catalog.h:31
void initFromString(const std::string &text)
fills the contents of this object from a string containing a pickle
Definition: Catalog.h:286
CAT_GRAPH_TRAITS::adjacency_iterator DOWN_ENT_ITER
Definition: Catalog.h:154
used by the BGL to set up the node properties in our graph
Definition: Catalog.h:138
#define RANGE_CHECK(lo, x, hi)
Definition: Invariant.h:133
unsigned int getFPLength() const
returns the length of our fingerprint
Definition: Catalog.h:73
int getIdOfEntryWithBitId(unsigned int idx) const
returns the index of the entry with a particular bit ID
Definition: Catalog.h:381
virtual std::string Serialize() const =0
return a serialized form of the Catalog as an std::string
#define CHECK_INVARIANT(expr, mess)
Definition: Invariant.h:114
void setCatalogParams(paramType *params)
sets our parameters by copying the params argument
Definition: Catalog.h:81
boost::vertex_property_tag kind
Definition: Catalog.h:140
virtual unsigned int addEntry(entryType *entry, bool updateFPLength=true)=0
adds an entry to the catalog
void streamRead(std::istream &ss, T &loc)
does a binary read of an object from a stream
Definition: StreamOps.h:241
boost::graph_traits< CatalogGraph > CAT_GRAPH_TRAITS
Definition: Catalog.h:151
unsigned int getNumEntries() const
returns the number of entries
Definition: Catalog.h:280
const int endianId
Definition: Catalog.h:33
virtual unsigned int getNumEntries() const =0
returns the number of entries
std::pair< DOWN_ENT_ITER, DOWN_ENT_ITER > DOWN_ENT_ITER_PAIR
Definition: Catalog.h:155
a linear Catalog (analogous to an std::vector)
Definition: Catalog.h:462
A Catalog with a hierarchical structure.
Definition: Catalog.h:133
std::pair< VER_ITER, VER_ITER > ENT_ITER_PAIR
Definition: Catalog.h:153
void initFromStream(std::istream &ss)
fills the contents of this object from a stream containing a pickle
Definition: Catalog.h:233
const entryType * getEntryWithIdx(unsigned int idx) const
returns a pointer to our entry with a particular index
Definition: Catalog.h:354
abstract base class for a catalog object
Definition: Catalog.h:38
void toStream(std::ostream &ss) const
serializes this object to a stream
Definition: Catalog.h:179
const paramType * getCatalogParams() const
returns a pointer to our parameters
Definition: Catalog.h:96
std::string Serialize() const
serializes this object and returns the resulting pickle
Definition: Catalog.h:225
std::vector< int > INT_VECT
Definition: types.h:146
boost::adjacency_list< boost::vecS, boost::vecS, boost::bidirectionalS, EntryProperty > CatalogGraph
the type of the graph itself:
Definition: Catalog.h:149
CAT_GRAPH_TRAITS::vertex_iterator VER_ITER
Definition: Catalog.h:152
virtual ~Catalog()
Definition: Catalog.h:44
const int versionPatch
Definition: Catalog.h:32
boost::property< vertex_entry_t, entryType * > EntryProperty
Definition: Catalog.h:142
const entryType * getEntryWithBitId(unsigned int idx) const
returns a pointer to our entry with a particular bit ID
Definition: Catalog.h:364
const int versionMajor
Definition: Catalog.h:30
void streamWrite(std::ostream &ss, const T &val)
does a binary write of an object to a stream
Definition: StreamOps.h:235
#define PRECONDITION(expr, mess)
Definition: Invariant.h:119
unsigned int d_fpLength
the length of our fingerprint
Definition: Catalog.h:104
paramType * dp_cParams
our params object
Definition: Catalog.h:105
const RDKit::INT_VECT & getEntriesOfOrder(orderType ord)
returns a list of the indices that have a particular order
Definition: Catalog.h:412
std::ostream & toStream(std::ostream &)
void addEdge(unsigned int id1, unsigned int id2)
adds an edge between two entries in the catalog
Definition: Catalog.h:334
const RDKit::INT_VECT & getEntriesOfOrder(orderType ord) const
returns a list of the indices that have a particular order
Definition: Catalog.h:421
unsigned int addEntry(entryType *entry, bool updateFPLength=true)
add a new entry to the catalog
Definition: Catalog.h:303
void setFPLength(unsigned int val)
sets our fingerprint length
Definition: Catalog.h:77
RDKit::INT_VECT getDownEntryList(unsigned int idx) const
returns a list of the indices of entries below the one passed in
Definition: Catalog.h:398