RDKit
Open-source cheminformatics and machine learning.
SubstructLibrary.h
Go to the documentation of this file.
1 // Copyright (c) 2017, Novartis Institutes for BioMedical Research Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following
12 // disclaimer in the documentation and/or other materials provided
13 // with the distribution.
14 // * Neither the name of Novartis Institutes for BioMedical Research Inc.
15 // nor the names of its contributors may be used to endorse or promote
16 // products derived from this software without specific prior written
17 // permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 //
31 #include <RDGeneral/export.h>
32 #ifndef RDKIT_SUBSTRUCT_LIBRARY
33 #define RDKIT_SUBSTRUCT_LIBRARY
34 
35 #include <GraphMol/RDKitBase.h>
36 #include <GraphMol/MolPickler.h>
41 #include <DataStructs/BitOps.h>
42 
43 namespace RDKit {
44 
45 //! Base class API for holding molecules to substructure search.
46 /*!
47  This is an API that hides the implementation details used for
48  indexing molecules for substructure searching. It simply
49  provides an API for adding and getting molecules from a set.
50  */
52  public:
53  virtual ~MolHolderBase() {}
54 
55  //! Add a new molecule to the substructure search library
56  //! Returns the molecules index in the library
57  virtual unsigned int addMol(const ROMol &m) = 0;
58 
59  // implementations should throw IndexError on out of range
60  virtual boost::shared_ptr<ROMol> getMol(unsigned int) const = 0;
61 
62  //! Get the current library size
63  virtual unsigned int size() const = 0;
64 };
65 
66 //! Concrete class that holds molecules in memory
67 /*!
68  This is currently one of the faster implementations.
69  However it is very memory intensive.
70 */
72  std::vector<boost::shared_ptr<ROMol>> mols;
73 
74  public:
75  MolHolder() : MolHolderBase(), mols() {}
76 
77  virtual unsigned int addMol(const ROMol &m) {
78  mols.push_back(boost::make_shared<ROMol>(m));
79  return size() - 1;
80  }
81 
82  virtual boost::shared_ptr<ROMol> getMol(unsigned int idx) const {
83  if (idx >= mols.size()) throw IndexErrorException(idx);
84  return mols[idx];
85  }
86 
87  virtual unsigned int size() const {
88  return rdcast<unsigned int>(mols.size());
89  }
90 };
91 
92 //! Concrete class that holds binary cached molecules in memory
93 /*!
94  This implementation uses quite a bit less memory than the
95  non cached implementation. However, due to the reduced speed
96  it should be used in conjunction with a pattern fingerprinter.
97 
98  See RDKit::FPHolder
99 */
101  std::vector<std::string> mols;
102 
103  public:
105 
106  virtual unsigned int addMol(const ROMol &m) {
107  mols.push_back(std::string());
108  MolPickler::pickleMol(m, mols.back());
109  return size() - 1;
110  }
111 
112  //! Adds a pickled binary molecule, no validity checking of the input
113  //! is done.
114  unsigned int addBinary(const std::string &pickle) {
115  mols.push_back(pickle);
116  return size() - 1;
117  }
118 
119  virtual boost::shared_ptr<ROMol> getMol(unsigned int idx) const {
120  if (idx >= mols.size()) throw IndexErrorException(idx);
121  boost::shared_ptr<ROMol> mol(new ROMol);
122  MolPickler::molFromPickle(mols[idx], mol.get());
123  return mol;
124  }
125 
126  virtual unsigned int size() const {
127  return rdcast<unsigned int>(mols.size());
128  }
129 };
130 
131 //! Concrete class that holds smiles strings in memory
132 /*!
133  This implementation uses quite a bit less memory than the
134  cached binary or uncached implementation. However, due to the
135  reduced speed it should be used in conjunction with a pattern
136  fingerprinter.
137 
138  See RDKit::FPHolder
139 */
141  std::vector<std::string> mols;
142 
143  public:
145 
146  virtual unsigned int addMol(const ROMol &m) {
147  bool doIsomericSmiles = true;
148  mols.push_back(MolToSmiles(m, doIsomericSmiles));
149  return size() - 1;
150  }
151 
152  //! Add a smiles to the dataset, no validation is done
153  //! to the inputs.
154  unsigned int addSmiles(const std::string &smiles) {
155  mols.push_back(smiles);
156  return size() - 1;
157  }
158 
159  virtual boost::shared_ptr<ROMol> getMol(unsigned int idx) const {
160  if (idx >= mols.size()) throw IndexErrorException(idx);
161 
162  boost::shared_ptr<ROMol> mol(SmilesToMol(mols[idx]));
163  return mol;
164  }
165 
166  virtual unsigned int size() const {
167  return rdcast<unsigned int>(mols.size());
168  }
169 };
170 
171 //! Concrete class that holds trusted smiles strings in memory
172 /*!
173  A trusted smiles is essentially a smiles string that
174  RDKit has generated. This indicates that fewer
175  sanitization steps are required. See
176  http://rdkit.blogspot.com/2016/09/avoiding-unnecessary-work-and.html
177 
178  This implementation uses quite a bit less memory than the
179  cached binary or uncached implementation. However, due to the
180  reduced speed it should be used in conjunction with a pattern
181  fingerprinter.
182 
183  See RDKit::FPHolder
184 */
186  std::vector<std::string> mols;
187 
188  public:
190 
191  virtual unsigned int addMol(const ROMol &m) {
192  bool doIsomericSmiles = true;
193  mols.push_back(MolToSmiles(m, doIsomericSmiles));
194  return size() - 1;
195  }
196 
197  //! Add a smiles to the dataset, no validation is done
198  //! to the inputs.
199  unsigned int addSmiles(const std::string &smiles) {
200  mols.push_back(smiles);
201  return size() - 1;
202  }
203 
204  virtual boost::shared_ptr<ROMol> getMol(unsigned int idx) const {
205  if (idx >= mols.size()) throw IndexErrorException(idx);
206 
207  RWMol *m = SmilesToMol(mols[idx], 0, false);
208  m->updatePropertyCache();
209  return boost::shared_ptr<ROMol>(m);
210  }
211 
212  virtual unsigned int size() const {
213  return rdcast<unsigned int>(mols.size());
214  }
215 };
216 
217 //! Base API for the fingerprinter used to rule out impossible matches
219  std::vector<ExplicitBitVect *> fps;
220 
221  public:
222  virtual ~FPHolderBase() {
223  for (size_t i = 0; i < fps.size(); ++i) delete fps[i];
224  }
225 
226  //! Adds a molecule to the fingerprinter
227  unsigned int addMol(const ROMol &m) {
228  fps.push_back(makeFingerprint(m));
229  return rdcast<unsigned int>(fps.size() - 1);
230  }
231 
232  //! Adds a raw bit vector to the fingerprinter
233  unsigned int addFingerprint(const ExplicitBitVect &v) {
234  fps.push_back(new ExplicitBitVect(v));
235  return rdcast<unsigned int>(fps.size() - 1);
236  }
237 
238  //! Return false if a substructure search can never match the molecule
239  bool passesFilter(unsigned int idx, const ExplicitBitVect &query) const {
240  if (idx >= fps.size()) throw IndexErrorException(idx);
241 
242  return AllProbeBitsMatch(query, *fps[idx]);
243  }
244 
245  //! Get the bit vector at the specified index (throws IndexError if out of
246  //! range)
247  const ExplicitBitVect &getFingerprint(unsigned int idx) const {
248  if (idx >= fps.size()) throw IndexErrorException(idx);
249  return *fps[idx];
250  }
251 
252  //! make the query vector
253  //! Caller owns the vector!
254  virtual ExplicitBitVect *makeFingerprint(const ROMol &m) const = 0;
255 };
256 
257 //! Uses the pattern fingerprinter to rule out matches
259  public:
260  //! Caller owns the vector!
261  virtual ExplicitBitVect *makeFingerprint(const ROMol &m) const {
262  return PatternFingerprintMol(m, 2048);
263  }
264 };
265 
266 //! Substructure Search a library of molecules
267 /*! This class allows for multithreaded substructure searches of
268  large datasets.
269 
270  The implementations can use fingerprints to speed up searches
271  and have molecules cached as binary forms to reduce memory
272  usage.
273 
274  basic usage:
275  \code
276  SubstructLibrary lib;
277  lib.addMol(mol);
278  std::vector<unsigned int> results = lib.getMatches(query);
279  for(std::vector<unsigned int>::const_iterator matchIndex=results.begin();
280  matchIndex != results.end();
281  ++matchIndex) {
282  boost::shared_ptr<ROMol> match = lib.getMol(*matchIndex);
283  }
284  \endcode
285 
286  Using different mol holders and pattern fingerprints.
287 
288  \code
289  boost::shared_ptr<CachedTrustedSmilesMolHolder> molHolder = \
290  boost::make_shared<CachedTrustedSmilesMolHolder>();
291  boost::shared_ptr<PatternHolder> patternHolder = \
292  boost::make_shared<PatternHolder>();
293 
294  SubstructLibrary lib(molHolder, patternHolder);
295  lib.addMol(mol);
296  \endcode
297 
298  Cached molecule holders create molecules on demand. There are currently
299  three styles of cached molecules.
300 
301  CachedMolHolder: stores molecules in the rdkit binary format.
302  CachedSmilesMolHolder: stores molecules in smiles format.
303 CachedTrustedSmilesMolHolder: stores molecules as trusted (RDKit generated) smiles.
304 
305  The CachedTrustedSmilesMolHolder is made to add molecules from
306  a trusted source. This makes the basic assumption that RDKit was
307  used to sanitize and canonicalize the smiles string. In practice
308  this is considerably faster than using arbitrary smiles strings since
309  certain assumptions can be made.
310 
311  When loading from external data, as opposed to using the "addMol" API,
312  care must be taken to ensure that the pattern fingerprints and smiles
313  are synchronized.
314 
315  Each pattern holder has an API point for making its fingerprint. This
316  is useful to ensure that the pattern stored in the database will be
317  compatible with the patterns made when analyzing queries.
318 
319  \code
320  boost::shared_ptr<CachedTrustedSmilesMolHolder> molHolder = \
321  boost::make_shared<CachedTrustedSmilesMolHolder>();
322  boost::shared_ptr<PatternHolder> patternHolder = \
323  boost::make_shared<PatternHolder>();
324 
325  // the PatternHolder instance is able to make fingerprints.
326  // These, of course, can be read from a file. For demonstration
327  // purposes we construct them here.
328  const std::string trustedSmiles = "c1ccccc1";
329  ROMol *m = SmilesToMol(trustedSmiles);
330  const ExplicitBitVect *bitVector = patternHolder->makeFingerprint(*m);
331 
332  // The trusted smiles and bitVector can be read from any source.
333  // This is the fastest way to load a substruct library.
334  molHolder->addSmiles( trustedSmiles );
335  patternHolder->addFingerprint( *bitVector );
336  SubstructLibrary lib(molHolder, patternHolder);
337  delete m;
338  delete bitVector;
339  \endcode
340 
341 */
343  boost::shared_ptr<MolHolderBase> molholder;
344  boost::shared_ptr<FPHolderBase> fpholder;
345  MolHolderBase *mols; // used for a small optimization
346  FPHolderBase *fps;
347 
348  public:
350  : molholder(new MolHolder),
351  fpholder(),
352  mols(molholder.get()),
353  fps(NULL) {}
354 
355  SubstructLibrary(boost::shared_ptr<MolHolderBase> molecules)
356  : molholder(molecules), fpholder(), mols(molholder.get()), fps(0) {}
357 
358  SubstructLibrary(boost::shared_ptr<MolHolderBase> molecules,
359  boost::shared_ptr<FPHolderBase> fingerprints)
360  : molholder(molecules),
361  fpholder(fingerprints),
362  mols(molholder.get()),
363  fps(fpholder.get()) {}
364 
365  //! Get the underlying molecule holder implementation
367  PRECONDITION(mols, "Molecule holder NULL in SubstructLibrary");
368  return *mols;
369  }
370 
371  const MolHolderBase &getMolecules() const {
372  PRECONDITION(mols, "Molecule holder NULL in SubstructLibrary");
373  return *mols;
374  }
375 
376  //! Get the underlying fingerprint implementation.
377  /*! Throws a value error if no fingerprints have been set */
379  if (!fps)
380  throw ValueErrorException("Substruct Library does not have fingerprints");
381  return *fps;
382  }
383 
384  const FPHolderBase &getFingerprints() const {
385  if (!fps)
386  throw ValueErrorException("Substruct Library does not have fingerprints");
387  return *fps;
388  }
389 
390  //! Add a molecule to the library
391  /*!
392  \param mol Molecule to add
393 
394  returns index for the molecule in the library
395  */
396  unsigned int addMol(const ROMol &mol);
397 
398  //! Get the matching indices for the query
399  /*!
400  \param query Query to match against molecules
401  \param recursionPossible flags whether or not recursive matches are allowed
402  [ default true ]
403  \param useChirality use atomic CIP codes as part of the comparison [
404  default true ]
405  \param useQueryQueryMatches if set, the contents of atom and bond queries [
406  default false ]
407  will be used as part of the matching
408  \param numThreads If -1 use all available processors [default -1]
409  \param maxResults Maximum results to return, -1 means return all [default
410  -1]
411  */
412  std::vector<unsigned int> getMatches(const ROMol &query,
413  bool recursionPossible = true,
414  bool useChirality = true,
415  bool useQueryQueryMatches = false,
416  int numThreads = -1,
417  int maxResults = -1);
418  //! Get the matching indices for the query between the given indices
419  /*!
420  \param query Query to match against molecules
421  \param startIdx Start index of the search
422  \param endIdx Ending idx (non-inclusive) of the search.
423  \param recursionPossible flags whether or not recursive matches are allowed
424  [ default true ]
425  \param useChirality use atomic CIP codes as part of the comparison [
426  default true ]
427  \param useQueryQueryMatches if set, the contents of atom and bond queries [
428  default false ]
429  will be used as part of the matching
430  \param numThreads If -1 use all available processors [default -1]
431  \param maxResults Maximum results to return, -1 means return all [default
432  -1]
433  */
434  std::vector<unsigned int> getMatches(
435  const ROMol &query, unsigned int startIdx, unsigned int endIdx,
436  bool recursionPossible = true, bool useChirality = true,
437  bool useQueryQueryMatches = false, int numThreads = -1,
438  int maxResults = -1);
439 
440  //! Return the number of matches for the query
441  /*!
442  \param query Query to match against molecules
443  \param recursionPossible flags whether or not recursive matches are allowed
444  [ default true ]
445  \param useChirality use atomic CIP codes as part of the comparison [
446  default true ]
447  \param useQueryQueryMatches if set, the contents of atom and bond queries [
448  default false ]
449  will be used as part of the matching
450  \param numThreads If -1 use all available processors [default -1]
451  */
452  unsigned int countMatches(const ROMol &query, bool recursionPossible = true,
453  bool useChirality = true,
454  bool useQueryQueryMatches = false,
455  int numThreads = -1);
456  //! Return the number of matches for the query between the given indices
457  /*!
458  \param query Query to match against molecules
459  \param startIdx Start index of the search
460  \param endIdx Ending idx (non-inclusive) of the search.
461  \param recursionPossible flags whether or not recursive matches are allowed
462  [ default true ]
463  \param useChirality use atomic CIP codes as part of the comparison [
464  default true ]
465  \param useQueryQueryMatches if set, the contents of atom and bond queries [
466  default false ]
467  will be used as part of the matching
468  \param numThreads If -1 use all available processors [default -1]
469  */
470  unsigned int countMatches(const ROMol &query, unsigned int startIdx,
471  unsigned int endIdx, bool recursionPossible = true,
472  bool useChirality = true,
473  bool useQueryQueryMatches = false,
474  int numThreads = -1);
475 
476  //! Returns true if any match exists for the query
477  /*!
478  \param query Query to match against molecules
479  \param recursionPossible flags whether or not recursive matches are allowed
480  [ default true ]
481  \param useChirality use atomic CIP codes as part of the comparison [
482  default true ]
483  \param useQueryQueryMatches if set, the contents of atom and bond queries [
484  default false ]
485  will be used as part of the matching
486  \param numThreads If -1 use all available processors [default -1]
487  */
488  bool hasMatch(const ROMol &query, bool recursionPossible = true,
489  bool useChirality = true, bool useQueryQueryMatches = false,
490  int numThreads = -1);
491  //! Returns true if any match exists for the query between the specified
492  //! indices
493  /*!
494  \param query Query to match against molecules
495  \param startIdx Start index of the search
496  \param endIdx Ending idx (inclusive) of the search.
497  \param recursionPossible flags whether or not recursive matches are allowed
498  [ default true ]
499  \param useChirality use atomic CIP codes as part of the comparison [
500  default true ]
501  \param useQueryQueryMatches if set, the contents of atom and bond queries [
502  default false ]
503  will be used as part of the matching
504  \param numThreads If -1 use all available processors [default -1]
505  */
506  bool hasMatch(const ROMol &query, unsigned int startIdx, unsigned int endIdx,
507  bool recursionPossible = true, bool useChirality = true,
508  bool useQueryQueryMatches = false, int numThreads = -1);
509 
510  //! Returns the molecule at the given index
511  /*!
512  \param idx Index of the molecule in the library
513  */
514  boost::shared_ptr<ROMol> getMol(unsigned int idx) const {
515  // expects implementation to throw IndexError if out of range
516  PRECONDITION(mols, "molholder is null in SubstructLibrary");
517  return mols->getMol(idx);
518  }
519 
520  //! Returns the molecule at the given index
521  /*!
522  \param idx Index of the molecule in the library
523  */
524  boost::shared_ptr<ROMol> operator[](unsigned int idx) {
525  // expects implementation to throw IndexError if out of range
526  PRECONDITION(mols, "molholder is null in SubstructLibrary");
527  return mols->getMol(idx);
528  }
529 
530  //! return the number of molecules in the library
531  unsigned int size() const {
532  PRECONDITION(mols, "molholder is null in SubstructLibrary");
533  return rdcast<unsigned int>(molholder->size());
534  }
535 };
536 }
537 
538 #endif
SubstructLibrary(boost::shared_ptr< MolHolderBase > molecules, boost::shared_ptr< FPHolderBase > fingerprints)
RDKIT_FINGERPRINTS_EXPORT ExplicitBitVect * PatternFingerprintMol(const ROMol &mol, unsigned int fpSize=2048, std::vector< unsigned int > *atomCounts=0, ExplicitBitVect *setOnlyBits=0)
Generates a topological fingerprint for a molecule using a series of pre-defined structural patterns...
boost::shared_ptr< ROMol > getMol(unsigned int idx) const
Returns the molecule at the given index.
const ExplicitBitVect & getFingerprint(unsigned int idx) const
static void pickleMol(const ROMol *mol, std::ostream &ss)
pickles a molecule and sends the results to stream ss
MolHolderBase & getMolHolder()
Get the underlying molecule holder implementation.
RDKIT_SMILESPARSE_EXPORT RWMol * SmilesToMol(const std::string &smi, const SmilesParserParams &params)
RWMol is a molecule class that is intended to be edited.
Definition: RWMol.h:31
boost::shared_ptr< ROMol > operator[](unsigned int idx)
Returns the molecule at the given index.
virtual boost::shared_ptr< ROMol > getMol(unsigned int idx) const
virtual unsigned int size() const
Get the current library size.
virtual boost::shared_ptr< ROMol > getMol(unsigned int idx) const
unsigned int size() const
return the number of molecules in the library
#define RDKIT_SUBSTRUCTLIBRARY_EXPORT
Definition: export.h:632
Concrete class that holds molecules in memory.
RDKIT_SMILESPARSE_EXPORT std::string MolToSmiles(const ROMol &mol, bool doIsomericSmiles=true, bool doKekule=false, int rootedAtAtom=-1, bool canonical=true, bool allBondsExplicit=false, bool allHsExplicit=false, bool doRandom=false)
returns canonical SMILES for a molecule
virtual unsigned int addMol(const ROMol &m)
Concrete class that holds trusted smiles strings in memory.
virtual unsigned int size() const
Get the current library size.
RDKIT_CHEMREACTIONS_EXPORT void pickle(const boost::shared_ptr< EnumerationStrategyBase > &enumerator, std::ostream &ss)
pickles a EnumerationStrategy and adds the results to a stream ss
pulls in the core RDKit functionality
RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(const char *probe, const char *ref)
virtual unsigned int addMol(const ROMol &m)
unsigned int addMol(const ROMol &m)
Adds a molecule to the fingerprinter.
Base FPI for the fingerprinter used to rule out impossible matches.
virtual boost::shared_ptr< ROMol > getMol(unsigned int) const =0
virtual unsigned int addMol(const ROMol &m)
virtual unsigned int size() const
Get the current library size.
void updatePropertyCache(bool strict=true)
calculates any of our lazy properties
const MolHolderBase & getMolecules() const
Std stuff.
Definition: Atom.h:30
SubstructLibrary(boost::shared_ptr< MolHolderBase > molecules)
Class to allow us to throw an IndexError from C++ and have it make it back to Python.
Definition: Exceptions.h:19
virtual boost::shared_ptr< ROMol > getMol(unsigned int idx) const
const FPHolderBase & getFingerprints() const
Base class API for holding molecules to substructure search.
bool passesFilter(unsigned int idx, const ExplicitBitVect &query) const
Return false if a substructure search can never match the molecule.
virtual ExplicitBitVect * makeFingerprint(const ROMol &m) const
Caller owns the vector!
unsigned int addSmiles(const std::string &smiles)
Contains general bit-comparison and similarity operations.
unsigned int addSmiles(const std::string &smiles)
static void molFromPickle(const std::string &pickle, ROMol *mol)
constructs a molecule from a pickle stored in a string
unsigned int addBinary(const std::string &pickle)
Concrete class that holds binary cached molecules in memory.
#define PRECONDITION(expr, mess)
Definition: Invariant.h:108
Uses the pattern fingerprinter to rule out matches.
virtual unsigned int addMol(const ROMol &m)
unsigned int addFingerprint(const ExplicitBitVect &v)
Adds a raw bit vector to the fingerprinter.
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition: Exceptions.h:33
a class for bit vectors that are densely occupied
virtual boost::shared_ptr< ROMol > getMol(unsigned int idx) const
FPHolderBase & getFingerprints()
Get the underlying fingerprint implementation.
Concrete class that holds smiles strings in memory.
virtual unsigned int size() const
Get the current library size.
Substructure Search a library of molecules.