RDKit
Open-source cheminformatics and machine learning.
MolSupplier.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2002-2019 greg landrum, Rational Discovery LLC
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include <RDGeneral/export.h>
11 #ifndef RD_MOLSUPPLIER_H
12 #define RD_MOLSUPPLIER_H
13 
14 #include <RDGeneral/types.h>
15 
16 #include <string>
17 #include <list>
18 #include <memory>
19 #include <vector>
20 #include <iostream>
21 #include <fstream>
22 #include <GraphMol/ROMol.h>
24 
25 #ifdef RDK_BUILD_COORDGEN_SUPPORT
26 namespace schrodinger {
27 namespace mae {
28 class Reader;
29 class Block;
30 } // namespace mae
31 } // namespace schrodinger
32 #endif // RDK_BUILD_COORDGEN_SUPPORT
33 
34 namespace RDKit {
35 RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig);
36 
37 /*!
38 //
39 // Here are a couple of ways one can interact with MolSuppliers:
40 //
41 // 1) Lazy (ForwardIterator):
42 // while(!supplier.atEnd()){
43 // ROMol *mol = supplier.next();
44 // if(mol){
45 // do something;
46 // }
47 // }
48 // 2) Random Access:
49 // for(unsigned int i=0;i<supplier.length();i++){
50 // ROMol *mol = supplier[i];
51 // if(mol){
52 // do something;
53 // }
54 // }
55 //
56 //
57 */
59  // this is an abstract base class to supply molecules one at a time
60  public:
62  virtual ~MolSupplier(){};
63  virtual void init() = 0;
64  virtual void reset() = 0;
65  virtual bool atEnd() = 0;
66  virtual ROMol *next() = 0;
67 
68  private:
69  // disable automatic copy constructors and assignment operators
70  // for this class and its subclasses. They will likely be
71  // carrying around stream pointers and copying those is a recipe
72  // for disaster.
73  MolSupplier(const MolSupplier &);
74  MolSupplier &operator=(const MolSupplier &);
75 
76  protected:
77  // stream to read the molecules from:
78  std::istream *dp_inStream = nullptr;
79  // do we own dp_inStream?
80  bool df_owner = false;
81  // opens a stream for reading and verifies that it can be read from.
82  // if not it throws an exception
83  // the caller owns the resulting stream
84  std::istream *openAndCheckStream(const std::string &filename) {
85  // FIX: this binary mode of opening file is here because of a bug in
86  // VC++ 6.0
87  // the function "tellg" does not work correctly if we do not open it this
88  // way
89  // Jan 2009: Confirmed that this is still the case in visual studio 2008
90  std::ifstream *strm =
91  new std::ifstream(filename.c_str(), std::ios_base::binary);
92  if ((!(*strm)) || strm->bad()) {
93  std::ostringstream errout;
94  errout << "Bad input file " << filename;
95  delete strm;
96  throw BadFileException(errout.str());
97  }
98 
99  strm->peek();
100  if (strm->bad() || strm->eof()) {
101  std::ostringstream errout;
102  errout << "Invalid input file " << filename;
103  delete strm;
104  throw BadFileException(errout.str());
105  }
106  return static_cast<std::istream *>(strm);
107  }
108 };
109 
110 // \brief a supplier from an SD file that only reads forward:
112  /*************************************************************************
113  * A lazy mol supplier from a SD file.
114  * - When new molecules are read using "next" their positions in the file are
115  *noted.
116  ***********************************************************************************/
117  public:
118  ForwardSDMolSupplier() { init(); };
119 
120  explicit ForwardSDMolSupplier(std::istream *inStream,
121  bool takeOwnership = true, bool sanitize = true,
122  bool removeHs = true,
123  bool strictParsing = false);
124 
126  if (df_owner && dp_inStream) {
127  delete dp_inStream;
128  df_owner = false;
129  dp_inStream = NULL;
130  }
131  };
132 
133  virtual void init();
134  virtual void reset();
135  virtual ROMol *next();
136  virtual bool atEnd();
137 
138  void setProcessPropertyLists(bool val) { df_processPropertyLists = val; }
139  bool getProcessPropertyLists() const { return df_processPropertyLists; }
140 
141  bool getEOFHitOnRead() const { return df_eofHitOnRead; }
142 
143  protected:
144  virtual void checkForEnd();
146  virtual void readMolProps(ROMol *);
147  bool df_end = false;
148  int d_line = 0; // line number we are currently on
149  bool df_sanitize = true, df_removeHs = true, df_strictParsing = true;
150  bool df_processPropertyLists = true;
151  bool df_eofHitOnRead = false;
152 };
153 
154 // \brief a lazy supplier from an SD file
156  /*************************************************************************
157  * A lazy mol supplier from a SD file.
158  * - When new molecules are read using "next" their positions in the file are
159  *noted.
160  * - A call to the "length" will automatically parse the entire file and
161  *cache all the mol
162  * block positions
163  * - [] operator is used to access a molecule at "idx", calling next
164  *following this will result
165  * in the next molecule after "idx"
166  ***********************************************************************************/
167 
168  public:
169  SDMolSupplier() { init(); };
170 
171  /*!
172  * \param fileName - the name of the SD file
173  * \param sanitize - if true sanitize the molecule before returning it
174  * \param removeHs - if true remove Hs from the molecule before returning it
175  * (triggers sanitization)
176  * \param strictParsing - if not set, the parser is more lax about
177  * correctness
178  * of the contents.
179  */
180  explicit SDMolSupplier(const std::string &fileName, bool sanitize = true,
181  bool removeHs = true, bool strictParsing = true);
182 
183  explicit SDMolSupplier(std::istream *inStream, bool takeOwnership = true,
184  bool sanitize = true, bool removeHs = true,
185  bool strictParsing = true);
186 
188  void init();
189  void reset();
191  bool atEnd();
192  void moveTo(unsigned int idx);
193  ROMol *operator[](unsigned int idx);
194  /*! \brief returns the text block for a particular item
195  *
196  * \param idx - which item to return
197  */
198  std::string getItemText(unsigned int idx);
199  unsigned int length();
200  void setData(const std::string &text, bool sanitize = true,
201  bool removeHs = true);
202  void setData(const std::string &text, bool sanitize, bool removeHs,
203  bool strictParsing);
204 
205  /*! Resets our internal state and sets the indices of molecules in the stream.
206  * The client should be *very* careful about calling this method, as it's
207  *trivial
208  * to end up with a completely useless supplier.
209  *
210  * \param locs - the vector of stream positions.
211  *
212  * Note that this can be used not only to make reading selected molecules
213  *from a
214  * large SD file much faster, but it can also allow subsetting an SD file or
215  * rearranging the order of the molecules.
216  */
217  void setStreamIndices(const std::vector<std::streampos> &locs);
218 
219  private:
220  void checkForEnd();
221  void setDataCommon(const std::string &text, bool sanitize, bool removeHs);
222  int d_len = 0; // total number of mol blocks in the file (initialized to -1)
223  int d_last = 0; // the molecule we are ready to read
224  std::vector<std::streampos> d_molpos;
225 };
226 
227 //! lazy file parser for Smiles tables
229  /**************************************************************************
230  * Lazy file parser for Smiles table file, similar to the lazy SD
231  * file parser above
232 * - As and when new molecules are read using "next" their
233 * positions in the file are noted.
234 * - A call to the "length" will automatically parse the entire
235 * file and cache all the mol block positions
236  * - [] operator is used to access a molecule at "idx", calling
237  * next following this will result in the next molecule after
238  * "idx"
239  ***************************************************************************/
240  public:
241  /*!
242  * \param fileName - the name of smiles table file
243 * \param delimiter - delimiting characters between records on each
244 * line. NOTE that this is not a string: the tokenizer looks for
245 * the individual characters in delimiter, not the full string
246 * itself. So the default delimiter, " \t", means " " or "\t".
247  * \param smilesColumn - column number for the SMILES string (defaults
248  * to the first column)
249  * \param nameColumn - column number for the molecule name (defaults to
250  * the second column) If set to -1 we assume that no name is
251  * available for the molecule and the name is defaulted to the
252  * smiles string
253 * \param titleLine - if true, the first line is assumed to list the
254 * names of properties in order separated by 'delimiter'. It is
255 * also assumed that the 'SMILES' column and the 'name' column
256 * are not specified here. If false, no title line is assumed
257 * and the properties are recorded as "columnX", where "X" is
258 * the column number
259  * \param sanitize - if true sanitize the molecule before returning it
260  */
261  explicit SmilesMolSupplier(const std::string &fileName,
262  const std::string &delimiter = " \t",
263  int smilesColumn = 0, int nameColumn = 1,
264  bool titleLine = true, bool sanitize = true);
266  explicit SmilesMolSupplier(std::istream *inStream, bool takeOwnership = true,
267  const std::string &delimiter = " \t",
268  int smilesColumn = 0, int nameColumn = 1,
269  bool titleLine = true, bool sanitize = true);
270 
272  void setData(const std::string &text, const std::string &delimiter = " ",
273  int smilesColumn = 0, int nameColumn = 1, bool titleLine = true,
274  bool sanitize = true);
275  void init();
276  void reset();
278  bool atEnd();
279  void moveTo(unsigned int idx);
280  ROMol *operator[](unsigned int idx);
281  /*! \brief returns the text block for a particular item
282  *
283  * \param idx - which item to return
284  */
285  std::string getItemText(unsigned int idx);
286  unsigned int length();
287 
288  private:
289  ROMol *processLine(std::string inLine);
290  void processTitleLine();
291  std::string nextLine();
292  long int skipComments();
293  void checkForEnd();
294 
295  bool df_end = false; // have we reached the end of the file?
296  int d_len = 0; // total number of smiles in the file
297  int d_next = 0; // the molecule we are ready to read
298  int d_line = 0; // line number we are currently on
299  std::vector<std::streampos>
300  d_molpos; // vector of positions in the file for molecules
301  std::vector<int> d_lineNums;
302  std::string d_delim; // the delimiter string
303  bool df_sanitize = true; // sanitize molecules before returning them?
304  STR_VECT d_props; // vector of property names
305  bool df_title = true; // do we have a title line?
306  int d_smi = 0; // column id for the smile string
307  int d_name = 1; // column id for the name
308 };
309 
310 //! lazy file parser for TDT files
312  /**************************************************************************
313  * Lazy file parser for TDT files, similar to the lazy SD
314  * file parser above
315 * - As and when new molecules are read using "next" their
316 * positions in the file are noted.
317 * - A call to the "length" will automatically parse the entire
318 * file and cache all the mol block positions
319  * - [] operator is used to access a molecule at "idx", calling
320  * next following this will result in the next molecule after
321  * "idx"
322  ***************************************************************************/
323  public:
324  /*!
325  * \param fileName - the name of the TDT file
326  * \param nameRecord - property name for the molecule name.
327  * If empty (the default), the name defaults to be empty
328  * \param confId2D - if >=0 and 2D coordinates are provided, the 2D
329  * structure (depiction) in the input will be read into the
330  * corresponding conformer id.
331  * \param confId3D - if >=0 and 3D coordinates are provided, the 3D
332  * structure (depiction) in the input will be read into the
333  * corresponding conformer id.
334  * \param sanitize - if true sanitize the molecule before returning it
335  */
336  explicit TDTMolSupplier(const std::string &fileName,
337  const std::string &nameRecord = "", int confId2D = -1,
338  int confId3D = 0, bool sanitize = true);
339  explicit TDTMolSupplier(std::istream *inStream, bool takeOwnership = true,
340  const std::string &nameRecord = "", int confId2D = -1,
341  int confId3D = 0, bool sanitize = true);
344  void setData(const std::string &text, const std::string &nameRecord = "",
345  int confId2D = -1, int confId3D = 0, bool sanitize = true);
346  void init();
347  void reset();
349  bool atEnd();
350  void moveTo(unsigned int idx);
351  ROMol *operator[](unsigned int idx);
352  /*! \brief returns the text block for a particular item
353  *
354  * \param idx - which item to return
355  */
356  std::string getItemText(unsigned int idx);
357  unsigned int length();
358 
359  private:
360  bool advanceToNextRecord();
361  void checkForEnd();
362  ROMol *parseMol(std::string inLine);
363 
364  bool df_end = false; // have we reached the end of the file?
365  int d_len = 0; // total number of mols in the file
366  int d_last = 0; // the molecule we are ready to read
367  int d_line = 0; // line number we are currently on
368  int d_confId2D = -1; // id to use for 2D conformers
369  int d_confId3D = 0; // id to use for 3D conformers
370  std::vector<std::streampos>
371  d_molpos; // vector of positions in the file for molecules
372  bool df_sanitize = true; // sanitize molecules before returning them?
373  std::string d_nameProp =
374  ""; // local storage for the property providing mol names
375 };
376 
377 //! lazy file parser for PDB files
379  public:
380  explicit PDBMolSupplier(std::istream *inStream, bool takeOwnership = true,
381  bool sanitize = true, bool removeHs = true,
382  unsigned int flavor = 0,
383  bool proximityBonding = true);
384  explicit PDBMolSupplier(const std::string &fname, bool sanitize = true,
385  bool removeHs = true, unsigned int flavor = 0,
386  bool proximityBonding = true);
387 
388  virtual ~PDBMolSupplier() {
389  if (df_owner && dp_inStream) delete dp_inStream;
390  };
391 
392  virtual void init();
393  virtual void reset();
394  virtual ROMol *next();
395  virtual bool atEnd();
396 
397  protected:
398  bool df_sanitize, df_removeHs, df_proximityBonding;
399  unsigned int d_flavor;
400 };
401 #ifdef RDK_BUILD_COORDGEN_SUPPORT
402 //! lazy file parser for MAE files
403 class RDKIT_FILEPARSERS_EXPORT MaeMolSupplier : public MolSupplier {
404  /**
405  * Due to maeparser's shared_ptr<istream> Reader interface, MaeMolSupplier
406  * always requires taking ownership of the istream ptr, as the shared ptr will
407  * always clear it upon destruction.
408  */
409 
410  public:
411  MaeMolSupplier() { init(); };
412 
413  explicit MaeMolSupplier(std::shared_ptr<std::istream> inStream,
414  bool sanitize = true, bool removeHs = true);
415 
416  explicit MaeMolSupplier(std::istream *inStream, bool takeOwnership = true,
417  bool sanitize = true, bool removeHs = true);
418 
419  explicit MaeMolSupplier(const std::string &fname, bool sanitize = true,
420  bool removeHs = true);
421 
422  virtual ~MaeMolSupplier(){
423  // The dp_sInStream shared_ptr will take care of cleaning up.
424  };
425 
426  virtual void init();
427  virtual void reset();
428  virtual ROMol *next();
429  virtual bool atEnd();
430 
431  protected:
432  bool df_sanitize, df_removeHs;
433  std::shared_ptr<schrodinger::mae::Reader> d_reader;
434  std::shared_ptr<schrodinger::mae::Block> d_next_struct;
435  std::shared_ptr<std::istream> dp_sInStream;
436  std::string d_stored_exc;
437 };
438 #endif // RDK_BUILD_COORDGEN_SUPPORT
439 } // namespace RDKit
440 
441 #endif
RDKit::SmilesMolSupplier::getItemText
std::string getItemText(unsigned int idx)
returns the text block for a particular item
RDKit::SDMolSupplier::SDMolSupplier
SDMolSupplier()
Definition: MolSupplier.h:169
RDKit::MolSupplier::openAndCheckStream
std::istream * openAndCheckStream(const std::string &filename)
Definition: MolSupplier.h:84
BadFileException.h
RDKit::SDMolSupplier::setData
void setData(const std::string &text, bool sanitize, bool removeHs, bool strictParsing)
RDKit::ForwardSDMolSupplier::ForwardSDMolSupplier
ForwardSDMolSupplier()
Definition: MolSupplier.h:118
RDKit::ForwardSDMolSupplier::getEOFHitOnRead
bool getEOFHitOnRead() const
Definition: MolSupplier.h:141
RDKit::SDMolSupplier::length
unsigned int length()
RDKit::SDMolSupplier::setStreamIndices
void setStreamIndices(const std::vector< std::streampos > &locs)
RDKit::TDTMolSupplier::TDTMolSupplier
TDTMolSupplier(const std::string &fileName, const std::string &nameRecord="", int confId2D=-1, int confId3D=0, bool sanitize=true)
ROMol.h
Defines the primary molecule class ROMol as well as associated typedefs.
RDKit::PDBMolSupplier::next
virtual ROMol * next()
types.h
RDKit::SDMolSupplier::reset
void reset()
RDKit::TDTMolSupplier::TDTMolSupplier
TDTMolSupplier()
RDKit::MolSupplier::init
virtual void init()=0
RDKit::SmilesMolSupplier::reset
void reset()
RDKit::MolSupplier::next
virtual ROMol * next()=0
RDKit::SDMolSupplier::atEnd
bool atEnd()
RDKit::SDMolSupplier::~SDMolSupplier
~SDMolSupplier()
Definition: MolSupplier.h:187
RDKit::SDMolSupplier
Definition: MolSupplier.h:155
RDKit::PDBMolSupplier::atEnd
virtual bool atEnd()
RDKit::ForwardSDMolSupplier::setProcessPropertyLists
void setProcessPropertyLists(bool val)
Definition: MolSupplier.h:138
RDKIT_FILEPARSERS_EXPORT
#define RDKIT_FILEPARSERS_EXPORT
Definition: export.h:216
RDKit::MolSupplier
Definition: MolSupplier.h:58
RDKit::TDTMolSupplier::getItemText
std::string getItemText(unsigned int idx)
returns the text block for a particular item
RDKit::SmilesMolSupplier::~SmilesMolSupplier
~SmilesMolSupplier()
RDKit::ForwardSDMolSupplier::atEnd
virtual bool atEnd()
RDKit::SDMolSupplier::getItemText
std::string getItemText(unsigned int idx)
returns the text block for a particular item
RDKit::strip
RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig)
RDKit::SmilesMolSupplier::SmilesMolSupplier
SmilesMolSupplier(const std::string &fileName, const std::string &delimiter=" \t", int smilesColumn=0, int nameColumn=1, bool titleLine=true, bool sanitize=true)
RDKit::TDTMolSupplier::next
ROMol * next()
RDKit::SDMolSupplier::moveTo
void moveTo(unsigned int idx)
RDKit::ForwardSDMolSupplier
Definition: MolSupplier.h:111
RDKit::STR_VECT
std::vector< std::string > STR_VECT
Definition: Dict.h:29
RDKit::PDBMolSupplier::d_flavor
unsigned int d_flavor
Definition: MolSupplier.h:399
RDKit::MolSupplier::MolSupplier
MolSupplier()
Definition: MolSupplier.h:61
RDKit::ForwardSDMolSupplier::getProcessPropertyLists
bool getProcessPropertyLists() const
Definition: MolSupplier.h:139
RDKit::PDBMolSupplier::~PDBMolSupplier
virtual ~PDBMolSupplier()
Definition: MolSupplier.h:388
RDKit::ForwardSDMolSupplier::checkForEnd
virtual void checkForEnd()
RDKit::TDTMolSupplier::TDTMolSupplier
TDTMolSupplier(std::istream *inStream, bool takeOwnership=true, const std::string &nameRecord="", int confId2D=-1, int confId3D=0, bool sanitize=true)
RDKit::SmilesMolSupplier::SmilesMolSupplier
SmilesMolSupplier()
RDKit::ROMol
Definition: ROMol.h:171
RDKit::SmilesMolSupplier::next
ROMol * next()
RDKit::ForwardSDMolSupplier::readMolProps
virtual void readMolProps(ROMol *)
RDKit::SDMolSupplier::init
void init()
RDKit::ForwardSDMolSupplier::~ForwardSDMolSupplier
virtual ~ForwardSDMolSupplier()
Definition: MolSupplier.h:125
RDKit::MolSupplier::~MolSupplier
virtual ~MolSupplier()
Definition: MolSupplier.h:62
RDKit::TDTMolSupplier::moveTo
void moveTo(unsigned int idx)
RDKit::SmilesMolSupplier::length
unsigned int length()
RDKit::SmilesMolSupplier::atEnd
bool atEnd()
RDKit::ForwardSDMolSupplier::_next
ROMol * _next()
RDKit::TDTMolSupplier::setData
void setData(const std::string &text, const std::string &nameRecord="", int confId2D=-1, int confId3D=0, bool sanitize=true)
RDKit::SDMolSupplier::operator[]
ROMol * operator[](unsigned int idx)
RDKit::TDTMolSupplier::init
void init()
RDKit::PDBMolSupplier::df_sanitize
bool df_sanitize
Definition: MolSupplier.h:398
RDKit::TDTMolSupplier
lazy file parser for TDT files
Definition: MolSupplier.h:311
RDKit::ForwardSDMolSupplier::reset
virtual void reset()
RDKit
Std stuff.
Definition: Atom.h:30
RDKit::SmilesMolSupplier
lazy file parser for Smiles tables
Definition: MolSupplier.h:228
RDKit::MolSupplier::reset
virtual void reset()=0
RDKit::TDTMolSupplier::operator[]
ROMol * operator[](unsigned int idx)
RDKit::PDBMolSupplier::PDBMolSupplier
PDBMolSupplier(std::istream *inStream, bool takeOwnership=true, bool sanitize=true, bool removeHs=true, unsigned int flavor=0, bool proximityBonding=true)
RDKit::TDTMolSupplier::atEnd
bool atEnd()
RDKit::PDBMolSupplier
lazy file parser for PDB files
Definition: MolSupplier.h:378
RDKit::TDTMolSupplier::~TDTMolSupplier
~TDTMolSupplier()
RDKit::MolOps::removeHs
RDKIT_GRAPHMOL_EXPORT ROMol * removeHs(const ROMol &mol, bool implicitOnly=false, bool updateExplicitCount=false, bool sanitize=true)
returns a copy of a molecule with hydrogens removed
RDKit::ForwardSDMolSupplier::next
virtual ROMol * next()
RDKit::TDTMolSupplier::length
unsigned int length()
RDKit::PDBMolSupplier::reset
virtual void reset()
RDKit::BadFileException
used by various file parsing classes to indicate a bad file
Definition: BadFileException.h:21
RDKit::SDMolSupplier::SDMolSupplier
SDMolSupplier(std::istream *inStream, bool takeOwnership=true, bool sanitize=true, bool removeHs=true, bool strictParsing=true)
RDKit::SDMolSupplier::SDMolSupplier
SDMolSupplier(const std::string &fileName, bool sanitize=true, bool removeHs=true, bool strictParsing=true)
RDKit::MolSupplier::atEnd
virtual bool atEnd()=0
RDKit::ForwardSDMolSupplier::ForwardSDMolSupplier
ForwardSDMolSupplier(std::istream *inStream, bool takeOwnership=true, bool sanitize=true, bool removeHs=true, bool strictParsing=false)
RDKit::TDTMolSupplier::reset
void reset()
RDKit::SDMolSupplier::next
ROMol * next()
RDKit::SmilesMolSupplier::setData
void setData(const std::string &text, const std::string &delimiter=" ", int smilesColumn=0, int nameColumn=1, bool titleLine=true, bool sanitize=true)
RDKit::SmilesMolSupplier::SmilesMolSupplier
SmilesMolSupplier(std::istream *inStream, bool takeOwnership=true, const std::string &delimiter=" \t", int smilesColumn=0, int nameColumn=1, bool titleLine=true, bool sanitize=true)
RDKit::SmilesMolSupplier::init
void init()
RDKit::PDBMolSupplier::init
virtual void init()
RDKit::SDMolSupplier::setData
void setData(const std::string &text, bool sanitize=true, bool removeHs=true)
RDKit::SmilesMolSupplier::operator[]
ROMol * operator[](unsigned int idx)
RDKit::ForwardSDMolSupplier::init
virtual void init()
RDKit::PDBMolSupplier::PDBMolSupplier
PDBMolSupplier(const std::string &fname, bool sanitize=true, bool removeHs=true, unsigned int flavor=0, bool proximityBonding=true)
RDKit::SmilesMolSupplier::moveTo
void moveTo(unsigned int idx)
export.h