RDKit
Open-source cheminformatics and machine learning.
MolSupplier.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2002-2013 greg landrum, Rational Discovery LLC
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include <RDGeneral/export.h>
11 #ifndef _RD_MOLSUPPLIER_H
12 #define _RD_MOLSUPPLIER_H
13 
14 #include <RDGeneral/types.h>
15 
16 #include <string>
17 #include <list>
18 #include <memory>
19 #include <vector>
20 #include <iostream>
21 #include <GraphMol/ROMol.h>
22 
23 #ifdef RDK_BUILD_COORDGEN_SUPPORT
24 namespace schrodinger {
25 namespace mae {
26 class Reader;
27 class Block;
28 } // namespace mae
29 } // namespace schrodinger
30 #endif // RDK_BUILD_COORDGEN_SUPPORT
31 
32 namespace RDKit {
33 RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig);
34 
35 /*!
36 //
37 // Here are a couple of ways one can interact with MolSuppliers:
38 //
39 // 1) Lazy (ForwardIterator):
40 // while(!supplier.atEnd()){
41 // ROMol *mol = supplier.next();
42 // if(mol){
43 // do something;
44 // }
45 // }
46 // 2) Random Access:
47 // for(int i=0;i<supplier.length();i++){
48 // ROMol *mol = supplier[i];
49 // if(mol){
50 // do something;
51 // }
52 // }
53 //
54 //
55 */
57  // this is an abstract base class to supply molecules one at a time
58  public:
60  virtual ~MolSupplier(){};
61  virtual void init() = 0;
62  virtual void reset() = 0;
63  virtual bool atEnd() = 0;
64  virtual ROMol *next() = 0;
65 
66  private:
67  // disable automatic copy constructors and assignment operators
68  // for this class and its subclasses. They will likely be
69  // carrying around stream pointers and copying those is a recipe
70  // for disaster.
71  MolSupplier(const MolSupplier &);
72  MolSupplier &operator=(const MolSupplier &);
73 
74  protected:
75  // stream to read the molecules from:
76  std::istream *dp_inStream;
77  // do we own dp_inStream?
78  bool df_owner;
79 };
80 
81 // \brief a supplier from an SD file that only reads forward:
83  /*************************************************************************
84  * A lazy mol supplier from a SD file.
85  * - When new molecules are read using "next" their positions in the file
86  *   are noted.
87  ***********************************************************************************/
88  public:
89  ForwardSDMolSupplier() { init(); };
90 
91  explicit ForwardSDMolSupplier(std::istream *inStream,
92  bool takeOwnership = true, bool sanitize = true,
93  bool removeHs = true,
94  bool strictParsing = false);
95 
97  if (df_owner && dp_inStream) {
98  delete dp_inStream;
99  df_owner = false;
100  dp_inStream = NULL;
101  }
102  };
103 
104  virtual void init();
105  virtual void reset();
106  virtual ROMol *next();
107  virtual bool atEnd();
108 
109  protected:
110  virtual void checkForEnd();
111  ROMol *_next();
112  virtual void readMolProps(ROMol *);
113  bool df_end;
114  int d_line; // line number we are currently on
115  bool df_sanitize, df_removeHs, df_strictParsing;
116 };
117 
118 // \brief a lazy supplier from an SD file
120  /*************************************************************************
121  * A lazy mol supplier from a SD file.
122  * - When new molecules are read using "next" their positions in the file
123  *   are noted.
124  * - A call to "length" will automatically parse the entire file and
125  *   cache all the mol
126  *   block positions
127  * - [] operator is used to access a molecule at "idx"; calling next
128  *   following this will result
129  *   in the next molecule after "idx"
130  ***********************************************************************************/
131 
132  public:
133  SDMolSupplier() { init(); };
134 
135  /*!
136  * \param fileName - the name of the SD file
137  * \param sanitize - if true sanitize the molecule before returning it
138  * \param removeHs - if true remove Hs from the molecule before returning it
139  * (triggers sanitization)
140  * \param strictParsing - if not set, the parser is more lax about
141  * correctness
142  * of the contents.
143  */
144  explicit SDMolSupplier(const std::string &fileName, bool sanitize = true,
145  bool removeHs = true, bool strictParsing = true);
146 
147  explicit SDMolSupplier(std::istream *inStream, bool takeOwnership = true,
148  bool sanitize = true, bool removeHs = true,
149  bool strictParsing = true);
150 
152  void init();
153  void reset();
154  ROMol *next();
155  bool atEnd();
156  void moveTo(unsigned int idx);
157  ROMol *operator[](unsigned int idx);
158  /*! \brief returns the text block for a particular item
159  *
160  * \param idx - which item to return
161  */
162  std::string getItemText(unsigned int idx);
163  unsigned int length();
164  void setData(const std::string &text, bool sanitize = true,
165  bool removeHs = true);
166  void setData(const std::string &text, bool sanitize, bool removeHs,
167  bool strictParsing);
168 
169  /*! Resets our internal state and sets the indices of molecules in the stream.
170  * The client should be *very* careful about calling this method, as it's
171  * trivial
172  * to end up with a completely useless supplier.
173  *
174  * \param locs - the vector of stream positions.
175  *
176  * Note that this can be used not only to make reading selected molecules
177  * from a
178  * large SD file much faster, but it can also allow subsetting an SD file or
179  * rearranging the order of the molecules.
180  */
181  void setStreamIndices(const std::vector<std::streampos> &locs);
182 
183  private:
184  void checkForEnd();
185  void setDataCommon(const std::string &text, bool sanitize, bool removeHs);
186  int d_len; // total number of mol blocks in the file (initialized to -1)
187  int d_last; // the molecule we are ready to read
188  std::vector<std::streampos> d_molpos;
189 };
190 
191 //! lazy file parser for Smiles tables
193  /**************************************************************************
194  * Lazy file parser for Smiles table file, similar to the lazy SD
195  * file parser above
196  * - As and when new molecules are read using "next" their
197  * positions in the file are noted.
198  * - A call to "length" will automatically parse the entire
199  * file and cache all the mol block positions
200  * - [] operator is used to access a molecule at "idx", calling
201  * next following this will result in the next molecule after
202  * "idx"
203  ***************************************************************************/
204  public:
205  /*!
206  * \param fileName - the name of smiles table file
207  * \param delimiter - delimiting characters between records on each
208  * line. NOTE that this is not a string: the tokenizer looks for
209  * the individual characters in delimiter, not the full string
210  * itself. So the default delimiter: " \t", means " " or "\t".
211  * \param smilesColumn - column number for the SMILES string (defaults
212  * to the first column)
213  * \param nameColumn - column number for the molecule name (defaults to
214  * the second column). If set to -1 we assume that no name is
215  * available for the molecule and the name is defaulted to the
216  * smiles string
217  * \param titleLine - if true, the first line is assumed to list the
218  * names of properties in order separated by 'delimiter'. It is
219  * also assumed that the 'SMILES' column and the 'name' column
220  * are not specified here. If false, no title line is assumed
221  * and the properties are recorded as "columnX" where "X" is
222  * the column number
223  * \param sanitize - if true sanitize the molecule before returning it
224  */
225  explicit SmilesMolSupplier(const std::string &fileName,
226  const std::string &delimiter = " \t",
227  int smilesColumn = 0, int nameColumn = 1,
228  bool titleLine = true, bool sanitize = true);
230  explicit SmilesMolSupplier(std::istream *inStream, bool takeOwnership = true,
231  const std::string &delimiter = " \t",
232  int smilesColumn = 0, int nameColumn = 1,
233  bool titleLine = true, bool sanitize = true);
234 
236  void setData(const std::string &text, const std::string &delimiter = " ",
237  int smilesColumn = 0, int nameColumn = 1, bool titleLine = true,
238  bool sanitize = true);
239  void init();
240  void reset();
241  ROMol *next();
242  bool atEnd();
243  void moveTo(unsigned int idx);
244  ROMol *operator[](unsigned int idx);
245  /*! \brief returns the text block for a particular item
246  *
247  * \param idx - which item to return
248  */
249  std::string getItemText(unsigned int idx);
250  unsigned int length();
251 
252  private:
253  ROMol *processLine(std::string inLine);
254  void processTitleLine();
255  std::string nextLine();
256  long int skipComments();
257  void checkForEnd();
258 
259  bool df_end; // have we reached the end of the file?
260  int d_len; // total number of smiles in the file
261  int d_next; // the molecule we are ready to read
262  int d_line; // line number we are currently on
263  std::vector<std::streampos>
264  d_molpos; // vector of positions in the file for molecules
265  std::vector<int> d_lineNums;
266  std::string d_delim; // the delimiter string
267  bool df_sanitize; // sanitize molecules before returning them?
268  STR_VECT d_props; // vector of property names
269  bool df_title; // do we have a title line?
270  int d_smi; // column id for the smile string
271  int d_name; // column id for the name
272 };
273 
274 //! lazy file parser for TDT files
276  /**************************************************************************
277  * Lazy file parser for TDT files, similar to the lazy SD
278  * file parser above
279  * - As and when new molecules are read using "next" their
280  * positions in the file are noted.
281  * - A call to "length" will automatically parse the entire
282  * file and cache all the mol block positions
283  * - [] operator is used to access a molecule at "idx", calling
284  * next following this will result in the next molecule after
285  * "idx"
286  ***************************************************************************/
287  public:
288  /*!
289  * \param fileName - the name of the TDT file
290  * \param nameRecord - property name for the molecule name.
291  * If empty (the default), the name defaults to be empty
292  * \param confId2D - if >=0 and 2D coordinates are provided, the 2D
293  * structure (depiction) in the input will be read into the
294  * corresponding conformer id.
295  * \param confId3D - if >=0 and 3D coordinates are provided, the 3D
296  * structure (depiction) in the input will be read into the
297  * corresponding conformer id.
298  * \param sanitize - if true sanitize the molecule before returning it
299  */
300  explicit TDTMolSupplier(const std::string &fileName,
301  const std::string &nameRecord = "", int confId2D = -1,
302  int confId3D = 0, bool sanitize = true);
303  explicit TDTMolSupplier(std::istream *inStream, bool takeOwnership = true,
304  const std::string &nameRecord = "", int confId2D = -1,
305  int confId3D = 0, bool sanitize = true);
306  TDTMolSupplier();
307  ~TDTMolSupplier();
308  void setData(const std::string &text, const std::string &nameRecord = "",
309  int confId2D = -1, int confId3D = 0, bool sanitize = true);
310  void init();
311  void reset();
312  ROMol *next();
313  bool atEnd();
314  void moveTo(unsigned int idx);
315  ROMol *operator[](unsigned int idx);
316  /*! \brief returns the text block for a particular item
317  *
318  * \param idx - which item to return
319  */
320  std::string getItemText(unsigned int idx);
321  unsigned int length();
322 
323  private:
324  bool advanceToNextRecord();
325  void checkForEnd();
326  ROMol *parseMol(std::string inLine);
327 
328  bool df_end; // have we reached the end of the file?
329  int d_len; // total number of mols in the file
330  int d_last; // the molecule we are ready to read
331  int d_line; // line number we are currently on
332  int d_confId2D; // id to use for 2D conformers
333  int d_confId3D; // id to use for 3D conformers
334  std::vector<std::streampos>
335  d_molpos; // vector of positions in the file for molecules
336  bool df_sanitize; // sanitize molecules before returning them?
337  std::string d_nameProp; // local storage for the property providing mol names
338 };
339 
340 //! lazy file parser for PDB files
342  public:
343  explicit PDBMolSupplier(std::istream *inStream, bool takeOwnership = true,
344  bool sanitize = true, bool removeHs = true,
345  unsigned int flavor = 0,
346  bool proximityBonding = true);
347  explicit PDBMolSupplier(const std::string &fname, bool sanitize = true,
348  bool removeHs = true, unsigned int flavor = 0,
349  bool proximityBonding = true);
350 
351  virtual ~PDBMolSupplier() {  // destructor: releases the input stream when this supplier owns it
352  if (df_owner && dp_inStream) delete dp_inStream;  // df_owner guards against deleting a stream we do not own
353  };
354 
355  virtual void init();
356  virtual void reset();
357  virtual ROMol *next();
358  virtual bool atEnd();
359 
360  protected:
361  bool df_sanitize, df_removeHs, df_proximityBonding;
362  unsigned int d_flavor;
363 };
364 #ifdef RDK_BUILD_COORDGEN_SUPPORT
365 //! lazy file parser for MAE files (only declared when RDK_BUILD_COORDGEN_SUPPORT is defined)
366 class RDKIT_FILEPARSERS_EXPORT MaeMolSupplier : public MolSupplier {
367  public:
368  MaeMolSupplier() { init(); };  // default construction only runs init()
369  explicit MaeMolSupplier(std::istream *inStream, bool takeOwnership = true,
370  bool sanitize = true, bool removeHs = true);  // reads from a caller-supplied stream
371  explicit MaeMolSupplier(const std::string &fname, bool sanitize = true,
372  bool removeHs = true);  // NOTE(review): presumably opens the file named fname -- defined elsewhere, confirm
373 
374  virtual ~MaeMolSupplier() {
375  if (df_owner && dp_inStream) delete dp_inStream;  // free the stream only when this supplier owns it
376  };
377 
378  virtual void init();  // init/reset/next/atEnd implement the pure virtuals declared by MolSupplier
379  virtual void reset();
380  virtual ROMol *next();
381  virtual bool atEnd();
382 
383  protected:
384  bool df_sanitize, df_removeHs;  // NOTE(review): presumably store the sanitize/removeHs constructor arguments -- confirm in the .cpp
385  std::shared_ptr<schrodinger::mae::Reader> d_reader;  // schrodinger::mae::Reader is only forward-declared in this header
386  std::shared_ptr<schrodinger::mae::Block> d_next_struct;  // NOTE(review): presumably the next structure block to hand out -- confirm in the .cpp
387 };
388 #endif // RDK_BUILD_COORDGEN_SUPPORT
389 } // namespace RDKit
390 
391 #endif
#define RDKIT_FILEPARSERS_EXPORT
Definition: export.h:203
virtual ~MolSupplier()
Definition: MolSupplier.h:60
RDKIT_GRAPHMOL_EXPORT ROMol * removeHs(const ROMol &mol, bool implicitOnly=false, bool updateExplicitCount=false, bool sanitize=true)
returns a copy of a molecule with hydrogens removed
unsigned int d_flavor
Definition: MolSupplier.h:362
virtual ~PDBMolSupplier()
Definition: MolSupplier.h:351
lazy file parser for TDT files
Definition: MolSupplier.h:275
Defines the primary molecule class ROMol as well as associated typedefs.
std::istream * dp_inStream
Definition: MolSupplier.h:76
Std stuff.
Definition: Atom.h:30
lazy file parser for Smiles tables
Definition: MolSupplier.h:192
lazy file parser for PDB files
Definition: MolSupplier.h:341
RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig)
std::vector< std::string > STR_VECT
Definition: Dict.h:29