RDKit
Open-source cheminformatics and machine learning.
MolSupplier.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2002-2013 greg landrum, Rational Discovery LLC
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #ifndef _RD_MOLSUPPLIER_H
11 #define _RD_MOLSUPPLIER_H
12 
13 #include <RDGeneral/types.h>
14 
15 #include <string>
16 #include <list>
17 #include <vector>
18 #include <iostream>
19 #include <GraphMol/ROMol.h>
20 
21 namespace RDKit {
22  std::string strip(const std::string &orig); // declared only (defined elsewhere); presumably returns a copy of orig with surrounding whitespace removed -- TODO confirm against the implementation
23 
24  /*!
25  //
26  // Here are a couple of ways one can interact with MolSuppliers:
27  //
28  // 1) Lazy (ForwardIterator):
29  // while(!supplier.atEnd()){
30  // ROMol *mol = supplier.next();
31  // if(mol){
32  // do something;
33  // }
34  // }
35  // 2) Random Access (only for suppliers that declare length() and operator[]):
36  // for(unsigned int i=0;i<supplier.length();i++){
37  // ROMol *mol = supplier[i];
38  // if(mol){
39  // do something;
40  // }
41  // }
42  //
43  //
44  */
45  class MolSupplier {
46  // Abstract base class for objects that supply molecules one at a time.
47  public:
49  virtual ~MolSupplier() {}; // empty body: the base class does not release dp_inStream
50  virtual void init() = 0; // pure virtual; what gets initialized is defined by each subclass
51  virtual void reset() = 0; // pure virtual; presumably returns the supplier to its first molecule -- TODO confirm
52  virtual bool atEnd() = 0; // pure virtual; callers loop while this is false before calling next()
53  virtual ROMol *next() = 0; // pure virtual; callers check the result for null before using it (ownership of the pointer is not stated here)
54 
55  private:
56  // disable automatic copy constructors and assignment operators
57  // for this class and its subclasses. They will likely be
58  // carrying around stream pointers and copying those is a recipe
59  // for disaster.
60  MolSupplier(const MolSupplier&); // declared only; no definition appears in this header
61  MolSupplier &operator=(const MolSupplier&); // declared only; no definition appears in this header
62  protected:
63  // stream to read the molecules from:
64  std::istream *dp_inStream;
65  // do we own dp_inStream? (the subclass destructors in this file delete the stream only when this is true)
66  bool df_owner;
67  };
68 
69 
70  // \brief a supplier from an SD file that only reads forward:
72  /*************************************************************************
73  * A lazy, forward-only mol supplier from an SD file.
74  * - When new molecules are read using "next" their positions in the file are noted. (NOTE(review): no position cache is declared in this class -- confirm)
75  ***********************************************************************************/
76  public:
78 
79  explicit ForwardSDMolSupplier(std::istream *inStream, bool takeOwnership=true, // takeOwnership: presumably recorded in df_owner -- TODO confirm
80  bool sanitize=true,bool removeHs=true,
81  bool strictParsing=false); // NOTE: strictParsing defaults to false here but to true in the SDMolSupplier constructors
82 
84  if (df_owner && dp_inStream) { // only release the stream when we own it and it is non-null
85  delete dp_inStream;
86  df_owner=false; // reset the ownership flag and the pointer after the delete
87  dp_inStream=NULL;
88  }
89  };
90 
91  virtual void init();
92  virtual void reset();
93  virtual ROMol *next();
94  virtual bool atEnd();
95 
96  protected:
97  virtual void checkForEnd(); // presumably updates df_end -- TODO confirm against the implementation
98  ROMol *_next(); // non-virtual helper; presumably does the actual read for next() -- TODO confirm
99  virtual void readMolProps(ROMol *); // presumably reads the SD data fields into the molecule -- TODO confirm
100  bool df_end; // presumably: have we reached the end of the stream? -- TODO confirm
101  int d_line; // line number we are currently on
102  bool df_sanitize,df_removeHs,df_strictParsing; // parsing options; names mirror the constructor parameters
103  };
104 
105 
106  // \brief a lazy supplier from an SD file
108  /*************************************************************************
109  * A lazy mol supplier from an SD file.
110  * - When new molecules are read using next(), their positions in the file are noted.
111  * - A call to length() will automatically parse the entire file and cache all the mol
112  * block positions
113  * - The [] operator is used to access a molecule at "idx"; calling next() after this will return
114  * the next molecule after "idx"
115  ***********************************************************************************/
116 
117  public:
118  SDMolSupplier() { init(); }; // default constructor: only calls init()
119 
120  /*!
121  * \param fileName - the name of the SD file
122  * \param sanitize - if true sanitize the molecule before returning it
123  * \param removeHs - if true remove Hs from the molecule before returning it
124  * (triggers sanitization)
125  * \param strictParsing - if not set, the parser is more lax about correctness
126  * of the contents.
127  */
128  explicit SDMolSupplier(const std::string &fileName, bool sanitize=true,
129  bool removeHs=true,bool strictParsing=true);
130 
131  explicit SDMolSupplier(std::istream *inStream, bool takeOwnership=true, // takeOwnership: presumably recorded in df_owner -- TODO confirm
132  bool sanitize=true,bool removeHs=true,bool strictParsing=true);
133 
134 
136  void init();
137  void reset();
138  ROMol *next();
139  bool atEnd();
140  void moveTo(unsigned int idx); // presumably positions the supplier at molecule idx -- TODO confirm
141  ROMol * operator[](unsigned int idx); // random access to the molecule at idx
142  /*! \brief returns the text block for a particular item
143  *
144  * \param idx - which item to return
145  */
146  std::string getItemText(unsigned int idx);
147  unsigned int length(); // parses the entire file on demand and caches the mol block positions
148  void setData(const std::string &text,bool sanitize=true, bool removeHs=true); // note: takes no strictParsing argument, unlike the constructors
149 
150  /*! Resets our internal state and sets the indices of molecules in the stream.
151  * The client should be *very* careful about calling this method, as it's trivial
152  * to end up with a completely useless supplier.
153  *
154  * \param locs - the vector of stream positions.
155  *
156  * Note that this can be used not only to make reading selected molecules from a
157  * large SD file much faster, but it can also allow subsetting an SD file or
158  * rearranging the order of the molecules.
159  */
160  void setStreamIndices(const std::vector<std::streampos> &locs);
161 
162  private:
163  void checkForEnd();
164  int d_len; // total number of mol blocks in the file (initialized to -1)
165  int d_last; // the molecule we are ready to read
166  std::vector<std::streampos> d_molpos; // stream positions of the mol blocks (presumably the cache filled by next()/length() -- TODO confirm)
167 
168  };
169 
170  //! lazy file parser for Smiles tables
172  /**************************************************************************
173  * Lazy file parser for Smiles table files, similar to the lazy SD
174  * file parser above
175  * - As and when new molecules are read using next(), their
176  * positions in the file are noted.
177  * - A call to length() will automatically parse the entire
178  * file and cache all the mol block positions
179  * - The [] operator is used to access a molecule at "idx"; calling
180  * next() after this will return the next molecule after
181  * "idx"
182  ***************************************************************************/
183  public:
184 
185  /*!
186  * \param fileName - the name of the smiles table file
187  * \param delimiter - delimiting characters between records on each
188  * line. NOTE that this is not a string: the tokenizer looks for
189  * the individual characters in delimiter, not the full string
190  * itself. So the default delimiter: " \t", means " " or "\t".
191  * \param smilesColumn - column number for the SMILES string (defaults
192  * to the first column)
193  * \param nameColumn - column number for the molecule name (defaults to
194  * the second column). If set to -1 we assume that no name is
195  * available for the molecule and the name is defaulted to the
196  * smiles string
197  * \param titleLine - if true, the first line is assumed to list the
198  * names of properties in order, separated by 'delimiter'. It is
199  * also assumed that the 'SMILES' column and the 'name' column
200  * are not specified here. If false, no title line is assumed
201  * and the properties are recorded as "columnX", where "X" is
202  * the column number
203  * \param sanitize - if true sanitize the molecule before returning it
204  */
205  explicit SmilesMolSupplier(const std::string &fileName,
206  const std::string &delimiter=" \t",
207  int smilesColumn=0,
208  int nameColumn=1,
209  bool titleLine=true,
210  bool sanitize=true);
212  explicit SmilesMolSupplier(std::istream *inStream, bool takeOwnership=true, // takeOwnership: presumably recorded in df_owner -- TODO confirm
213  const std::string &delimiter=" \t",
214  int smilesColumn=0,
215  int nameColumn=1,
216  bool titleLine=true,
217  bool sanitize=true);
218 
220  void setData(const std::string &text,
221  const std::string &delimiter=" ", // NOTE: defaults to a single space here, while the constructors default to " \t"
222  int smilesColumn=0,
223  int nameColumn=1,
224  bool titleLine=true,
225  bool sanitize=true);
226  void init();
227  void reset();
228  ROMol *next();
229  bool atEnd();
230  void moveTo(unsigned int idx); // presumably positions the supplier at molecule idx -- TODO confirm
231  ROMol * operator[](unsigned int idx); // random access to the molecule at idx
232  /*! \brief returns the text block for a particular item
233  *
234  * \param idx - which item to return
235  */
236  std::string getItemText(unsigned int idx);
237  unsigned int length(); // parses the entire file on demand and caches the molecule positions
238 
239  private:
240  ROMol *processLine(std::string inLine); // presumably builds a molecule from one line of the table -- TODO confirm
241  void processTitleLine(); // presumably fills d_props from the title line -- TODO confirm
242  std::string nextLine();
243  long int skipComments();
244  void checkForEnd();
245 
246  bool df_end; // have we reached the end of the file?
247  int d_len; // total number of smiles in the file
248  int d_next; // the molecule we are ready to read
249  int d_line; // line number we are currently on
250  std::vector<std::streampos> d_molpos; // vector of positions in the file for molecules
251  std::vector<int> d_lineNums; // presumably the line number of each entry in d_molpos -- TODO confirm
252  std::string d_delim; // the delimiter string
253  bool df_sanitize; // sanitize molecules before returning them?
254  STR_VECT d_props; // vector of property names
255  bool df_title; // do we have a title line?
256  int d_smi; // column id for the smiles string
257  int d_name; // column id for the name
258  };
259 
260  //! lazy file parser for TDT files
261  class TDTMolSupplier : public MolSupplier {
262  /**************************************************************************
263  * Lazy file parser for TDT files, similar to the lazy SD
264  * file parser above
265  * - As and when new molecules are read using next(), their
266  * positions in the file are noted.
267  * - A call to length() will automatically parse the entire
268  * file and cache all the mol block positions
269  * - The [] operator is used to access a molecule at "idx"; calling
270  * next() after this will return the next molecule after
271  * "idx"
272  ***************************************************************************/
273  public:
274 
275  /*!
276  * \param fileName - the name of the TDT file
277  * \param nameRecord - property name for the molecule name.
278  * If empty (the default), the name defaults to be empty
279  * \param confId2D - if >=0 and 2D coordinates are provided, the 2D
280  * structure (depiction) in the input will be read into the
281  * corresponding conformer id. (default: -1)
282  * \param confId3D - if >=0 and 3D coordinates are provided, the 3D
283  * structure in the input will be read into the
284  * corresponding conformer id. (default: 0)
285  * \param sanitize - if true sanitize the molecule before returning it
286  */
287  explicit TDTMolSupplier(const std::string &fileName,
288  const std::string &nameRecord="",
289  int confId2D=-1,int confId3D=0,
290  bool sanitize=true);
291  explicit TDTMolSupplier(std::istream *inStream, bool takeOwnership=true, // takeOwnership: presumably recorded in df_owner -- TODO confirm
292  const std::string &nameRecord="",
293  int confId2D=-1,int confId3D=0,
294  bool sanitize=true);
295  TDTMolSupplier();
296  ~TDTMolSupplier();
297  void setData(const std::string &text, // same option defaults as the constructors
298  const std::string &nameRecord="",
299  int confId2D=-1,int confId3D=0,
300  bool sanitize=true);
301  void init();
302  void reset();
303  ROMol *next();
304  bool atEnd();
305  void moveTo(unsigned int idx); // presumably positions the supplier at molecule idx -- TODO confirm
306  ROMol * operator[](unsigned int idx); // random access to the molecule at idx
307  /*! \brief returns the text block for a particular item
308  *
309  * \param idx - which item to return
310  */
311  std::string getItemText(unsigned int idx);
312  unsigned int length(); // parses the entire file on demand and caches the molecule positions
313 
314  private:
315  bool advanceToNextRecord(); // meaning of the bool result is not visible in this header -- see the implementation
316  void checkForEnd();
317  ROMol *parseMol(std::string inLine); // presumably builds a molecule from one TDT record -- TODO confirm
318 
319  bool df_end; // have we reached the end of the file?
320  int d_len; // total number of mols in the file
321  int d_last; // the molecule we are ready to read
322  int d_line; // line number we are currently on
323  int d_confId2D; // id to use for 2D conformers
324  int d_confId3D; // id to use for 3D conformers
325  std::vector<std::streampos> d_molpos; // vector of positions in the file for molecules
326  bool df_sanitize; // sanitize molecules before returning them?
327  std::string d_nameProp; // local storage for the property providing mol names
328  };
329 
330  //! lazy file parser for PDB files
331  class PDBMolSupplier : public MolSupplier {
332  // Declares only init()/reset()/next()/atEnd(): there is no length(), moveTo() or operator[] on this supplier.
333  public:
334  explicit PDBMolSupplier(std::istream *inStream, bool takeOwnership=true, // takeOwnership: presumably recorded in df_owner -- TODO confirm
335  bool sanitize=true, bool removeHs=true,
336  unsigned int flavor=0); // flavor: stored in d_flavor presumably; its meaning is not visible in this header -- TODO confirm
337  explicit PDBMolSupplier(const std::string &fname, // fname: presumably the path of the PDB file to open -- TODO confirm
338  bool sanitize=true, bool removeHs=true,
339  unsigned int flavor=0);
340 
341  virtual ~PDBMolSupplier() {
342  if (df_owner && dp_inStream) // only release the stream when we own it and it is non-null
343  delete dp_inStream; // note: the flag and pointer are not reset afterwards
344  };
345 
346  virtual void init();
347  virtual void reset();
348  virtual ROMol *next();
349  virtual bool atEnd();
350 
351  protected:
352  bool df_sanitize,df_removeHs; // parsing options; names mirror the constructor parameters
353  unsigned int d_flavor; // name mirrors the constructor's flavor parameter
354  };
354 }
355 
356 #endif
ROMol * removeHs(const ROMol &mol, bool implicitOnly=false, bool updateExplicitCount=false, bool sanitize=true)
returns a copy of a molecule with hydrogens removed
virtual ~MolSupplier()
Definition: MolSupplier.h:49
unsigned int d_flavor
Definition: MolSupplier.h:352
virtual ~PDBMolSupplier()
Definition: MolSupplier.h:340
virtual ROMol * next()=0
lazy file parser for TDT files
Definition: MolSupplier.h:261
virtual void reset()=0
Defines the primary molecule class ROMol as well as associated typedefs.
ROMol is a molecule class that is intended to have a fixed topology.
Definition: ROMol.h:105
std::string strip(const std::string &orig)
std::istream * dp_inStream
Definition: MolSupplier.h:64
Includes a bunch of functionality for handling Atom and Bond queries.
Definition: Atom.h:28
lazy file parser for Smiles tables
Definition: MolSupplier.h:171
lazy file parser for PDB files
Definition: MolSupplier.h:331
virtual bool atEnd()=0
std::vector< std::string > STR_VECT
Definition: Dict.h:26
virtual void init()=0