RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
MolSupplier.h
Go to the documentation of this file.
1//
2// Copyright (C) 2002-2022 greg landrum and other RDKit contributors
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#include <RDGeneral/export.h>
11#ifndef RD_MOLSUPPLIER_H
12#define RD_MOLSUPPLIER_H
13
14#include <RDGeneral/types.h>
15
16#include <string>
17#include <string_view>
18#include <list>
19#include <memory>
20#include <vector>
21#include <iostream>
22#include <fstream>
23#include <GraphMol/ROMol.h>
25
26#ifdef RDK_BUILD_MAEPARSER_SUPPORT
27namespace schrodinger {
28namespace mae {
29class Reader;
30class Block;
31} // namespace mae
32} // namespace schrodinger
33#endif // RDK_BUILD_MAEPARSER_SUPPORT
34
35namespace RDKit {
36RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig);
37
38/*!
39//
40// Here are a couple of ways one can interact with MolSuppliers:
41//
42// 1) Lazy (ForwardIterator):
43// while(!supplier.atEnd()){
44// ROMol *mol = supplier.next();
45// if(mol){
46// do something;
47// }
48// }
49// 2) Random Access:
50// for(int i=0;i<supplier.length();i++){
51// ROMol *mol = supplier[i];
52// if(mol){
53// do something;
54// }
55// }
56//
57//
58*/
60 // this is an abstract base class to supply molecules one at a time
61 public:
63 virtual ~MolSupplier() {}
64 virtual void init() = 0;
65 virtual void reset() = 0;
66 virtual bool atEnd() = 0;
67 virtual ROMol *next() = 0;
68
69 virtual void close() {
70 if (df_owner) {
71 delete dp_inStream;
72 df_owner = false;
73 }
74 dp_inStream = nullptr;
75 }
76
77 private:
78 // disable automatic copy constructors and assignment operators
79 // for this class and its subclasses. They will likely be
80 // carrying around stream pointers and copying those is a recipe
81 // for disaster.
82 MolSupplier(const MolSupplier &);
83 MolSupplier &operator=(const MolSupplier &);
84
85 protected:
86 // stream to read the molecules from:
87 std::istream *dp_inStream = nullptr;
88 // do we own dp_inStream?
89 bool df_owner = false;
90 // opens a stream for reading and verifies that it can be read from.
91 // if not it throws an exception
92 // the caller owns the resulting stream
93 std::istream *openAndCheckStream(const std::string &filename) {
94 // FIX: this binary mode of opening file is here because of a bug in
95 // VC++ 6.0
96 // the function "tellg" does not work correctly if we do not open it this
97 // way
98 // Jan 2009: Confirmed that this is still the case in visual studio 2008
99 std::ifstream *strm =
100 new std::ifstream(filename.c_str(), std::ios_base::binary);
101 if ((!(*strm)) || strm->bad()) {
102 std::ostringstream errout;
103 errout << "Bad input file " << filename;
104 delete strm;
105 throw BadFileException(errout.str());
106 }
107
108 strm->peek();
109 if (strm->bad() || strm->eof()) {
110 std::ostringstream errout;
111 errout << "Invalid input file " << filename;
112 delete strm;
113 throw BadFileException(errout.str());
114 }
115 return static_cast<std::istream *>(strm);
116 }
117};
118
119// \brief a supplier from an SD file that only reads forward:
121 /*************************************************************************
122 * A lazy mol supplier from a SD file.
123 * - When new molecules are read using "next" their positions in the file are
124 *noted.
125 ***********************************************************************************/
126 public:
128
129 explicit ForwardSDMolSupplier(std::istream *inStream,
130 bool takeOwnership = true, bool sanitize = true,
131 bool removeHs = true,
132 bool strictParsing = false);
133
134 ~ForwardSDMolSupplier() override { close(); }
135
136 void init() override;
137 void reset() override;
138 ROMol *next() override;
139 bool atEnd() override;
140
141 void setProcessPropertyLists(bool val) { df_processPropertyLists = val; }
142 bool getProcessPropertyLists() const { return df_processPropertyLists; }
143
144 bool getEOFHitOnRead() const { return df_eofHitOnRead; }
145
146 protected:
147 virtual void checkForEnd();
149 virtual void readMolProps(ROMol *);
150 bool df_end = false;
151 int d_line = 0; // line number we are currently on
152 bool df_sanitize = true, df_removeHs = true, df_strictParsing = true;
153 bool df_processPropertyLists = true;
154 bool df_eofHitOnRead = false;
155};
156
157// \brief a lazy supplier from an SD file
159 /*************************************************************************
160 * A lazy mol supplier from a SD file.
161 * - When new molecules are read using "next" their positions in the file are
162 *noted.
163 * - A call to the "length" will automatically parse the entire file and
164 *cache all the mol
165 * block positions
166 * - [] operator is used to access a molecule at "idx", calling next
167 *following this will result
168 * in the next molecule after "idx"
169 ***********************************************************************************/
170
171 public:
172 SDMolSupplier() { init(); }
173
174 /*!
175 * \param fileName - the name of the SD file
176 * \param sanitize - if true sanitize the molecule before returning it
177 * \param removeHs - if true remove Hs from the molecule before returning it
178 * (triggers sanitization)
179 * \param strictParsing - if set to false, the parser is more lax about
180 * correctness
181 * of the contents.
182 */
183 explicit SDMolSupplier(const std::string &fileName, bool sanitize = true,
184 bool removeHs = true, bool strictParsing = true);
185
186 explicit SDMolSupplier(std::istream *inStream, bool takeOwnership = true,
187 bool sanitize = true, bool removeHs = true,
188 bool strictParsing = true);
189
190 ~SDMolSupplier() override { close(); }
191 void init() override;
192 void reset() override;
193 ROMol *next() override;
194 bool atEnd() override;
195 void moveTo(unsigned int idx);
196 ROMol *operator[](unsigned int idx);
197 /*! \brief returns the text block for a particular item
198 *
199 * \param idx - which item to return
200 */
201 std::string getItemText(unsigned int idx);
202 unsigned int length();
203 void setData(const std::string &text, bool sanitize = true,
204 bool removeHs = true);
205 void setData(const std::string &text, bool sanitize, bool removeHs,
206 bool strictParsing);
207
208 /*! Resets our internal state and sets the indices of molecules in the stream.
209 * The client should be *very* careful about calling this method, as it's
210 *trivial
211 * to end up with a completely useless supplier.
212 *
213 * \param locs - the vector of stream positions.
214 *
215 * Note that this can be used not only to make reading selected molecules
216 *from a
217 * large SD file much faster, but it can also allow subsetting an SD file or
218 * rearranging the order of the molecules.
219 */
220 void setStreamIndices(const std::vector<std::streampos> &locs);
221
222 private:
223 void checkForEnd() override;
224 void setDataCommon(const std::string &text, bool sanitize, bool removeHs);
225 int d_len = 0; // total number of mol blocks in the file (initialized to -1)
226 int d_last = 0; // the molecule we are ready to read
227 std::vector<std::streampos> d_molpos;
228};
229
230//! lazy file parser for Smiles tables
232 /**************************************************************************
233 * Lazy file parser for Smiles table file, similar to the lazy SD
234 * file parser above
235 * - As and when new molecules are read using "next" their
236 * positions in the file are noted.
237 * - A call to the "length" will automatically parse the entire
238 * file and cache all the mol block positions
239 * - [] operator is used to access a molecule at "idx", calling
240 * next following this will result in the next molecule after
241 * "idx"
242 ***************************************************************************/
243 public:
244 /*!
245 * \param fileName - the name of smiles table file
246 * \param delimiter - delimiting characters between records on each
247 * line. NOTE that this is not a string; the tokenizer looks for
248 * the individual characters in delimiter, not the full string
249 * itself. So the default delimiter: " \t", means " " or "\t".
250 * \param smilesColumn - column number for the SMILES string (defaults
251 * to the first column)
252 * \param nameColumn - column number for the molecule name (defaults to
253 * the second column) If set to -1 we assume that no name is
254 * available for the molecule and the name is defaulted to the
255 * smiles string
256 * \param titleLine - if true, the first line is assumed to list the
257 * names of properties in order separated by 'delimiter'. It is
258 * also assumed that the 'SMILES' column and the 'name' column
259 * are not specified here. If false, no title line is assumed
260 * and the properties are recorded as "columnX" where "X" is
261 * the column number
262 * \param sanitize - if true sanitize the molecule before returning it
263 */
264 explicit SmilesMolSupplier(const std::string &fileName,
265 const std::string &delimiter = " \t",
266 int smilesColumn = 0, int nameColumn = 1,
267 bool titleLine = true, bool sanitize = true);
269 explicit SmilesMolSupplier(std::istream *inStream, bool takeOwnership = true,
270 const std::string &delimiter = " \t",
271 int smilesColumn = 0, int nameColumn = 1,
272 bool titleLine = true, bool sanitize = true);
273
274 ~SmilesMolSupplier() override { close(); }
275 void setData(const std::string &text, const std::string &delimiter = " ",
276 int smilesColumn = 0, int nameColumn = 1, bool titleLine = true,
277 bool sanitize = true);
278 void init() override;
279 void reset() override;
280 ROMol *next() override;
281 bool atEnd() override;
282 void moveTo(unsigned int idx);
283 ROMol *operator[](unsigned int idx);
284 /*! \brief returns the text block for a particular item
285 *
286 * \param idx - which item to return
287 */
288 std::string getItemText(unsigned int idx);
289 unsigned int length();
290
291 private:
292 ROMol *processLine(std::string inLine);
293 void processTitleLine();
294 std::string nextLine();
295 long int skipComments();
296 void checkForEnd();
297
298 bool df_end = false; // have we reached the end of the file?
299 long d_len = 0; // total number of smiles in the file
300 long d_next = 0; // the molecule we are ready to read
301 size_t d_line = 0; // line number we are currently on
302 std::vector<std::streampos>
303 d_molpos; // vector of positions in the file for molecules
304 std::vector<int> d_lineNums;
305 std::string d_delim; // the delimiter string
306 bool df_sanitize = true; // sanitize molecules before returning them?
307 STR_VECT d_props; // vector of property names
308 bool df_title = true; // do we have a title line?
309 int d_smi = 0; // column id for the smile string
310 int d_name = 1; // column id for the name
311};
312
313//! lazy file parser for TDT files
315 /**************************************************************************
316 * Lazy file parser for TDT files, similar to the lazy SD
317 * file parser above
318 * - As and when new molecules are read using "next" their
319 * positions in the file are noted.
320 * - A call to the "length" will automatically parse the entire
321 * file and cache all the mol block positions
322 * - [] operator is used to access a molecule at "idx", calling
323 * next following this will result in the next molecule after
324 * "idx"
325 ***************************************************************************/
326 public:
327 /*!
328 * \param fileName - the name of the TDT file
329 * \param nameRecord - property name for the molecule name.
330 * If empty (the default), the name defaults to be empty
331 * \param confId2D - if >=0 and 2D coordinates are provided, the 2D
332 * structure (depiction) in the input will be read into the
333 * corresponding conformer id.
334 * \param confId3D - if >=0 and 3D coordinates are provided, the 3D
335 * structure (depiction) in the input will be read into the
336 * corresponding conformer id.
337 * \param sanitize - if true sanitize the molecule before returning it
338 */
339 explicit TDTMolSupplier(const std::string &fileName,
340 const std::string &nameRecord = "", int confId2D = -1,
341 int confId3D = 0, bool sanitize = true);
342 explicit TDTMolSupplier(std::istream *inStream, bool takeOwnership = true,
343 const std::string &nameRecord = "", int confId2D = -1,
344 int confId3D = 0, bool sanitize = true);
346 ~TDTMolSupplier() override { close(); }
347 void setData(const std::string &text, const std::string &nameRecord = "",
348 int confId2D = -1, int confId3D = 0, bool sanitize = true);
349 void init() override;
350 void reset() override;
351 ROMol *next() override;
352 bool atEnd() override;
353 void moveTo(unsigned int idx);
354 ROMol *operator[](unsigned int idx);
355 /*! \brief returns the text block for a particular item
356 *
357 * \param idx - which item to return
358 */
359 std::string getItemText(unsigned int idx);
360 unsigned int length();
361
362 private:
363 bool advanceToNextRecord();
364 void checkForEnd();
365 ROMol *parseMol(std::string inLine);
366
367 bool df_end = false; // have we reached the end of the file?
368 int d_len = 0; // total number of mols in the file
369 int d_last = 0; // the molecule we are ready to read
370 int d_line = 0; // line number we are currently on
371 int d_confId2D = -1; // id to use for 2D conformers
372 int d_confId3D = 0; // id to use for 3D conformers
373 std::vector<std::streampos>
374 d_molpos; // vector of positions in the file for molecules
375 bool df_sanitize = true; // sanitize molecules before returning them?
376 std::string d_nameProp =
377 ""; // local storage for the property providing mol names
378};
379
380//! lazy file parser for PDB files
382 public:
383 explicit PDBMolSupplier(std::istream *inStream, bool takeOwnership = true,
384 bool sanitize = true, bool removeHs = true,
385 unsigned int flavor = 0,
386 bool proximityBonding = true);
387 explicit PDBMolSupplier(const std::string &fname, bool sanitize = true,
388 bool removeHs = true, unsigned int flavor = 0,
389 bool proximityBonding = true);
390
391 ~PDBMolSupplier() override { close(); }
392
393 void init() override;
394 void reset() override;
395 ROMol *next() override;
396 bool atEnd() override;
397
398 protected:
399 bool df_sanitize, df_removeHs, df_proximityBonding;
400 unsigned int d_flavor;
401};
402#ifdef RDK_BUILD_MAEPARSER_SUPPORT
403//! lazy file parser for MAE files
405 /**
406 * Due to maeparser's shared_ptr<istream> Reader interface, MaeMolSupplier
407 * always requires taking ownership of the istream ptr, as the shared ptr will
408 * always clear it upon destruction.
409 */
410
411 public:
412 MaeMolSupplier() {}
413
414 explicit MaeMolSupplier(std::shared_ptr<std::istream> inStream,
415 bool sanitize = true, bool removeHs = true);
416
417 explicit MaeMolSupplier(std::istream *inStream, bool takeOwnership = true,
418 bool sanitize = true, bool removeHs = true);
419
420 explicit MaeMolSupplier(const std::string &fname, bool sanitize = true,
421 bool removeHs = true);
422
423 ~MaeMolSupplier() override {}
424
425 void init() override;
426 void reset() override;
427 ROMol *next() override;
428 bool atEnd() override;
429 void moveTo(unsigned int idx);
430 ROMol *operator[](unsigned int idx);
431 unsigned int length();
432
433 void close() override { dp_sInStream.reset(); }
434
435 void setData(const std::string &text, bool sanitize = true,
436 bool removeHs = true);
437
438 private:
439 void moveToNextBlock();
440
441 protected:
442 bool df_sanitize;
443 bool df_removeHs;
444 std::shared_ptr<schrodinger::mae::Reader> d_reader;
445 std::shared_ptr<schrodinger::mae::Block> d_next_struct;
446 std::shared_ptr<std::istream> dp_sInStream;
447 std::string d_stored_exc;
448 unsigned d_position;
449 unsigned d_length;
450};
451#endif // RDK_BUILD_MAEPARSER_SUPPORT
452} // namespace RDKit
453
454#endif
Defines the primary molecule class ROMol as well as associated typedefs.
used by various file parsing classes to indicate a bad file
virtual void readMolProps(ROMol *)
void setProcessPropertyLists(bool val)
ROMol * next() override
ForwardSDMolSupplier(std::istream *inStream, bool takeOwnership=true, bool sanitize=true, bool removeHs=true, bool strictParsing=false)
bool getProcessPropertyLists() const
std::istream * openAndCheckStream(const std::string &filename)
Definition MolSupplier.h:93
virtual bool atEnd()=0
virtual ROMol * next()=0
virtual void reset()=0
virtual void init()=0
virtual ~MolSupplier()
Definition MolSupplier.h:63
virtual void close()
Definition MolSupplier.h:69
lazy file parser for PDB files
~PDBMolSupplier() override
PDBMolSupplier(std::istream *inStream, bool takeOwnership=true, bool sanitize=true, bool removeHs=true, unsigned int flavor=0, bool proximityBonding=true)
bool atEnd() override
void reset() override
ROMol * next() override
void init() override
PDBMolSupplier(const std::string &fname, bool sanitize=true, bool removeHs=true, unsigned int flavor=0, bool proximityBonding=true)
unsigned int d_flavor
void setStreamIndices(const std::vector< std::streampos > &locs)
ROMol * next() override
void setData(const std::string &text, bool sanitize=true, bool removeHs=true)
bool atEnd() override
ROMol * operator[](unsigned int idx)
unsigned int length()
void reset() override
void setData(const std::string &text, bool sanitize, bool removeHs, bool strictParsing)
SDMolSupplier(const std::string &fileName, bool sanitize=true, bool removeHs=true, bool strictParsing=true)
std::string getItemText(unsigned int idx)
returns the text block for a particular item
SDMolSupplier(std::istream *inStream, bool takeOwnership=true, bool sanitize=true, bool removeHs=true, bool strictParsing=true)
void moveTo(unsigned int idx)
void init() override
~SDMolSupplier() override
lazy file parser for Smiles tables
void moveTo(unsigned int idx)
ROMol * next() override
SmilesMolSupplier(const std::string &fileName, const std::string &delimiter=" \t", int smilesColumn=0, int nameColumn=1, bool titleLine=true, bool sanitize=true)
void init() override
SmilesMolSupplier(std::istream *inStream, bool takeOwnership=true, const std::string &delimiter=" \t", int smilesColumn=0, int nameColumn=1, bool titleLine=true, bool sanitize=true)
void reset() override
ROMol * operator[](unsigned int idx)
bool atEnd() override
std::string getItemText(unsigned int idx)
returns the text block for a particular item
void setData(const std::string &text, const std::string &delimiter=" ", int smilesColumn=0, int nameColumn=1, bool titleLine=true, bool sanitize=true)
lazy file parser for TDT files
void moveTo(unsigned int idx)
std::string getItemText(unsigned int idx)
returns the text block for a particular item
ROMol * operator[](unsigned int idx)
bool atEnd() override
~TDTMolSupplier() override
void init() override
TDTMolSupplier(const std::string &fileName, const std::string &nameRecord="", int confId2D=-1, int confId3D=0, bool sanitize=true)
TDTMolSupplier(std::istream *inStream, bool takeOwnership=true, const std::string &nameRecord="", int confId2D=-1, int confId3D=0, bool sanitize=true)
ROMol * next() override
void reset() override
void setData(const std::string &text, const std::string &nameRecord="", int confId2D=-1, int confId3D=0, bool sanitize=true)
unsigned int length()
#define RDKIT_FILEPARSERS_EXPORT
Definition export.h:161
Std stuff.
RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig)
std::vector< std::string > STR_VECT
Definition Dict.h:29
bool rdvalue_is(const RDValue_cast_t)