RDKit
Open-source cheminformatics and machine learning.
FingerprintGenerator.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2018 Boran Adas, Google Summer of Code
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 
11 #include <RDGeneral/export.h>
12 #ifndef RD_FINGERPRINTGEN_H_2018_05
13 #define RD_FINGERPRINTGEN_H_2018_05
14 
18 #include <cstdint>
19 
20 namespace RDKit {
21 class ROMol;
22 
24  // will review this structure once more fingerprint types are implemented
25 
26  std::vector<std::vector<std::uint64_t>> *atomToBits;
27 
28  std::map<std::uint32_t, std::vector<std::pair<std::uint32_t, std::uint32_t>>>
30  // morgan fp
31  // maps bitId -> vector of (atomId, radius)
32 
33  std::pair<std::vector<std::vector<std::uint32_t>>,
34  std::map<std::uint32_t, std::vector<std::vector<int>>>> *bitInfo;
35  // rdkit fp
36  // first part, vector of bits set for each atom, must have the same size as
37  // atom count for molecule
38  // second part, maps bitId -> vector of paths
39 
40  std::vector<unsigned int> *atomCounts;
41  // number of paths that set bits for each atom, must have the same size as
42  // atom count for molecule
43 };
44 
45 /*!
46  \brief Abstract base class that holds the molecule-independent arguments that are
47  common to all fingerprint types; classes derived from it hold the
48  fingerprint-type-specific arguments
49 
50  */
51 template <typename OutputType>
53  : private boost::noncopyable {
54  public:
55  FingerprintArguments(const bool countSimulation,
56  const std::vector<std::uint32_t> countBounds,
57  const std::uint32_t fpSize);
58  const bool d_countSimulation;
59  const std::vector<std::uint32_t> d_countBounds;
60  const std::uint32_t d_fpSize;
61 
62  /*!
63  \brief Returns the size of the fingerprint based on arguments
64 
65  \return OutputType size of the fingerprint
66  */
67  virtual OutputType getResultSize() const = 0;
68 
69  /**
70  \brief method that returns information string about the fingerprint specific
71  argument set and the arguments themselves
72 
73  \return std::string information string
74  */
75  virtual std::string infoString() const = 0;
76 
77  /**
78  \brief method that returns information string about common fingerprinting
79  arguments' values
80 
81  \return std::string information string
82  */
83  std::string commonArgumentsString() const;
84 
85  virtual ~FingerprintArguments(){};
86 };
87 
88 /*!
89  \brief Abstract base class for a single atom-environment; environments are
90  hashed to generate the fingerprint
91  Interface only: getBitId() is pure virtual, and the private boost::noncopyable base disables copying.
92  */
93 template <typename OutputType>
94 class RDKIT_FINGERPRINTS_EXPORT AtomEnvironment : private boost::noncopyable {
95  public:
96  /*!
97  \brief Calculates and returns the bit id to be set for this atom-environment
98 
99  \param arguments fingerprint-type-specific, molecule-independent
100  arguments
101  \param atomInvariants atom invariants to be used during hashing
102  \param bondInvariants bond invariants to be used during hashing
103  \param hashResults if set, results will be ready to be modded (defaults to false; presumably "modded" means taken modulo the fingerprint size - confirm)
104 
105  \return OutputType calculated bit id for this environment
106  */
107  virtual OutputType getBitId(FingerprintArguments<OutputType> *arguments,
108  const std::vector<std::uint32_t> *atomInvariants,
109  const std::vector<std::uint32_t> *bondInvariants,
111  const bool hashResults = false) const = 0;
112  //! virtual destructor, so that derived environments can be destroyed through a base-class pointer
113  virtual ~AtomEnvironment(){};
114 };
115 
116 /*!
117  \brief abstract base class that generates atom-environments from a molecule
118 
119  */
120 template <typename OutputType>
122  : private boost::noncopyable {
123  public:
124  /*!
125  \brief generate and return all atom-environments from a molecule
126 
127  \param mol molecule to generate the atom-environments from
128  \param arguments fingerprint type specific molecule independent
129  arguments
130  \param fromAtoms atoms to be used during environment generation,
131  usage of this parameter depends on the implementation of different
132  fingerprint types
133  \param ignoreAtoms atoms to be ignored during environment generation,
134  usage of this parameter depends on the implementation of different
135  fingerprint types
136  \param confId which conformation to use during environment
137  generation, needed for some fingerprint types
138  \param additionalOutput contains pointers for additional outputs of
139  fingerprinting operation, usage depends on implementation of the fingerprint
140  type
141  \param atomInvariants atom invariants to be used during environment
142  generation, in some cases some of the hashing can be done during environment
143  generation so it is also passed here
144  \param bondInvariants bond invariants to be used during environment
145  generation, same as atomInvariants it might be needed
146  \param hashResults if set results will be ready to be modded
147 
148  \return std::vector<AtomEnvironment *> atom-environments generated from
149  this molecule
150  */
151  virtual std::vector<AtomEnvironment<OutputType> *> getEnvironments(
152  const ROMol &mol, FingerprintArguments<OutputType> *arguments,
153  const std::vector<std::uint32_t> *fromAtoms = nullptr,
154  const std::vector<std::uint32_t> *ignoreAtoms = nullptr,
155  const int confId = -1, const AdditionalOutput *additionalOutput = nullptr,
156  const std::vector<std::uint32_t> *atomInvariants = nullptr,
157  const std::vector<std::uint32_t> *bondInvariants = nullptr,
158  const bool hashResults = false) const = 0;
159 
160  /**
161  \brief method that returns information about this \c AtomEnvironmentGenerator
162  and its arguments if any
163 
164  \return std::string information string
165  */
166  virtual std::string infoString() const = 0;
167 
169 };
170 
171 /*!
172  \brief abstract base class for atom invariants generators
173 
174  */
176  : private boost::noncopyable {
177  public:
178  /*!
179  \brief get atom invariants from a molecule
180 
181  \param mol molecule to generate the atom invariants for
182 
183  \return pointer to a std::vector<std::uint32_t> holding the atom invariants
184  generated for the given molecule
185  */
186  virtual std::vector<std::uint32_t> *getAtomInvariants(
187  const ROMol &mol) const = 0;
188 
189  /**
190  \brief method that returns information about this \c AtomInvariantsGenerator
191  and its arguments
192 
193  \return std::string information string
194  */
195  virtual std::string infoString() const = 0;
196 
198  virtual AtomInvariantsGenerator *clone() const = 0;
199 };
200 
201 /*!
202  \brief abstract base class for bond invariants generators
203 
204  */
206  : private boost::noncopyable {
207  public:
208  /*!
209  \brief get bond invariants from a molecule
210 
211  \param mol molecule to generate the bond invariants for
212 
213  \return pointer to a std::vector<std::uint32_t> holding the bond invariants
214  generated for the given molecule
215  */
216  virtual std::vector<std::uint32_t> *getBondInvariants(
217  const ROMol &mol) const = 0;
218 
219  /**
220  \brief method that returns information about this \c BondInvariantsGenerator
221  and its arguments
222 
223  \return std::string information string
224  */
225  virtual std::string infoString() const = 0;
226 
228  virtual BondInvariantsGenerator *clone() const = 0;
229 }; // class BondInvariantsGenerator
230 
231 /*!
232  \brief class that generates same fingerprint style for different output
233  formats
234 
235  */
236 template <typename OutputType>
238  : private boost::noncopyable {
239  FingerprintArguments<OutputType> *dp_fingerprintArguments;
240  AtomEnvironmentGenerator<OutputType> *dp_atomEnvironmentGenerator;
241  AtomInvariantsGenerator *dp_atomInvariantsGenerator;
242  BondInvariantsGenerator *dp_bondInvariantsGenerator;
243  const bool df_ownsAtomInvGenerator;
244  const bool df_ownsBondInvGenerator;
245 
246  SparseIntVect<OutputType> *getFingerprintHelper(
247  const ROMol &mol, const std::vector<std::uint32_t> *fromAtoms = nullptr,
248  const std::vector<std::uint32_t> *ignoreAtoms = nullptr,
249  const int confId = -1, const AdditionalOutput *additionalOutput = nullptr,
250  const std::vector<std::uint32_t> *customAtomInvariants = nullptr,
251  const std::vector<std::uint32_t> *customBondInvariants = nullptr,
252  const std::uint64_t fpSize = 0) const;
253 
254  public:
256  AtomEnvironmentGenerator<OutputType> *atomEnvironmentGenerator,
257  FingerprintArguments<OutputType> *fingerprintArguments,
258  AtomInvariantsGenerator *atomInvariantsGenerator = nullptr,
259  BondInvariantsGenerator *bondInvariantsGenerator = nullptr,
260  bool ownsAtomInvGenerator = false, bool ownsBondInvGenerator = false);
261 
263 
264  SparseIntVect<OutputType> *getSparseCountFingerprint(
265  const ROMol &mol, const std::vector<std::uint32_t> *fromAtoms = nullptr,
266  const std::vector<std::uint32_t> *ignoreAtoms = nullptr,
267  const int confId = -1, const AdditionalOutput *additionalOutput = nullptr,
268  const std::vector<std::uint32_t> *customAtomInvariants = nullptr,
269  const std::vector<std::uint32_t> *customBondInvariants = nullptr) const;
270 
271  SparseBitVect *getSparseFingerprint(
272  const ROMol &mol, const std::vector<std::uint32_t> *fromAtoms = nullptr,
273  const std::vector<std::uint32_t> *ignoreAtoms = nullptr,
274  const int confId = -1, const AdditionalOutput *additionalOutput = nullptr,
275  const std::vector<std::uint32_t> *customAtomInvariants = nullptr,
276  const std::vector<std::uint32_t> *customBondInvariants = nullptr) const;
277 
278  SparseIntVect<std::uint32_t> *getCountFingerprint(
279  const ROMol &mol, const std::vector<std::uint32_t> *fromAtoms = nullptr,
280  const std::vector<std::uint32_t> *ignoreAtoms = nullptr,
281  const int confId = -1, const AdditionalOutput *additionalOutput = nullptr,
282  const std::vector<std::uint32_t> *customAtomInvariants = nullptr,
283  const std::vector<std::uint32_t> *customBondInvariants = nullptr) const;
284 
286  const ROMol &mol, const std::vector<std::uint32_t> *fromAtoms = nullptr,
287  const std::vector<std::uint32_t> *ignoreAtoms = nullptr,
288  const int confId = -1, const AdditionalOutput *additionalOutput = nullptr,
289  const std::vector<std::uint32_t> *customAtomInvariants = nullptr,
290  const std::vector<std::uint32_t> *customBondInvariants = nullptr) const;
291 
292  std::string infoString() const;
293 };
294 
296 
297 //! used to indicate errors for unimplemented fp types in convenience functions
299  : public std::exception {
300  public:
301  //! construct with an error message
302  UnimplementedFPException(const char *msg) : _msg(msg){};
303  //! construct with an error message
304  UnimplementedFPException(const std::string &msg) : _msg(msg){};
305  //! get the error message
306  const char *message() const { return _msg.c_str(); };
308 
309  private:
310  std::string _msg;
311 };
312 
313 // convenience functions, fingerprint generation with default values
314 
316  const ROMol &mol, FPType fPType);
317 
319  FPType fPType);
320 
322  const ROMol &mol, FPType fPType);
323 
325  FPType fPType);
326 
// Bulk variants of the convenience functions. Each one takes a vector of
// pointers to const molecules plus the fingerprint type, and returns a pointer
// to a vector of fingerprint pointers whose element type is given by the
// declared return type (SparseIntVect<std::uint64_t>, SparseBitVect,
// SparseIntVect<std::uint32_t> or ExplicitBitVect).
// NOTE(review): presumably there is one result per input molecule, in input
// order, and ownership of the returned vector and of its elements passes to
// the caller - confirm against the implementation.
// NOTE(review): molVector is passed by value, so the vector of pointers is
// copied on every call; switching to a const reference would also require
// changing the definitions, which are not visible here.
327 RDKIT_FINGERPRINTS_EXPORT std::vector<SparseIntVect<std::uint64_t> *> *
328 getSparseCountFPBulk(const std::vector<const ROMol *> molVector, FPType fPType);
329 
330 RDKIT_FINGERPRINTS_EXPORT std::vector<SparseBitVect *> *getSparseFPBulk(
331  const std::vector<const ROMol *> molVector, FPType fPType);
332 
333 RDKIT_FINGERPRINTS_EXPORT std::vector<SparseIntVect<std::uint32_t> *>
334  *getCountFPBulk(const std::vector<const ROMol *> molVector, FPType fPType);
335 
336 RDKIT_FINGERPRINTS_EXPORT std::vector<ExplicitBitVect *> *getFPBulk(
337  const std::vector<const ROMol *> molVector, FPType fPType);
338 
339 } // namespace RDKit
340 
341 #endif
abstract base class that holds atom-environments that will be hashed to generate the fingerprint ...
std::pair< std::vector< std::vector< std::uint32_t > >, std::map< std::uint32_t, std::vector< std::vector< int > > > > * bitInfo
RDKIT_FINGERPRINTS_EXPORT std::vector< ExplicitBitVect * > * getFPBulk(const std::vector< const ROMol *> molVector, FPType fPType)
RDKIT_FINGERPRINTS_EXPORT std::vector< SparseIntVect< std::uint64_t > * > * getSparseCountFPBulk(const std::vector< const ROMol *> molVector, FPType fPType)
used to indicate errors for unimplemented fp types in convenience functions
RDKIT_FINGERPRINTS_EXPORT ExplicitBitVect * getFP(const ROMol &mol, FPType fPType)
abstract base class for atom invariants generators
a class for bit vectors that are sparsely occupied.
Definition: SparseBitVect.h:34
const char * message() const
get the error message
abstract base class for bond invariants generators
UnimplementedFPException(const std::string &msg)
construct with an error message
RDKIT_FINGERPRINTS_EXPORT std::vector< SparseBitVect * > * getSparseFPBulk(const std::vector< const ROMol *> molVector, FPType fPType)
RDKIT_FINGERPRINTS_EXPORT SparseBitVect * getSparseFP(const ROMol &mol, FPType fPType)
abstract base class that generates atom-environments from a molecule
RDKIT_FINGERPRINTS_EXPORT SparseIntVect< boost::uint32_t > * getFingerprint(const ROMol &mol, unsigned int radius, std::vector< boost::uint32_t > *invariants=0, const std::vector< boost::uint32_t > *fromAtoms=0, bool useChirality=false, bool useBondTypes=true, bool useCounts=true, bool onlyNonzeroInvariants=false, BitInfoMap *atomsSettingBits=0)
returns the Morgan fingerprint for a molecule
const std::vector< std::uint32_t > d_countBounds
UnimplementedFPException(const char *msg)
construct with an error message
std::vector< std::vector< std::uint64_t > > * atomToBits
RDKIT_FINGERPRINTS_EXPORT SparseIntVect< std::uint64_t > * getSparseCountFP(const ROMol &mol, FPType fPType)
Std stuff.
Definition: Atom.h:30
std::map< std::uint32_t, std::vector< std::pair< std::uint32_t, std::uint32_t > > > * bitInfoMap
#define RDKIT_FINGERPRINTS_EXPORT
Definition: export.h:229
a class for efficiently storing sparse vectors of ints
Definition: SparseIntVect.h:28
RDKIT_FINGERPRINTS_EXPORT std::vector< SparseIntVect< std::uint32_t > * > * getCountFPBulk(const std::vector< const ROMol *> molVector, FPType fPType)
a class for bit vectors that are densely occupied
std::vector< unsigned int > * atomCounts
RDKIT_FINGERPRINTS_EXPORT SparseIntVect< std::uint32_t > * getCountFP(const ROMol &mol, FPType fPType)
class that generates same fingerprint style for different output formats
Abstract base class that holds molecule independent arguments that are common amongst all fingerprint...