RDKit
Open-source cheminformatics and machine learning.
MorganFingerprints.h
Go to the documentation of this file.
1 //
2 //
3 // Copyright (c) 2009-2010, Novartis Institutes for BioMedical Research Inc.
4 // All rights reserved.
5 //
6 // Redistribution and use in source and binary forms, with or without
7 // modification, are permitted provided that the following conditions are
8 // met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above
13 // copyright notice, this list of conditions and the following
14 // disclaimer in the documentation and/or other materials provided
15 // with the distribution.
16 // * Neither the name of Novartis Institutes for BioMedical Research Inc.
17 // nor the names of its contributors may be used to endorse or promote
18 // products derived from this software without specific prior written permission.
19 //
20 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 //
32 // Created by Greg Landrum, July 2008
33 //
34 //
35 
36 /*! \file MorganFingerprints.h
37 
38 */
39 #ifndef __RD_MORGANFPS_H__
40 #define __RD_MORGANFPS_H__
41 
42 #include <vector>
43 #include <map>
46 #include <boost/cstdint.hpp>
47 
48 namespace RDKit {
49  class ROMol;
50  namespace MorganFingerprints {
51  extern std::vector<std::string> defaultFeatureSmarts; //!< declared here, defined elsewhere. NOTE(review): presumably the SMARTS behind the default feature definitions used by getFeatureInvariants() - confirm against the implementation
52 
53  typedef std::map<boost::uint32_t,std::vector<std::pair<boost::uint32_t,boost::uint32_t> > > BitInfoMap; //!< maps a bit id to the list of (atomId, radius) pairs that set that bit
54 
55  const std::string morganFingerprintVersion="1.0.0"; //!< version identifier for the Morgan fingerprints declared in this header
56 
57  //! returns the Morgan fingerprint for a molecule
58  /*!
59  These fingerprints are similar to the well-known ECFP or
60  FCFP fingerprints, depending on which invariants are used.
61 
62  The algorithm used is described in the paper
63  Rogers, D. & Hahn, M. Extended-Connectivity Fingerprints. JCIM 50:742-54 (2010)
64  http://dx.doi.org/10.1021/ci100050t
65 
66  The original implementation was done using this paper:
67  D. Rogers, R.D. Brown, M. Hahn J. Biomol. Screen. 10:682-6 (2005)
68  and an unpublished technical report:
69  http://www.ics.uci.edu/~welling/teaching/ICS274Bspring06/David%20Rogers%20-%20ECFP%20Manuscript.doc
70 
71  \param mol : the molecule to be fingerprinted
72  \param radius : the number of iterations to grow the fingerprint
73  \param invariants : optional pointer to a set of atom invariants to
74  be used. If this is null (the default), ECFP-type invariants are used
75  (calculated by getConnectivityInvariants())
76  \param fromAtoms : if this is provided, only the atoms in the vector will be
77  used as centers in the fingerprint
78  \param useChirality : if set (off by default), additional information will be added to the
79  fingerprint when chiral atoms are discovered. This will cause \verbatim C[C@H](F)Cl,
80  C[C@@H](F)Cl, and CC(F)Cl \endverbatim to generate different fingerprints.
81  \param useBondTypes : if set (the default), bond types will be included as part of the
82  hash for calculating bits
83  \param useCounts : if set (the default), counts of the features will be used
84  \param onlyNonzeroInvariants : if set (off by default), bits will only be set from atoms
85  that have a nonzero invariant.
86  \param atomsSettingBits : if non-null, this will be used to return information
87  about the atoms that set each particular bit.
88  The keys of the map are bit ids, the values
89  are lists of (atomId, radius) pairs.
90 
91  \return a pointer to the fingerprint. The client is
92  responsible for calling delete on this.
93 
94  */
96  getFingerprint(const ROMol &mol,
97  unsigned int radius,
98  std::vector<boost::uint32_t> *invariants=0,
99  const std::vector<boost::uint32_t> *fromAtoms=0,
100  bool useChirality=false,
101  bool useBondTypes=true,
102  bool useCounts=true,
103  bool onlyNonzeroInvariants=false,
104  BitInfoMap *atomsSettingBits=0);
105 
106  //! returns the hashed Morgan fingerprint for a molecule
107  /*!
108  These fingerprints are similar to the well-known ECFP or
109  FCFP fingerprints, depending on which invariants are used.
110 
111  The algorithm used is described in the paper
112  Rogers, D. & Hahn, M. Extended-Connectivity Fingerprints. JCIM 50:742-54 (2010)
113  http://dx.doi.org/10.1021/ci100050t
114 
115  The original implementation was done using this paper:
116  D. Rogers, R.D. Brown, M. Hahn J. Biomol. Screen. 10:682-6 (2005)
117  and an unpublished technical report:
118  http://www.ics.uci.edu/~welling/teaching/ICS274Bspring06/David%20Rogers%20-%20ECFP%20Manuscript.doc
119 
120  \param mol : the molecule to be fingerprinted
121  \param radius : the number of iterations to grow the fingerprint
     \param nBits : the number of bits in the final (hashed) fingerprint;
     defaults to 2048
122  \param invariants : optional pointer to a set of atom invariants to
123  be used. If this is null (the default), ECFP-type invariants are used
124  (calculated by getConnectivityInvariants())
125  \param fromAtoms : if this is provided, only the atoms in the vector will be
126  used as centers in the fingerprint
127  \param useChirality : if set (off by default), additional information will be added to the
128  fingerprint when chiral atoms are discovered. This will cause \verbatim C[C@H](F)Cl,
129  C[C@@H](F)Cl, and CC(F)Cl \endverbatim to generate different fingerprints.
130  \param useBondTypes : if set (the default), bond types will be included as part of the
131  hash for calculating bits
132  \param onlyNonzeroInvariants : if set (off by default), bits will only be set from atoms
133  that have a nonzero invariant.
134  \param atomsSettingBits : if non-null, this will be used to return information
135  about the atoms that set each particular bit.
136  The keys of the map are bit ids, the values
137  are lists of (atomId, radius) pairs.
138 
139  \return a pointer to the fingerprint. The client is
140  responsible for calling delete on this.
141 
142  */
144  getHashedFingerprint(const ROMol &mol,
145  unsigned int radius,
146  unsigned int nBits=2048,
147  std::vector<boost::uint32_t> *invariants=0,
148  const std::vector<boost::uint32_t> *fromAtoms=0,
149  bool useChirality=false,
150  bool useBondTypes=true,
151  bool onlyNonzeroInvariants=false,
152  BitInfoMap *atomsSettingBits=0);
153 
154 
155  //! returns the Morgan fingerprint for a molecule as a bit vector
156  /*!
157  see documentation for getFingerprint() for theory/references
158 
159  \param mol : the molecule to be fingerprinted
160  \param radius : the number of iterations to grow the fingerprint
161  \param nBits : the number of bits in the final fingerprint (required; this
     overload declares no default value for it)
162  \param invariants : optional pointer to a set of atom invariants to
163  be used. If this is null (the default), ECFP-type invariants are used
164  (calculated by getConnectivityInvariants())
165  \param fromAtoms : if this is provided, only the atoms in the vector will be
166  used as centers in the fingerprint
167  \param useChirality : if set (off by default), additional information will be added to the
168  fingerprint when chiral atoms are discovered. This will cause \verbatim C[C@H](F)Cl,
169  C[C@@H](F)Cl, and CC(F)Cl \endverbatim to generate different fingerprints.
170  \param useBondTypes : if set (the default), bond types will be included as part of the
171  hash for calculating bits
172  \param onlyNonzeroInvariants : if set (off by default), bits will only be set from atoms
173  that have a nonzero invariant.
174  \param atomsSettingBits : if non-null, this will be used to return information
175  about the atoms that set each particular bit.
176  The keys of the map are bit ids, the values
177  are lists of (atomId, radius) pairs.
178 
179  \return a pointer to the fingerprint. The client is
180  responsible for calling delete on this.
181 
182  */
184  getFingerprintAsBitVect(const ROMol &mol,
185  unsigned int radius,
186  unsigned int nBits,
187  std::vector<boost::uint32_t> *invariants=0,
188  const std::vector<boost::uint32_t> *fromAtoms=0,
189  bool useChirality=false,
190  bool useBondTypes=true,
191  bool onlyNonzeroInvariants=false,
192  BitInfoMap *atomsSettingBits=0);
193 
194  //! returns the connectivity invariants for a molecule
195  /*!
196  These are the ECFP-type invariants that the fingerprint functions in this
     header use by default when no invariants are supplied.
197  \param mol : the molecule to be considered
198  \param invars : output argument; used to return the results
199  \param includeRingMembership : if set (the default), whether or not the atom is in
200  a ring will be used in the invariant list.
201  */
202  void getConnectivityInvariants(const ROMol &mol,
203  std::vector<boost::uint32_t> &invars,
204  bool includeRingMembership=true);
205  const std::string morganConnectivityInvariantVersion="1.0.0"; //!< version identifier for the connectivity invariants
206 
207  //! returns the feature invariants for a molecule
208  /*!
209 
210  \param mol : the molecule to be considered
211  \param invars : output argument; used to return the results
212  \param patterns : if provided, this should contain the queries used to assign atom-types.
213  If not provided (null, the default), feature definitions adapted from reference:
214  Gobbi and Poppinger, Biotech. Bioeng. _61_ 47-54 (1998)
215  will be used for Donor, Acceptor, Aromatic, Halogen, Basic, Acidic
216 
217  */
218  void getFeatureInvariants(const ROMol &mol,
219  std::vector<boost::uint32_t> &invars,
220  std::vector<const ROMol *> *patterns=0);
221  const std::string morganFeatureInvariantVersion="0.1.0"; //!< version identifier for the feature invariants
222 
223  } // end of namespace MorganFingerprints
224 }
225 
226 #endif
ExplicitBitVect * getFingerprintAsBitVect(const ROMol &mol, unsigned int radius, unsigned int nBits, std::vector< boost::uint32_t > *invariants=0, const std::vector< boost::uint32_t > *fromAtoms=0, bool useChirality=false, bool useBondTypes=true, bool onlyNonzeroInvariants=false, BitInfoMap *atomsSettingBits=0)
returns the Morgan fingerprint for a molecule as a bit vector
const std::string morganFingerprintVersion
ROMol is a molecule class that is intended to have a fixed topology.
Definition: ROMol.h:105
const std::string morganFeatureInvariantVersion
SparseIntVect< boost::uint32_t > * getHashedFingerprint(const ROMol &mol, unsigned int radius, unsigned int nBits=2048, std::vector< boost::uint32_t > *invariants=0, const std::vector< boost::uint32_t > *fromAtoms=0, bool useChirality=false, bool useBondTypes=true, bool onlyNonzeroInvariants=false, BitInfoMap *atomsSettingBits=0)
returns the Morgan fingerprint for a molecule
std::vector< std::string > defaultFeatureSmarts
void getFeatureInvariants(const ROMol &mol, std::vector< boost::uint32_t > &invars, std::vector< const ROMol * > *patterns=0)
returns the feature invariants for a molecule
const std::string morganConnectivityInvariantVersion
std::map< boost::uint32_t, std::vector< std::pair< boost::uint32_t, boost::uint32_t > > > BitInfoMap
Includes a bunch of functionality for handling Atom and Bond queries.
Definition: Atom.h:28
SparseIntVect< boost::uint32_t > * getFingerprint(const ROMol &mol, unsigned int radius, std::vector< boost::uint32_t > *invariants=0, const std::vector< boost::uint32_t > *fromAtoms=0, bool useChirality=false, bool useBondTypes=true, bool useCounts=true, bool onlyNonzeroInvariants=false, BitInfoMap *atomsSettingBits=0)
returns the Morgan fingerprint for a molecule
void getConnectivityInvariants(const ROMol &mol, std::vector< boost::uint32_t > &invars, bool includeRingMembership=true)
returns the connectivity invariants for a molecule
a class for efficiently storing sparse vectors of ints
Definition: SparseIntVect.h:27
a class for bit vectors that are densely occupied