RDKit
Open-source cheminformatics and machine learning.
MorganFingerprints.h
Go to the documentation of this file.
1 //
2 //
3 // Copyright (c) 2009-2010, Novartis Institutes for BioMedical Research Inc.
4 // All rights reserved.
5 //
6 // Redistribution and use in source and binary forms, with or without
7 // modification, are permitted provided that the following conditions are
8 // met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above
13 // copyright notice, this list of conditions and the following
14 // disclaimer in the documentation and/or other materials provided
15 // with the distribution.
16 // * Neither the name of Novartis Institutes for BioMedical Research Inc.
17 // nor the names of its contributors may be used to endorse or promote
18 // products derived from this software without specific prior written
19 // permission.
20 //
21 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 //
33 // Created by Greg Landrum, July 2008
34 //
35 //
36 
37 /*! \file MorganFingerprints.h
38 
39 */
40 #include <RDGeneral/export.h>
41 #ifndef __RD_MORGANFPS_H__
42 #define __RD_MORGANFPS_H__
43 
44 #include <vector>
45 #include <map>
48 #include <boost/cstdint.hpp>
50 
51 namespace RDKit {
52 class ROMol;
53 namespace MorganFingerprints {
54 typedef std::map<boost::uint32_t,
55  std::vector<std::pair<boost::uint32_t, boost::uint32_t>>>
57 
58 const std::string morganFingerprintVersion = "1.0.0";
59 
60 //! returns the Morgan fingerprint for a molecule
61 /*!
62  These fingerprints are similar to the well-known ECFP or
63  FCFP fingerprints, depending on which invariants are used.
64 
65  The algorithm used is described in the paper
66  Rogers, D. & Hahn, M. Extended-Connectivity Fingerprints. JCIM 50:742-54
67  (2010)
68  http://dx.doi.org/10.1021/ci100050t
69 
70  The original implementation was done using this paper:
71  D. Rogers, R.D. Brown, M. Hahn J. Biomol. Screen. 10:682-6 (2005)
72  and an unpublished technical report:
73  http://www.ics.uci.edu/~welling/teaching/ICS274Bspring06/David%20Rogers%20-%20ECFP%20Manuscript.doc
74 
75  \param mol: the molecule to be fingerprinted
76  \param radius: the number of iterations to grow the fingerprint
77  \param invariants : optional pointer to a set of atom invariants to
78  be used. By default ECFP-type invariants are used
79  (calculated by getConnectivityInvariants())
80  \param fromAtoms : if this is provided, only the atoms in the vector will be
81  used as centers in the fingerprint
82  \param useChirality : if set, additional information will be added to the
83  fingerprint
84  when chiral atoms are discovered. This will cause
85  \verbatim C[C@H](F)Cl,
86  C[C@@H](F)Cl, and CC(F)Cl \endverbatim to generate
87  different fingerprints.
88  \param useBondTypes : if set, bond types will be included as part of the hash
89  for
90  calculating bits
91  \param useCounts : if set, counts of the features will be used
92  \param onlyNonzeroInvariants : if set, bits will only be set from atoms that
93  have a nonzero invariant.
94  \param atomsSettingBits : if nonzero, this will be used to return information
95  about the atoms that set each particular bit.
96  The keys of the map are bit ids, the values
97  are lists of (atomId, radius) pairs.
98 
99  \return a pointer to the fingerprint. The client is
100  responsible for calling delete on this.
101 
102 */
104  const ROMol &mol, unsigned int radius,
105  std::vector<boost::uint32_t> *invariants = 0,
106  const std::vector<boost::uint32_t> *fromAtoms = 0,
107  bool useChirality = false, bool useBondTypes = true, bool useCounts = true,
108  bool onlyNonzeroInvariants = false, BitInfoMap *atomsSettingBits = 0);
109 
110 //! returns the hashed Morgan fingerprint for a molecule
111 /*!
112  These fingerprints are similar to the well-known ECFP or
113  FCFP fingerprints, depending on which invariants are used.
114 
115  The algorithm used is described in the paper
116  Rogers, D. & Hahn, M. Extended-Connectivity Fingerprints. JCIM 50:742-54
117  (2010)
118  http://dx.doi.org/10.1021/ci100050t
119 
120  The original implementation was done using this paper:
121  D. Rogers, R.D. Brown, M. Hahn J. Biomol. Screen. 10:682-6 (2005)
122  and an unpublished technical report:
123  http://www.ics.uci.edu/~welling/teaching/ICS274Bspring06/David%20Rogers%20-%20ECFP%20Manuscript.doc
124 
125  \param mol: the molecule to be fingerprinted
126  \param radius: the number of iterations to grow the fingerprint
  \param nBits: the length of the final hashed fingerprint
127  \param invariants : optional pointer to a set of atom invariants to
128  be used. By default ECFP-type invariants are used
129  (calculated by getConnectivityInvariants())
130  \param fromAtoms : if this is provided, only the atoms in the vector will be
131  used as centers in the fingerprint
132  \param useChirality : if set, additional information will be added to the
133  fingerprint
134  when chiral atoms are discovered. This will cause
135  \verbatim C[C@H](F)Cl,
136  C[C@@H](F)Cl, and CC(F)Cl \endverbatim to generate
137  different fingerprints.
138  \param useBondTypes : if set, bond types will be included as part of the hash
139  for
140  calculating bits
141  \param onlyNonzeroInvariants : if set, bits will only be set from atoms that
142  have a nonzero invariant.
143  \param atomsSettingBits : if nonzero, this will be used to return information
144  about the atoms that set each particular bit.
145  The keys of the map are bit ids, the values
146  are lists of (atomId, radius) pairs.
147 
148  \return a pointer to the fingerprint. The client is
149  responsible for calling delete on this.
150 
151 */
153  const ROMol &mol, unsigned int radius, unsigned int nBits = 2048,
154  std::vector<boost::uint32_t> *invariants = 0,
155  const std::vector<boost::uint32_t> *fromAtoms = 0,
156  bool useChirality = false, bool useBondTypes = true,
157  bool onlyNonzeroInvariants = false, BitInfoMap *atomsSettingBits = 0);
158 
159 //! returns the Morgan fingerprint for a molecule as a bit vector
160 /*!
161  see documentation for getFingerprint() for theory/references
162 
163  \param mol: the molecule to be fingerprinted
164  \param radius: the number of iterations to grow the fingerprint
165  \param nBits: the number of bits in the final fingerprint
166  \param invariants : optional pointer to a set of atom invariants to
167  be used. By default ECFP-type invariants are used
168  (calculated by getConnectivityInvariants())
169  \param fromAtoms : if this is provided, only the atoms in the vector will be
170  used as centers in the fingerprint
171  \param useChirality : if set, additional information will be added to the
172  fingerprint
173  when chiral atoms are discovered. This will cause
174  \verbatim C[C@H](F)Cl,
175  C[C@@H](F)Cl, and CC(F)Cl \endverbatim to generate
176  different fingerprints.
177  \param useBondTypes : if set, bond types will be included as part of the hash
178  for
179  calculating bits
180  \param onlyNonzeroInvariants : if set, bits will only be set from atoms that
181  have a nonzero invariant.
182  \param atomsSettingBits : if nonzero, this will be used to return information
183  about the atoms that set each particular bit.
185  The keys of the map are bit ids, the values
185  are lists of (atomId, radius) pairs.
186 
187  \return a pointer to the fingerprint. The client is
188  responsible for calling delete on this.
189 
190 */
192  const ROMol &mol, unsigned int radius, unsigned int nBits,
193  std::vector<boost::uint32_t> *invariants = 0,
194  const std::vector<boost::uint32_t> *fromAtoms = 0,
195  bool useChirality = false, bool useBondTypes = true,
196  bool onlyNonzeroInvariants = false, BitInfoMap *atomsSettingBits = 0);
197 
198 } // end of namespace MorganFingerprints
199 } // namespace RDKit
200 
201 #endif
const std::string morganFingerprintVersion
std::map< boost::uint32_t, std::vector< std::pair< boost::uint32_t, boost::uint32_t > > > BitInfoMap
RDKIT_FINGERPRINTS_EXPORT SparseIntVect< boost::uint32_t > * getFingerprint(const ROMol &mol, unsigned int radius, std::vector< boost::uint32_t > *invariants=0, const std::vector< boost::uint32_t > *fromAtoms=0, bool useChirality=false, bool useBondTypes=true, bool useCounts=true, bool onlyNonzeroInvariants=false, BitInfoMap *atomsSettingBits=0)
returns the Morgan fingerprint for a molecule
Std stuff.
Definition: Atom.h:30
#define RDKIT_FINGERPRINTS_EXPORT
Definition: export.h:229
RDKIT_FINGERPRINTS_EXPORT ExplicitBitVect * getFingerprintAsBitVect(const ROMol &mol, unsigned int radius, unsigned int nBits, std::vector< boost::uint32_t > *invariants=0, const std::vector< boost::uint32_t > *fromAtoms=0, bool useChirality=false, bool useBondTypes=true, bool onlyNonzeroInvariants=false, BitInfoMap *atomsSettingBits=0)
returns the Morgan fingerprint for a molecule as a bit vector
RDKIT_FINGERPRINTS_EXPORT SparseIntVect< boost::uint32_t > * getHashedFingerprint(const ROMol &mol, unsigned int radius, unsigned int nBits=2048, std::vector< boost::uint32_t > *invariants=0, const std::vector< boost::uint32_t > *fromAtoms=0, bool useChirality=false, bool useBondTypes=true, bool onlyNonzeroInvariants=false, BitInfoMap *atomsSettingBits=0)
returns the Morgan fingerprint for a molecule
a class for efficiently storing sparse vectors of ints
Definition: SparseIntVect.h:28
a class for bit vectors that are densely occupied