RDKit
Open-source cheminformatics and machine learning.
AtomPairs.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2007-2013 Greg Landrum
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 
11 /*! \file AtomPairs.h
12 
13 
14  A few quick notes about fingerprint size and the way chirality is handled in these functions.
15 
16  By default the atom-pair and topological-torsion fingerprints do not include any information about
17  chirality; the atom invariants only include information about the atomic number,
18  number of pi electrons, and degree.
19  When chirality is included, two additional bits are added to the atom invariants to flag R/S/no
20  chirality. These additional bits change the size of the atom invariants and either the size
21  of the final fingerprint (atom pairs) or the maximum allowed path length (torsions). This means
22  that even fingerprints for achiral molecules are different when includeChirality is true.
23 
24 */
25 #ifndef __RD_ATOMPAIRS_H__
26 #define __RD_ATOMPAIRS_H__
27 
29 #include <DataStructs/BitVects.h>
30 #include <boost/cstdint.hpp>
31 namespace RDKit {
32  class Atom;
33 
34  namespace AtomPairs {
35  const std::string atomPairsVersion="1.1.0"; // version string published alongside the atom-pair constants below
36  const unsigned int numTypeBits=4; // width, in bits, of the atom-type field of an atom code
37  const unsigned int atomNumberTypes[1<<numTypeBits]={5,6,7,8,9,14,15,16,17,33,34,35,51,52,43}; // 16 slots but only 15 initializers: the last slot is zero-initialized; values are presumably atomic numbers - confirm against getAtomCode
38  const unsigned int numPiBits=2; // width, in bits, of the pi field (presumably the pi-electron count described in the file notes)
39  const unsigned int maxNumPi=(1<<numPiBits)-1; // = 3, the largest value that fits in numPiBits
40  const unsigned int numBranchBits=3; // width, in bits, of the branch field (presumably the atom degree described in the file notes)
41  const unsigned int maxNumBranches=(1<<numBranchBits)-1; // = 7, the largest value that fits in numBranchBits
42  const unsigned int numChiralBits=2; // extra bits per atom code that flag R/S/no chirality when chirality is included
43  const unsigned int codeSize=numTypeBits+numPiBits+numBranchBits; // = 9 bits; numChiralBits is not part of this sum
44  const unsigned int numPathBits=5; // width, in bits, of the path-length (distance) field
45  const unsigned int maxPathLen=(1<<numPathBits)-1; // = 31, the largest value that fits in numPathBits
46  const unsigned int numAtomPairFingerprintBits=numPathBits+2*codeSize; // = 23 (one path field plus two atom codes); note that this is only accurate if chirality is not included
47 
48  //! returns a numeric code for the atom (the atom's hash in the
49  //! atom-pair scheme)
50  /*!
51  \param atom the atom to be considered
52  \param branchSubtract (optional, defaults to 0) a constant to subtract from
53  the number of neighbors when the hash
54  is calculated (used in the topological
55  torsions code)
56  \param includeChirality toggles the inclusion of bits indicating R/S chirality (off by default)
57  */
58  boost::uint32_t getAtomCode(const Atom *atom,unsigned int branchSubtract=0,bool includeChirality=false);
59 
60  //! returns an atom pair hash based on two atom hashes and the
61  //! distance between the atoms.
62  /*!
63  \param codeI the hash for the first atom
64  \param codeJ the hash for the second atom
65  \param dist the distance (number of bonds) between the two
66  atoms
67  \param includeChirality toggles the inclusion of bits indicating R/S chirality (off by default)
68  */
69  boost::uint32_t getAtomPairCode(boost::uint32_t codeI,boost::uint32_t codeJ,
70  unsigned int dist,bool includeChirality=false);
71 
72  //! returns the atom-pair fingerprint for a molecule
73  /*!
74  The algorithm used is described here:
75  R.E. Carhart, D.H. Smith, R. Venkataraghavan; "Atom Pairs as
76  Molecular Features in Structure-Activity Studies: Definition
77  and Applications" JCICS 25, 64-73 (1985).
78 
79 
80  \param mol: the molecule to be fingerprinted
81  \param minLength: minimum distance between atoms to be
82  considered in a pair. Default is 1 bond.
83  \param maxLength: maximum distance between atoms to be
84  considered in a pair.
85  Default is maxPathLen-1 bonds.
86  \param fromAtoms: if provided, only atom pairs that involve
87  the specified atoms will be included in the
88  fingerprint
89  \param ignoreAtoms: if provided, any atom pairs that include
90  the specified atoms will not be included in the
91  fingerprint
92  \param atomInvariants: a list of invariants to use for the atom hashes
93  note: only the first \c codeSize bits of each
94  invariant are used.
95  \param includeChirality: if set, chirality will be used in the atom invariants
96  (note: this is ignored if atomInvariants are provided)
97  \param use2D: if set, the 2D (topological) distance matrix is used.
98  \param confId: the conformation to use if 3D distances are being used
99 
100 
101  \return a pointer to the fingerprint. The client is
102  responsible for calling delete on this.
103 
104  */
105  SparseIntVect<boost::int32_t> * // heap-allocated result; the caller must delete it
106  getAtomPairFingerprint(const ROMol &mol,
107  unsigned int minLength,unsigned int maxLength,
108  const std::vector<boost::uint32_t> *fromAtoms=0,
109  const std::vector<boost::uint32_t> *ignoreAtoms=0,
110  const std::vector<boost::uint32_t> *atomInvariants=0,
111  bool includeChirality=false,
112  bool use2D=true,
113  int confId=-1);
114  //! \overload
115  SparseIntVect<boost::int32_t> * // NOTE(review): return type assumed identical to the primary overload - confirm
116  getAtomPairFingerprint(const ROMol &mol,
117  const std::vector<boost::uint32_t> *fromAtoms=0,
118  const std::vector<boost::uint32_t> *ignoreAtoms=0,
119  const std::vector<boost::uint32_t> *atomInvariants=0,
120  bool includeChirality=false,
121  bool use2D=true,
122  int confId=-1);
123 
124 
125  //! returns the hashed atom-pair fingerprint for a molecule
126  /*!
127  \param mol: the molecule to be fingerprinted
128  \param nBits: the length of the fingerprint to generate
129  \param minLength: minimum distance between atoms to be
130  considered in a pair. Default is 1 bond.
131  \param maxLength: maximum distance between atoms to be
132  considered in a pair.
133  Default is maxPathLen-1 bonds.
134  \param fromAtoms: if provided, only atom pairs that involve
135  the specified atoms will be included in the
136  fingerprint
137  \param ignoreAtoms: if provided, any atom pairs that include
138  the specified atoms will not be included in the
139  fingerprint
140  \param atomInvariants: a list of invariants to use for the atom hashes
141  note: only the first \c codeSize bits of each
142  invariant are used.
143  \param includeChirality: if set, chirality will be used in the atom invariants
144  (note: this is ignored if atomInvariants are provided)
145  \param use2D: if set, the 2D (topological) distance matrix is used.
146  \param confId: the conformation to use if 3D distances are being used
147  \return a pointer to the fingerprint. The client is
148  responsible for calling delete on this.
149 
150  */
151  SparseIntVect<boost::int32_t> * // heap-allocated result; the caller must delete it
152  getHashedAtomPairFingerprint(const ROMol &mol,
153  unsigned int nBits=2048,
154  unsigned int minLength=1,
155  unsigned int maxLength=maxPathLen-1,
156  const std::vector<boost::uint32_t> *fromAtoms=0,
157  const std::vector<boost::uint32_t> *ignoreAtoms=0,
158  const std::vector<boost::uint32_t> *atomInvariants=0,
159  bool includeChirality=false,
160  bool use2D=true,
161  int confId=-1);
162  //! returns the hashed atom-pair fingerprint for a molecule as a bit vector
163  /*!
164  \param mol: the molecule to be fingerprinted
165  \param nBits: the length of the fingerprint to generate
166  \param minLength: minimum distance between atoms to be
167  considered in a pair. Default is 1 bond.
168  \param maxLength: maximum distance between atoms to be
169  considered in a pair.
170  Default is maxPathLen-1 bonds.
171  \param fromAtoms: if provided, only atom pairs that involve
172  the specified atoms will be included in the
173  fingerprint
174  \param ignoreAtoms: if provided, any atom pairs that include
175  the specified atoms will not be included in the
176  fingerprint
177  \param atomInvariants: a list of invariants to use for the atom hashes
178  note: only the first \c codeSize bits of each
179  invariant are used.
180  \param nBitsPerEntry: number of bits to use in simulating counts
181  \param includeChirality: if set, chirality will be used in the atom invariants
182  (note: this is ignored if atomInvariants are provided)
183  \param use2D: if set, the 2D (topological) distance matrix is used.
184  \param confId: the conformation to use if 3D distances are being used
185 
186  \return a pointer to the fingerprint. The client is
187  responsible for calling delete on this.
188 
189  */
190  ExplicitBitVect * // heap-allocated result; the caller must delete it
191  getHashedAtomPairFingerprintAsBitVect(const ROMol &mol,
192  unsigned int nBits=2048,
193  unsigned int minLength=1,
194  unsigned int maxLength=maxPathLen-1,
195  const std::vector<boost::uint32_t> *fromAtoms=0,
196  const std::vector<boost::uint32_t> *ignoreAtoms=0,
197  const std::vector<boost::uint32_t> *atomInvariants=0,
198  unsigned int nBitsPerEntry=4,
199  bool includeChirality=false,
200  bool use2D=true,
201  int confId=-1);
202 
203 
204 
205  //! returns a topological torsion hash based on the atom hashes
206  //! passed in
207  /*!
208  \param atomCodes the vector of atom hashes
209  */
210  boost::uint64_t getTopologicalTorsionCode(const std::vector<boost::uint32_t> &atomCodes,bool includeChirality=false); // NOTE(review): includeChirality is not documented above; presumably it signals that atomCodes carry the extra chirality bits - confirm
211 
212  //! returns the topological-torsion fingerprint for a molecule
213  /*!
214  The algorithm used is described here:
215  R. Nilakantan, N. Bauman, J. S. Dixon, R. Venkataraghavan;
216  "Topological Torsion: A New Molecular Descriptor for SAR Applications.
217  Comparison with Other Descriptors" JCICS 27, 82-85 (1987).
218 
219  \param mol: the molecule to be fingerprinted
220  \param targetSize: the number of atoms to include in the "torsions"
221  \param fromAtoms: if provided, only torsions that start or end at
222  the specified atoms will be included in the
223  fingerprint
224  \param ignoreAtoms: if provided, any torsions that include
225  the specified atoms will not be included in the
226  fingerprint
227  \param atomInvariants: a list of invariants to use for the atom hashes
228  note: only the first \c codeSize bits of each
229  invariant are used.
230  \param includeChirality: if set, chirality will be used in the atom invariants
231  (note: this is ignored if atomInvariants are provided)
232 
233  \return a pointer to the fingerprint. The client is
234  responsible for calling delete on this.
235 
236  */
237  SparseIntVect<boost::int64_t> * // heap-allocated result; the caller must delete it
238  getTopologicalTorsionFingerprint(const ROMol &mol,
239  unsigned int targetSize=4,
240  const std::vector<boost::uint32_t> *fromAtoms=0,
241  const std::vector<boost::uint32_t> *ignoreAtoms=0,
242  const std::vector<boost::uint32_t> *atomInvariants=0,
243  bool includeChirality=false
244  );
245  //! returns a hashed topological-torsion fingerprint for a molecule
246  /*!
247  The algorithm used is described here:
248  R. Nilakantan, N. Bauman, J. S. Dixon, R. Venkataraghavan;
249  "Topological Torsion: A New Molecular Descriptor for SAR Applications.
250  Comparison with Other Descriptors" JCICS 27, 82-85 (1987).
251 
252  \param mol: the molecule to be fingerprinted
253  \param nBits: number of bits to include in the fingerprint
254  \param targetSize: the number of atoms to include in the "torsions"
255  \param fromAtoms: if provided, only torsions that start or end at
256  the specified atoms will be included in the
257  fingerprint
258  \param ignoreAtoms: if provided, any torsions that include
259  the specified atoms will not be included in the
260  fingerprint
261  \param atomInvariants: a list of invariants to use for the atom hashes
262  note: only the first \c codeSize bits of each
263  invariant are used.
264  \param includeChirality: if set, chirality will be used in the atom invariants
265  (note: this is ignored if atomInvariants are provided)
266 
267  \return a pointer to the fingerprint. The client is
268  responsible for calling delete on this.
269 
270  */
271  SparseIntVect<boost::int64_t> * // heap-allocated result; the caller must delete it
272  getHashedTopologicalTorsionFingerprint(const ROMol &mol,
273  unsigned int nBits=2048,
274  unsigned int targetSize=4,
275  const std::vector<boost::uint32_t> *fromAtoms=0,
276  const std::vector<boost::uint32_t> *ignoreAtoms=0,
277  const std::vector<boost::uint32_t> *atomInvariants=0,
278  bool includeChirality=false);
279  //! returns a hashed topological-torsion fingerprint for a molecule as a bit vector
280  /*!
281  \param mol: the molecule to be fingerprinted
282  \param nBits: number of bits to include in the fingerprint
283  \param targetSize: the number of atoms to include in the "torsions"
284  \param fromAtoms: if provided, only torsions that start or end at
285  the specified atoms will be included in the
286  fingerprint
287  \param ignoreAtoms: if provided, any torsions that include
288  the specified atoms will not be included in the
289  fingerprint
290  \param atomInvariants: a list of invariants to use for the atom hashes
291  note: only the first \c codeSize bits of each
292  invariant are used.
293  \param nBitsPerEntry: number of bits to use in simulating counts
294  \param includeChirality: if set, chirality will be used in the atom invariants
295  (note: this is ignored if atomInvariants are provided)
296 
297  \return a pointer to the fingerprint. The client is
298  responsible for calling delete on this.
299 
300  */
301  ExplicitBitVect * // heap-allocated result; the caller must delete it
302  getHashedTopologicalTorsionFingerprintAsBitVect(const ROMol &mol,
303  unsigned int nBits=2048,
304  unsigned int targetSize=4,
305  const std::vector<boost::uint32_t> *fromAtoms=0,
306  const std::vector<boost::uint32_t> *ignoreAtoms=0,
307  const std::vector<boost::uint32_t> *atomInvariants=0,
308  unsigned int nBitsPerEntry=4,
309  bool includeChirality=false);
310  }
311 }
312 
313 #endif
const std::string atomPairsVersion
Definition: AtomPairs.h:35
Pulls in all the BitVect classes.
SparseIntVect< boost::int32_t > * getHashedAtomPairFingerprint(const ROMol &mol, unsigned int nBits=2048, unsigned int minLength=1, unsigned int maxLength=maxPathLen-1, const std::vector< boost::uint32_t > *fromAtoms=0, const std::vector< boost::uint32_t > *ignoreAtoms=0, const std::vector< boost::uint32_t > *atomInvariants=0, bool includeChirality=false, bool use2D=true, int confId=-1)
returns the hashed atom-pair fingerprint for a molecule
SparseIntVect< boost::int64_t > * getTopologicalTorsionFingerprint(const ROMol &mol, unsigned int targetSize=4, const std::vector< boost::uint32_t > *fromAtoms=0, const std::vector< boost::uint32_t > *ignoreAtoms=0, const std::vector< boost::uint32_t > *atomInvariants=0, bool includeChirality=false)
returns the topological-torsion fingerprint for a molecule
const unsigned int maxPathLen
Definition: AtomPairs.h:45
const unsigned int numPathBits
Definition: AtomPairs.h:44
const unsigned int maxNumPi
Definition: AtomPairs.h:39
SparseIntVect< boost::int64_t > * getHashedTopologicalTorsionFingerprint(const ROMol &mol, unsigned int nBits=2048, unsigned int targetSize=4, const std::vector< boost::uint32_t > *fromAtoms=0, const std::vector< boost::uint32_t > *ignoreAtoms=0, const std::vector< boost::uint32_t > *atomInvariants=0, bool includeChirality=false)
returns a hashed topological-torsion fingerprint for a molecule
ROMol is a molecule class that is intended to have a fixed topology.
Definition: ROMol.h:105
boost::uint32_t getAtomCode(const Atom *atom, unsigned int branchSubtract=0, bool includeChirality=false)
const unsigned int numPiBits
Definition: AtomPairs.h:38
Includes a bunch of functionality for handling Atom and Bond queries.
Definition: Atom.h:28
const unsigned int maxNumBranches
Definition: AtomPairs.h:41
const unsigned int numBranchBits
Definition: AtomPairs.h:40
boost::uint64_t getTopologicalTorsionCode(const std::vector< boost::uint32_t > &atomCodes, bool includeChirality=false)
ExplicitBitVect * getHashedTopologicalTorsionFingerprintAsBitVect(const ROMol &mol, unsigned int nBits=2048, unsigned int targetSize=4, const std::vector< boost::uint32_t > *fromAtoms=0, const std::vector< boost::uint32_t > *ignoreAtoms=0, const std::vector< boost::uint32_t > *atomInvariants=0, unsigned int nBitsPerEntry=4, bool includeChirality=false)
returns a hashed topological-torsion fingerprint for a molecule as a bit vector
const unsigned int numAtomPairFingerprintBits
Definition: AtomPairs.h:46
a class for efficiently storing sparse vectors of ints
Definition: SparseIntVect.h:27
a class for bit vectors that are densely occupied
boost::uint32_t getAtomPairCode(boost::uint32_t codeI, boost::uint32_t codeJ, unsigned int dist, bool includeChirality=false)
const unsigned int codeSize
Definition: AtomPairs.h:43
const unsigned int atomNumberTypes[1<< numTypeBits]
Definition: AtomPairs.h:37
The class for representing atoms.
Definition: Atom.h:67
ExplicitBitVect * getHashedAtomPairFingerprintAsBitVect(const ROMol &mol, unsigned int nBits=2048, unsigned int minLength=1, unsigned int maxLength=maxPathLen-1, const std::vector< boost::uint32_t > *fromAtoms=0, const std::vector< boost::uint32_t > *ignoreAtoms=0, const std::vector< boost::uint32_t > *atomInvariants=0, unsigned int nBitsPerEntry=4, bool includeChirality=false, bool use2D=true, int confId=-1)
returns the hashed atom-pair fingerprint for a molecule as a bit vector
SparseIntVect< boost::int32_t > * getAtomPairFingerprint(const ROMol &mol, unsigned int minLength, unsigned int maxLength, const std::vector< boost::uint32_t > *fromAtoms=0, const std::vector< boost::uint32_t > *ignoreAtoms=0, const std::vector< boost::uint32_t > *atomInvariants=0, bool includeChirality=false, bool use2D=true, int confId=-1)
returns the atom-pair fingerprint for a molecule
const unsigned int numChiralBits
Definition: AtomPairs.h:42
const unsigned int numTypeBits
Definition: AtomPairs.h:36