RDKit
Open-source cheminformatics and machine learning.
BitOps.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2003-2012 greg Landrum and Rational Discovery LLC
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #ifndef __RD_BITOPS_H__
11 #define __RD_BITOPS_H__
12 /*! \file BitOps.h
13 
14  \brief Contains general bit-comparison and similarity operations.
15 
16  The notation used to document the similarity metrics is:
17  - \c V1_n: number of bits in vector 1
18  - \c V1_o: number of on bits in vector 1
19  - <tt>(V1&V2)_o</tt>: number of on bits in the intersection of vectors 1 and 2
20 
21  */
22 
23 #include "BitVects.h"
24 #include <string>
25 
26 
//! general purpose wrapper for calculating the similarity between two bvs
//! that may be of unequal size (will automatically fold as appropriate)
/*!
  \param bv1            the first bit vector
  \param bv2            the second bit vector
  \param metric         the similarity function applied to the two
                        (size-matched) vectors
  \param returnDistance if true, <tt>1.0 - metric(...)</tt> is returned
                        instead of the similarity itself

  \return the value produced by \c metric, or one minus that value when
          \c returnDistance is set

  <b>Notes:</b>
    - when the sizes differ, the longer vector is folded by
      <tt>longer_n / shorter_n</tt> before \c metric is called, so the
      shorter vector must have a non-zero number of bits.
    - the temporary folded vector is released even if \c metric throws.
*/
template <typename T>
double SimilarityWrapper(const T &bv1,const T &bv2,
                         double (*metric)(const T &,const T &),
                         bool returnDistance=false){
  // Owns the vector returned by FoldFingerprint() (which the caller must
  // delete) so that it is freed on every exit path, including exceptions
  // escaping from metric().
  struct FoldedHolder {
    T *ptr;
    explicit FoldedHolder(T *p) : ptr(p) {}
    ~FoldedHolder() { delete ptr; }
  private:
    // non-copyable: there must be exactly one owner of ptr
    FoldedHolder(const FoldedHolder &);
    FoldedHolder &operator=(const FoldedHolder &);
  };

  double res=0.0;
  if(bv1.getNumBits()>bv2.getNumBits()){
    // bv1 is longer: fold it down before comparing
    FoldedHolder folded(FoldFingerprint(bv1,bv1.getNumBits()/bv2.getNumBits()));
    res = metric(*folded.ptr,bv2);
  } else if(bv2.getNumBits()>bv1.getNumBits()){
    // bv2 is longer: fold it down before comparing
    FoldedHolder folded(FoldFingerprint(bv2,bv2.getNumBits()/bv1.getNumBits()));
    res = metric(bv1,*folded.ptr);
  } else {
    res = metric(bv1,bv2);
  }
  if(returnDistance) res = 1.0-res;
  return res;
}
//! \overload
/*!
  Variant for metrics that take two additional weighting parameters.

  \param bv1            the first bit vector
  \param bv2            the second bit vector
  \param a              first extra argument, passed through to \c metric
  \param b              second extra argument, passed through to \c metric
  \param metric         the similarity function applied to the two
                        (size-matched) vectors together with \c a and \c b
  \param returnDistance if true, <tt>1.0 - metric(...)</tt> is returned
                        instead of the similarity itself

  \return the value produced by \c metric, or one minus that value when
          \c returnDistance is set

  <b>Notes:</b>
    - when the sizes differ, the longer vector is folded by
      <tt>longer_n / shorter_n</tt> before \c metric is called, so the
      shorter vector must have a non-zero number of bits.
    - the temporary folded vector is released even if \c metric throws.
*/
template <typename T>
double SimilarityWrapper(const T &bv1,const T &bv2,double a,double b,
                         double (*metric)(const T &,const T &,double,double),
                         bool returnDistance=false){
  // Owns the vector returned by FoldFingerprint() (which the caller must
  // delete) so that it is freed on every exit path, including exceptions
  // escaping from metric().
  struct FoldedHolder {
    T *ptr;
    explicit FoldedHolder(T *p) : ptr(p) {}
    ~FoldedHolder() { delete ptr; }
  private:
    // non-copyable: there must be exactly one owner of ptr
    FoldedHolder(const FoldedHolder &);
    FoldedHolder &operator=(const FoldedHolder &);
  };

  double res=0.0;
  if(bv1.getNumBits()>bv2.getNumBits()){
    // bv1 is longer: fold it down before comparing
    FoldedHolder folded(FoldFingerprint(bv1,bv1.getNumBits()/bv2.getNumBits()));
    res = metric(*folded.ptr,bv2,a,b);
  } else if(bv2.getNumBits()>bv1.getNumBits()){
    // bv2 is longer: fold it down before comparing
    FoldedHolder folded(FoldFingerprint(bv2,bv2.getNumBits()/bv1.getNumBits()));
    res = metric(bv1,*folded.ptr,a,b);
  } else {
    res = metric(bv1,bv2,a,b);
  }
  if(returnDistance) res = 1.0-res;
  return res;
}
68 
69 
70 bool AllProbeBitsMatch(const char *probe,const char *ref);
71 bool AllProbeBitsMatch(const std::string &probe,const std::string &ref);
72 bool AllProbeBitsMatch(const ExplicitBitVect& probe,const ExplicitBitVect &ref);
73 
74 
75 template <typename T1>
76 bool AllProbeBitsMatch(const T1 &probe,const std::string &pkl);
77 
78 template <typename T1>
79 bool AllProbeBitsMatch(const T1 &probe,const T1 &ref);
80 
81 
82 //! returns the number of on bits in common between two bit vectors
83 /*!
84  \return (bv1&bv2)_o
85 */
86 template <typename T1, typename T2>
87 int
88 NumOnBitsInCommon(const T1& bv1,const T2& bv2);
89 
90 int
91 NumOnBitsInCommon(const ExplicitBitVect & bv1,const ExplicitBitVect & bv2);
92 
93 //! returns the Tanimoto similarity between two bit vects
94 /*!
95  \return <tt>(bv1&bv2)_o / [bv1_o + bv2_o - (bv1&bv2)_o]</tt>
96 */
97 template <typename T1, typename T2>
98 double
99 TanimotoSimilarity(const T1& bv1,const T2& bv2);
100 
101 //! returns the Cosine similarity between two bit vects
102 /*!
103  \return <tt>(bv1&bv2)_o / sqrt(bv1_o * bv2_o)</tt>
104 */
105 template <typename T1, typename T2>
106 double
107 CosineSimilarity(const T1& bv1,
108  const T2& bv2);
109 
110 //! returns the Kulczynski similarity between two bit vects
111 /*!
112  \return <tt>(bv1&bv2)_o * [bv1_o + bv2_o] / [2 * bv1_o * bv2_o]</tt>
113 */
114 template <typename T1, typename T2>
115 double
116 KulczynskiSimilarity(const T1& bv1,
117  const T2& bv2);
118 
119 //! returns the Dice similarity between two bit vects
120 /*!
121  \return <tt>2*(bv1&bv2)_o / [bv1_o + bv2_o]</tt>
122 */
123 template <typename T1, typename T2>
124 double
125 DiceSimilarity(const T1& bv1,
126  const T2& bv2);
127 
128 //! returns the Tversky similarity between two bit vects
129 /*!
130  \return <tt>(bv1&bv2)_o / [a*bv1_o + b*bv2_o + (1 - a - b)*(bv1&bv2)_o]</tt>
131 
132  Notes:
133  # 0 <= a,b <= 1
134  # Tversky(a=1,b=1) = Tanimoto
135  # Tversky(a=1/2,b=1/2) = Dice
136 
137 */
138 template <typename T1, typename T2>
139 double
140 TverskySimilarity(const T1& bv1,
141  const T2& bv2,double a,double b);
142 
143 //! returns the Sokal similarity between two bit vects
144 /*!
145  \return <tt>(bv1&bv2)_o / [2*bv1_o + 2*bv2_o - 3*(bv1&bv2)_o]</tt>
146 */
147 template <typename T1, typename T2>
148 double
149 SokalSimilarity(const T1& bv1,
150  const T2& bv2);
151 
152 //! returns the McConnaughey similarity between two bit vects
153 /*!
154  \return <tt>[(bv1&bv2)_o * (bv1_o + bv2_o) - (bv1_o * bv2_o)] / (bv1_o * bv2_o)</tt>
155 */
156 template <typename T1, typename T2>
157 double
158 McConnaugheySimilarity(const T1& bv1,
159  const T2& bv2);
160 
161 //! returns the Asymmetric similarity between two bit vects
162 /*!
163  \return <tt>(bv1&bv2)_o / min(bv1_o,bv2_o)</tt>
164 */
165 template <typename T1, typename T2>
166 double
167 AsymmetricSimilarity(const T1& bv1,
168  const T2& bv2);
169 
170 //! returns the Braun-Blanquet similarity between two bit vects
171 /*!
172  \return <tt>(bv1&bv2)_o / max(bv1_o,bv2_o)</tt>
173 */
174 template <typename T1, typename T2>
175 double
176 BraunBlanquetSimilarity(const T1& bv1,
177  const T2& bv2);
178 
179 //! returns the Russel similarity between two bit vects
180 /*!
181  \return <tt>(bv1&bv2)_o / bv1_o</tt>
182 
183  <b>Note:</b> this operation is non-commutative:
184  RusselSimilarity(bv1,bv2) != RusselSimilarity(bv2,bv1)
185 
186 */
187 template <typename T1, typename T2>
188 double
189 RusselSimilarity(const T1& bv1,
190  const T2& bv2);
191 
192 //! returns the Rogot-Goldberg similarity between two bit vects
193 /*!
194  \return <tt>(bv1&bv2)_o / (bv1_o + bv2_o)
195  + (bv1_n - bv1_o - bv2_o + (bv1&bv2)_o) / (2*bv1_n - bv1_o - bv2_o) </tt>
196 */
197 template <typename T1, typename T2>
198 double
199 RogotGoldbergSimilarity(const T1& bv1,const T2& bv2);
200 
201 
202 //! returns the on bit similarity between two bit vects
203 /*!
204  \return <tt>(bv1&bv2)_o / (bv1|bv2)_o </tt>
205 */
206 template <typename T1, typename T2>
207 double
208 OnBitSimilarity(const T1& bv1,const T2& bv2);
209 
210 //! returns the number of common bits (on and off) between two bit vects
211 /*!
212  \return <tt>bv1_n - (bv1^bv2)_o</tt>
213 */
214 template <typename T1, typename T2>
215 int
216 NumBitsInCommon(const T1& bv1,const T2& bv2);
217 
218 int
219 NumBitsInCommon(const ExplicitBitVect & bv1,const ExplicitBitVect & bv2);
220 
221 //! returns the common-bit similarity (on and off) between two bit vects
222 //! This is also called Manhattan similarity.
223 /*!
224  \return <tt>[bv1_n - (bv1^bv2)_o] / bv1_n</tt>
225 */
226 template <typename T1, typename T2>
227 double
228 AllBitSimilarity(const T1& bv1,const T2& bv2);
229 
230 //! returns an IntVect with indices of all on bits in common between two bit vects
231 template <typename T1, typename T2>
232 IntVect
233 OnBitsInCommon(const T1& bv1,const T2& bv2);
234 
235 //! returns an IntVect with indices of all off bits in common between two bit vects
236 template <typename T1, typename T2>
237 IntVect
238 OffBitsInCommon(const T1& bv1,const T2& bv2);
239 
240 //! returns the on-bit projected similarities between two bit vects
241 /*!
242  \return two values, as a DoubleVect:
243  - <tt>(bv1&bv2)_o / bv1_o</tt>
244  - <tt>(bv1&bv2)_o / bv2_o</tt>
245 */
246 template <typename T1, typename T2>
248 OnBitProjSimilarity(const T1& bv1,const T2& bv2);
249 
250 //! returns the on-bit projected similarities between two bit vects
251 /*!
252  \return two values, as a DoubleVect:
253  - <tt>[bv1_n - (bv1|bv2)_o] / [bv1_n - bv1_o]</tt>
254  - <tt>[bv2_n - (bv1|bv2)_o] / [bv2_n - bv2_o]</tt>
255 
256  <b>Note:</b> <tt>bv1_n = bv2_n</tt>
257 
258 */
259 template <typename T1, typename T2>
261 OffBitProjSimilarity(const T1& bv1,const T2& bv2);
262 
263 
264 //! folds a bit vector \c factor times and returns the result
265 /*!
266  \param bv1 the vector to be folded
267  \param factor (optional) the number of times to fold it
268 
269  \return a pointer to the folded fingerprint, which is
270  <tt>bv1_n/factor</tt> long.
271 
272  <b>Note:</b> The caller is responsible for <tt>delete</tt>ing the result.
273  */
274 template <typename T1>
275 T1 *
276 FoldFingerprint(const T1& bv1,unsigned int factor=2);
277 
278 //! returns a text representation of a bit vector (a string of 0s and 1s)
279 /*!
280  \param bv1 the vector to use
281 
282  \return an std::string
283 
284  */
285 template <typename T1>
286 std::string
287 BitVectToText(const T1& bv1);
288 
289 //! returns a hex representation of a bit vector compatible with Andrew Dalke's FPS format
290 /*!
291  \param bv1 the vector to use
292 
293  \return an std::string
294 
295  */
296 template <typename T1>
297 std::string
298 BitVectToFPSText(const T1& bv1);
299 
300 //! returns a binary string representation of a bit vector (an array of bytes)
301 /*!
302  \param bv1 the vector to use
303 
304  \return an std::string
305 
306  */
307 template <typename T1>
308 std::string
309 BitVectToBinaryText(const T1& bv1);
310 
311 //! updates a bit vector from Andrew Dalke's FPS format
312 /*!
313  \param bv1 the vector to use
314  \param fps the FPS hex string
315 
316 
317  */
318 template <typename T1>
319 void
320 UpdateBitVectFromFPSText(T1& bv1,const std::string &fps);
321 
322 //! updates a bit vector from a binary string representation of a bit vector (an array of bytes)
323 /*!
324  \param bv1 the vector to use
325  \param fps the binary string
326 
327 
328  */
329 template <typename T1>
330 void
331 UpdateBitVectFromBinaryText(T1& bv1,const std::string &fps);
332 
333 
334 
335 #endif
Pulls in all the BitVect classes.
double DiceSimilarity(const T1 &bv1, const T2 &bv2)
returns the Dice similarity between two bit vects
T1 * FoldFingerprint(const T1 &bv1, unsigned int factor=2)
folds a bit vector factor times and returns the result
std::string BitVectToFPSText(const T1 &bv1)
returns a hex representation of a bit vector compatible with Andrew Dalke's FPS format ...
double SokalSimilarity(const T1 &bv1, const T2 &bv2)
returns the Sokal similarity between two bit vects
DoubleVect OffBitProjSimilarity(const T1 &bv1, const T2 &bv2)
returns the off-bit projected similarities between two bit vects
double TanimotoSimilarity(const T1 &bv1, const T2 &bv2)
returns the Tanimoto similarity between two bit vects
double OnBitSimilarity(const T1 &bv1, const T2 &bv2)
returns the on bit similarity between two bit vects
DoubleVect OnBitProjSimilarity(const T1 &bv1, const T2 &bv2)
returns the on-bit projected similarities between two bit vects
int NumOnBitsInCommon(const T1 &bv1, const T2 &bv2)
returns the number of on bits in common between two bit vectors
int NumBitsInCommon(const T1 &bv1, const T2 &bv2)
returns the number of common bits (on and off) between two bit vects
double RogotGoldbergSimilarity(const T1 &bv1, const T2 &bv2)
returns the Rogot-Goldberg similarity between two bit vects
void UpdateBitVectFromBinaryText(T1 &bv1, const std::string &fps)
updates a bit vector from a binary string representation of a bit vector (an array of bytes) ...
double AsymmetricSimilarity(const T1 &bv1, const T2 &bv2)
returns the Asymmetric similarity between two bit vects
std::string BitVectToBinaryText(const T1 &bv1)
returns a binary string representation of a bit vector (an array of bytes)
double CosineSimilarity(const T1 &bv1, const T2 &bv2)
returns the Cosine similarity between two bit vects
std::vector< double > DoubleVect
Definition: BitVect.h:18
double SimilarityWrapper(const T &bv1, const T &bv2, double(*metric)(const T &, const T &), bool returnDistance=false)
Definition: BitOps.h:30
IntVect OnBitsInCommon(const T1 &bv1, const T2 &bv2)
returns an IntVect with indices of all on bits in common between two bit vects
std::string BitVectToText(const T1 &bv1)
returns a text representation of a bit vector (a string of 0s and 1s)
double TverskySimilarity(const T1 &bv1, const T2 &bv2, double a, double b)
returns the Tversky similarity between two bit vects
double McConnaugheySimilarity(const T1 &bv1, const T2 &bv2)
returns the McConnaughey similarity between two bit vects
double AllBitSimilarity(const T1 &bv1, const T2 &bv2)
bool AllProbeBitsMatch(const char *probe, const char *ref)
std::vector< int > IntVect
Definition: BitVect.h:16
a class for bit vectors that are densely occupied
IntVect OffBitsInCommon(const T1 &bv1, const T2 &bv2)
returns an IntVect with indices of all off bits in common between two bit vects
double RusselSimilarity(const T1 &bv1, const T2 &bv2)
returns the Russel similarity between two bit vects
double KulczynskiSimilarity(const T1 &bv1, const T2 &bv2)
returns the Kulczynski similarity between two bit vects
void UpdateBitVectFromFPSText(T1 &bv1, const std::string &fps)
updates a bit vector from Andrew Dalke's FPS format
double BraunBlanquetSimilarity(const T1 &bv1, const T2 &bv2)
returns the Braun-Blanquet similarity between two bit vects