RDKit
Open-source cheminformatics and machine learning.
SparseIntVect.h
Go to the documentation of this file.
1 // $Id$
2 //
3 // Copyright (C) 2007-2008 Greg Landrum
4 //
5 // @@ All Rights Reserved @@
6 // This file is part of the RDKit.
7 // The contents are covered by the terms of the BSD license
8 // which is included in the file license.txt, found at the root
9 // of the RDKit source tree.
10 //
11 #include <RDGeneral/export.h>
12 #ifndef __RD_SPARSE_INT_VECT_20070921__
13 #define __RD_SPARSE_INT_VECT_20070921__
14 
15 #include <map>
16 #include <string>
17 #include <RDGeneral/Invariant.h>
18 #include <sstream>
19 #include <RDGeneral/Exceptions.h>
20 #include <RDGeneral/StreamOps.h>
21 #include <boost/cstdint.hpp>
22 
24  0x0001; //!< version number to use in pickles
25 namespace RDKit {
26 //! a class for efficiently storing sparse vectors of ints
27 template <typename IndexType>
29  public:
30  typedef std::map<IndexType, int> StorageType;
31 
32  SparseIntVect() : d_length(0){};
33 
34  //! initialize with a particular length
35  SparseIntVect(IndexType length) : d_length(length){};
36 
37  //! Copy constructor
39  d_length = other.d_length;
40  d_data.insert(other.d_data.begin(), other.d_data.end());
41  }
42 
43  //! constructor from a pickle
44  SparseIntVect(const std::string &pkl) {
45  initFromText(pkl.c_str(), pkl.size());
46  };
47  //! constructor from a pickle
48  SparseIntVect(const char *pkl, const unsigned int len) {
49  initFromText(pkl, len);
50  };
51 
52  //! destructor (doesn't need to do anything)
54 
55 #ifdef __clang__
56 #pragma clang diagnostic push
57 #pragma clang diagnostic ignored "-Wtautological-compare"
58 #elif (defined(__GNUC__) || defined(__GNUG__)) && \
59  (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 1))
60 #if (__GNUC__ > 4 || __GNUC_MINOR__ > 5)
61 #pragma GCC diagnostic push
62 #endif
63 #pragma GCC diagnostic ignored "-Wtype-limits"
64 #endif
65  //! return the value at an index
66  int getVal(IndexType idx) const {
67  if (idx < 0 || idx >= d_length) {
68  throw IndexErrorException(static_cast<int>(idx));
69  }
70  int res = 0;
71  typename StorageType::const_iterator iter = d_data.find(idx);
72  if (iter != d_data.end()) {
73  res = iter->second;
74  }
75  return res;
76  };
77 
78  //! set the value at an index
79  void setVal(IndexType idx, int val) {
80  if (idx < 0 || idx >= d_length) {
81  throw IndexErrorException(static_cast<int>(idx));
82  }
83  if (val != 0) {
84  d_data[idx] = val;
85  } else {
86  d_data.erase(idx);
87  }
88  };
89 #ifdef __clang__
90 #pragma clang diagnostic pop
91 #elif (defined(__GNUC__) || defined(__GNUG__)) && \
92  (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 5))
93 #pragma GCC diagnostic pop
94 #endif
95  //! support indexing using []
96  int operator[](IndexType idx) const { return getVal(idx); };
97 
98  //! returns the length
99  IndexType getLength() const { return d_length; };
100 
101  //! returns the sum of all the elements in the vect
102  //! the doAbs argument toggles summing the absolute values of the elements
103  int getTotalVal(bool doAbs = false) const {
104  int res = 0;
105  typename StorageType::const_iterator iter;
106  for (iter = d_data.begin(); iter != d_data.end(); ++iter) {
107  if (!doAbs)
108  res += iter->second;
109  else
110  res += abs(iter->second);
111  }
112  return res;
113  };
114  //! returns the length
115  unsigned int size() const { return getLength(); };
116 
117  //! returns our nonzero elements as a map(IndexType->int)
118  const StorageType &getNonzeroElements() const { return d_data; }
119 
120  //! this is a "fuzzy" intesection, the final value
121  //! of each element is equal to the minimum from
122  //! the two vects.
124  if (other.d_length != d_length) {
125  throw ValueErrorException("SparseIntVect size mismatch");
126  }
127 
128  typename StorageType::iterator iter = d_data.begin();
129  typename StorageType::const_iterator oIter = other.d_data.begin();
130  while (iter != d_data.end()) {
131  // we're relying on the fact that the maps are sorted:
132  while (oIter != other.d_data.end() && oIter->first < iter->first) {
133  ++oIter;
134  }
135  if (oIter != other.d_data.end() && oIter->first == iter->first) {
136  // found it:
137  if (oIter->second < iter->second) {
138  iter->second = oIter->second;
139  }
140  ++oIter;
141  ++iter;
142  } else {
143  // not there; our value is zero, which means
144  // we should remove this value:
145  typename StorageType::iterator tmpIter = iter;
146  ++tmpIter;
147  d_data.erase(iter);
148  iter = tmpIter;
149  }
150  }
151  return *this;
152  };
154  const SparseIntVect<IndexType> &other) const {
155  SparseIntVect<IndexType> res(*this);
156  return res &= other;
157  }
158 
159  //! this is a "fuzzy" union, the final value
160  //! of each element is equal to the maximum from
161  //! the two vects.
163  if (other.d_length != d_length) {
164  throw ValueErrorException("SparseIntVect size mismatch");
165  }
166 
167  typename StorageType::iterator iter = d_data.begin();
168  typename StorageType::const_iterator oIter = other.d_data.begin();
169  while (iter != d_data.end()) {
170  // we're relying on the fact that the maps are sorted:
171  while (oIter != other.d_data.end() && oIter->first < iter->first) {
172  d_data[oIter->first] = oIter->second;
173  ++oIter;
174  }
175  if (oIter != other.d_data.end() && oIter->first == iter->first) {
176  // found it:
177  if (oIter->second > iter->second) {
178  iter->second = oIter->second;
179  }
180  ++oIter;
181  }
182  ++iter;
183  }
184  // finish up the other vect:
185  while (oIter != other.d_data.end()) {
186  d_data[oIter->first] = oIter->second;
187  ++oIter;
188  }
189  return *this;
190  };
192  const SparseIntVect<IndexType> &other) const {
193  SparseIntVect<IndexType> res(*this);
194  return res |= other;
195  }
196 
198  if (other.d_length != d_length) {
199  throw ValueErrorException("SparseIntVect size mismatch");
200  }
201  typename StorageType::iterator iter = d_data.begin();
202  typename StorageType::const_iterator oIter = other.d_data.begin();
203  while (oIter != other.d_data.end()) {
204  while (iter != d_data.end() && iter->first < oIter->first) {
205  ++iter;
206  }
207  if (iter != d_data.end() && oIter->first == iter->first) {
208  // found it:
209  iter->second += oIter->second;
210  if (!iter->second) {
211  typename StorageType::iterator tIter = iter;
212  ++tIter;
213  d_data.erase(iter);
214  iter = tIter;
215  } else {
216  ++iter;
217  }
218  } else {
219  d_data[oIter->first] = oIter->second;
220  }
221  ++oIter;
222  }
223  return *this;
224  };
226  const SparseIntVect<IndexType> &other) const {
227  SparseIntVect<IndexType> res(*this);
228  return res += other;
229  }
230 
232  if (other.d_length != d_length) {
233  throw ValueErrorException("SparseIntVect size mismatch");
234  }
235  typename StorageType::iterator iter = d_data.begin();
236  typename StorageType::const_iterator oIter = other.d_data.begin();
237  while (oIter != other.d_data.end()) {
238  while (iter != d_data.end() && iter->first < oIter->first) {
239  ++iter;
240  }
241  if (iter != d_data.end() && oIter->first == iter->first) {
242  // found it:
243  iter->second -= oIter->second;
244  if (!iter->second) {
245  typename StorageType::iterator tIter = iter;
246  ++tIter;
247  d_data.erase(iter);
248  iter = tIter;
249  } else {
250  ++iter;
251  }
252  } else {
253  d_data[oIter->first] = -oIter->second;
254  }
255  ++oIter;
256  }
257  return *this;
258  };
260  const SparseIntVect<IndexType> &other) const {
261  SparseIntVect<IndexType> res(*this);
262  return res -= other;
263  }
265  typename StorageType::iterator iter = d_data.begin();
266  while (iter != d_data.end()) {
267  iter->second *= v;
268  ++iter;
269  }
270  return *this;
271  };
273  SparseIntVect<IndexType> res(*this);
274  return res *= v;
275  };
277  typename StorageType::iterator iter = d_data.begin();
278  while (iter != d_data.end()) {
279  iter->second /= v;
280  ++iter;
281  }
282  return *this;
283  };
285  SparseIntVect<IndexType> res(*this);
286  return res /= v;
287  };
289  typename StorageType::iterator iter = d_data.begin();
290  while (iter != d_data.end()) {
291  iter->second += v;
292  ++iter;
293  }
294  return *this;
295  };
297  SparseIntVect<IndexType> res(*this);
298  return res += v;
299  };
301  typename StorageType::iterator iter = d_data.begin();
302  while (iter != d_data.end()) {
303  iter->second -= v;
304  ++iter;
305  }
306  return *this;
307  };
309  SparseIntVect<IndexType> res(*this);
310  return res -= v;
311  };
312 
313  bool operator==(const SparseIntVect<IndexType> &v2) const {
314  if (d_length != v2.d_length) {
315  return false;
316  }
317  return d_data == v2.d_data;
318  }
319  bool operator!=(const SparseIntVect<IndexType> &v2) const {
320  return !(*this == v2);
321  }
322 
323  //! returns a binary string representation (pickle)
324  std::string toString() const {
325  std::stringstream ss(std::ios_base::binary | std::ios_base::out |
326  std::ios_base::in);
327  boost::uint32_t tInt;
329  streamWrite(ss, tInt);
330  tInt = sizeof(IndexType);
331  streamWrite(ss, tInt);
332  streamWrite(ss, d_length);
333  IndexType nEntries = d_data.size();
334  streamWrite(ss, nEntries);
335 
336  typename StorageType::const_iterator iter = d_data.begin();
337  while (iter != d_data.end()) {
338  streamWrite(ss, iter->first);
339  boost::int32_t tInt = iter->second;
340  streamWrite(ss, tInt);
341  ++iter;
342  }
343  return ss.str();
344  };
345 
346  void fromString(const std::string &txt) {
347  initFromText(txt.c_str(), txt.length());
348  }
349 
350  private:
351  IndexType d_length;
352  StorageType d_data;
353 
354  void initFromText(const char *pkl, const unsigned int len) {
355  d_data.clear();
356  std::stringstream ss(std::ios_base::binary | std::ios_base::out |
357  std::ios_base::in);
358  ss.write(pkl, len);
359 
360  boost::uint32_t vers;
361  streamRead(ss, vers);
362  if (vers == 0x0001) {
363  boost::uint32_t tInt;
364  streamRead(ss, tInt);
365  if (tInt > sizeof(IndexType)) {
366  throw ValueErrorException(
367  "IndexType cannot accomodate index size in SparseIntVect pickle");
368  }
369  switch (tInt) {
370  case sizeof(char):
371  readVals<unsigned char>(ss);
372  break;
373  case sizeof(boost::int32_t):
374  readVals<boost::uint32_t>(ss);
375  break;
376  case sizeof(boost::int64_t):
377  readVals<boost::uint64_t>(ss);
378  break;
379  default:
380  throw ValueErrorException("unreadable format");
381  }
382  } else {
383  throw ValueErrorException("bad version in SparseIntVect pickle");
384  }
385  };
386  template <typename T>
387  void readVals(std::stringstream &ss) {
388  PRECONDITION(sizeof(T) <= sizeof(IndexType), "invalid size");
389  T tVal;
390  streamRead(ss, tVal);
391  d_length = tVal;
392  T nEntries;
393  streamRead(ss, nEntries);
394  for (T i = 0; i < nEntries; ++i) {
395  streamRead(ss, tVal);
396  boost::int32_t val;
397  streamRead(ss, val);
398  d_data[tVal] = val;
399  }
400  }
401 };
402 
403 template <typename IndexType, typename SequenceType>
405  const SequenceType &seq) {
406  typename SequenceType::const_iterator seqIt;
407  for (seqIt = seq.begin(); seqIt != seq.end(); ++seqIt) {
408  // EFF: probably not the most efficient approach
409  IndexType idx = *seqIt;
410  vect.setVal(idx, vect.getVal(idx) + 1);
411  }
412 }
413 
414 namespace {
415 template <typename IndexType>
416 void calcVectParams(const SparseIntVect<IndexType> &v1,
417  const SparseIntVect<IndexType> &v2, double &v1Sum,
418  double &v2Sum, double &andSum) {
419  if (v1.getLength() != v2.getLength()) {
420  throw ValueErrorException("SparseIntVect size mismatch");
421  }
422  v1Sum = v2Sum = andSum = 0.0;
423  // we're doing : (v1&v2).getTotalVal(), but w/o generating
424  // the other vector:
426  iter1 = v1.getNonzeroElements().begin();
427  if (iter1 != v1.getNonzeroElements().end()) v1Sum += abs(iter1->second);
428  iter2 = v2.getNonzeroElements().begin();
429  if (iter2 != v2.getNonzeroElements().end()) v2Sum += abs(iter2->second);
430  while (iter1 != v1.getNonzeroElements().end()) {
431  while (iter2 != v2.getNonzeroElements().end() &&
432  iter2->first < iter1->first) {
433  ++iter2;
434  if (iter2 != v2.getNonzeroElements().end()) v2Sum += abs(iter2->second);
435  }
436  if (iter2 != v2.getNonzeroElements().end()) {
437  if (iter2->first == iter1->first) {
438  if (abs(iter2->second) < abs(iter1->second)) {
439  andSum += abs(iter2->second);
440  } else {
441  andSum += abs(iter1->second);
442  }
443  ++iter2;
444  if (iter2 != v2.getNonzeroElements().end()) v2Sum += abs(iter2->second);
445  }
446  ++iter1;
447  if (iter1 != v1.getNonzeroElements().end()) v1Sum += abs(iter1->second);
448  } else {
449  break;
450  }
451  }
452  if (iter1 != v1.getNonzeroElements().end()) {
453  ++iter1;
454  while (iter1 != v1.getNonzeroElements().end()) {
455  v1Sum += abs(iter1->second);
456  ++iter1;
457  }
458  }
459  if (iter2 != v2.getNonzeroElements().end()) {
460  ++iter2;
461  while (iter2 != v2.getNonzeroElements().end()) {
462  v2Sum += abs(iter2->second);
463  ++iter2;
464  }
465  }
466 }
467 }
468 
469 template <typename IndexType>
471  const SparseIntVect<IndexType> &v2,
472  bool returnDistance = false, double bounds = 0.0) {
473  if (v1.getLength() != v2.getLength()) {
474  throw ValueErrorException("SparseIntVect size mismatch");
475  }
476  double v1Sum = 0.0;
477  double v2Sum = 0.0;
478  if (!returnDistance && bounds > 0.0) {
479  v1Sum = v1.getTotalVal(true);
480  v2Sum = v2.getTotalVal(true);
481  double denom = v1Sum + v2Sum;
482  if (fabs(denom) < 1e-6) {
483  if (returnDistance) {
484  return 1.0;
485  } else {
486  return 0.0;
487  }
488  }
489  double minV = v1Sum < v2Sum ? v1Sum : v2Sum;
490  if (2. * minV / denom < bounds) {
491  return 0.0;
492  }
493  v1Sum = 0.0;
494  v2Sum = 0.0;
495  }
496 
497  double numer = 0.0;
498 
499  calcVectParams(v1, v2, v1Sum, v2Sum, numer);
500 
501  double denom = v1Sum + v2Sum;
502  double sim;
503  if (fabs(denom) < 1e-6) {
504  sim = 0.0;
505  } else {
506  sim = 2. * numer / denom;
507  }
508  if (returnDistance) sim = 1. - sim;
509  // std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
510  return sim;
511 }
512 
513 template <typename IndexType>
515  const SparseIntVect<IndexType> &v2, double a, double b,
516  bool returnDistance = false, double bounds = 0.0) {
517  RDUNUSED_PARAM(bounds);
518  if (v1.getLength() != v2.getLength()) {
519  throw ValueErrorException("SparseIntVect size mismatch");
520  }
521  double v1Sum = 0.0;
522  double v2Sum = 0.0;
523  double andSum = 0.0;
524 
525  calcVectParams(v1, v2, v1Sum, v2Sum, andSum);
526 
527  double denom = a * v1Sum + b * v2Sum + (1 - a - b) * andSum;
528  double sim;
529 
530  if (fabs(denom) < 1e-6) {
531  sim = 0.0;
532  } else {
533  sim = andSum / denom;
534  }
535  if (returnDistance) sim = 1. - sim;
536  // std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
537  return sim;
538 }
539 
540 template <typename IndexType>
542  const SparseIntVect<IndexType> &v2,
543  bool returnDistance = false, double bounds = 0.0) {
544  return TverskySimilarity(v1, v2, 1.0, 1.0, returnDistance, bounds);
545 }
546 }
547 
548 #endif
const SparseIntVect< IndexType > operator|(const SparseIntVect< IndexType > &other) const
double DiceSimilarity(const SparseIntVect< IndexType > &v1, const SparseIntVect< IndexType > &v2, bool returnDistance=false, double bounds=0.0)
SparseIntVect(const char *pkl, const unsigned int len)
constructor from a pickle
Definition: SparseIntVect.h:48
const SparseIntVect< IndexType > operator &(const SparseIntVect< IndexType > &other) const
void updateFromSequence(SparseIntVect< IndexType > &vect, const SequenceType &seq)
const int ci_SPARSEINTVECT_VERSION
version number to use in pickles
Definition: SparseIntVect.h:23
const SparseIntVect< IndexType > operator-(const SparseIntVect< IndexType > &other) const
void streamRead(std::istream &ss, T &loc)
does a binary read of an object from a stream
Definition: StreamOps.h:245
~SparseIntVect()
destructor (doesn&#39;t need to do anything)
Definition: SparseIntVect.h:53
const StorageType & getNonzeroElements() const
returns our nonzero elements as a map(IndexType->int)
std::map< IndexType, int > StorageType
Definition: SparseIntVect.h:30
SparseIntVect< IndexType > & operator*(int v)
SparseIntVect(const SparseIntVect< IndexType > &other)
Copy constructor.
Definition: SparseIntVect.h:38
const SparseIntVect< IndexType > operator+(const SparseIntVect< IndexType > &other) const
int getVal(IndexType idx) const
return the value at an index
Definition: SparseIntVect.h:66
IndexType getLength() const
returns the length
Definition: SparseIntVect.h:99
double TanimotoSimilarity(const SparseIntVect< IndexType > &v1, const SparseIntVect< IndexType > &v2, bool returnDistance=false, double bounds=0.0)
SparseIntVect< IndexType > & operator/=(int v)
SparseIntVect< IndexType > & operator/(int v)
int operator[](IndexType idx) const
support indexing using []
Definition: SparseIntVect.h:96
Std stuff.
Definition: Atom.h:30
SparseIntVect< IndexType > & operator-=(const SparseIntVect< IndexType > &other)
Class to allow us to throw an IndexError from C++ and have it make it back to Python.
Definition: Exceptions.h:19
unsigned int size() const
returns the length
#define RDUNUSED_PARAM(x)
Definition: Invariant.h:195
SparseIntVect(IndexType length)
initialize with a particular length
Definition: SparseIntVect.h:35
SparseIntVect< IndexType > & operator*=(int v)
double TverskySimilarity(const SparseIntVect< IndexType > &v1, const SparseIntVect< IndexType > &v2, double a, double b, bool returnDistance=false, double bounds=0.0)
int getTotalVal(bool doAbs=false) const
bool operator!=(const SparseIntVect< IndexType > &v2) const
void streamWrite(std::ostream &ss, const T &val)
does a binary write of an object to a stream
Definition: StreamOps.h:225
SparseIntVect< IndexType > & operator &=(const SparseIntVect< IndexType > &other)
#define PRECONDITION(expr, mess)
Definition: Invariant.h:108
SparseIntVect(const std::string &pkl)
constructor from a pickle
Definition: SparseIntVect.h:44
a class for efficiently storing sparse vectors of ints
Definition: SparseIntVect.h:28
void setVal(IndexType idx, int val)
set the value at an index
Definition: SparseIntVect.h:79
SparseIntVect< IndexType > & operator+=(const SparseIntVect< IndexType > &other)
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition: Exceptions.h:33
SparseIntVect< IndexType > & operator-=(int v)
SparseIntVect< IndexType > & operator|=(const SparseIntVect< IndexType > &other)
SparseIntVect< IndexType > & operator+=(int v)
SparseIntVect< IndexType > & operator+(int v)
std::string toString() const
returns a binary string representation (pickle)
bool operator==(const SparseIntVect< IndexType > &v2) const
void fromString(const std::string &txt)
SparseIntVect< IndexType > & operator-(int v)