RDKit
Open-source cheminformatics and machine learning.
SparseIntVect.h
Go to the documentation of this file.
1 // $Id$
2 //
3 // Copyright (C) 2007-2008 Greg Landrum
4 //
5 // @@ All Rights Reserved @@
6 // This file is part of the RDKit.
7 // The contents are covered by the terms of the BSD license
8 // which is included in the file license.txt, found at the root
9 // of the RDKit source tree.
10 //
11 #ifndef __RD_SPARSE_INT_VECT_20070921__
12 #define __RD_SPARSE_INT_VECT_20070921__
13 
14 #include <map>
15 #include <string>
16 #include <RDGeneral/Invariant.h>
17 #include <sstream>
18 #include <RDGeneral/Exceptions.h>
19 #include <RDGeneral/StreamOps.h>
20 #include <boost/cstdint.hpp>
21 
22 
23 const int ci_SPARSEINTVECT_VERSION=0x0001; //!< version number to use in pickles
24 namespace RDKit{
25  //! a class for efficiently storing sparse vectors of ints
26  template <typename IndexType>
27  class SparseIntVect {
28  public:
29  typedef std::map<IndexType,int> StorageType;
30 
31  SparseIntVect() : d_length(0) {};
32 
33  //! initialize with a particular length
34  SparseIntVect(IndexType length) : d_length(length) {};
35 
36  //! Copy constructor
38  d_length=other.d_length;
39  d_data.insert(other.d_data.begin(),other.d_data.end());
40  }
41 
42  //! constructor from a pickle
43  SparseIntVect(const std::string pkl){
44  initFromText(pkl.c_str(),pkl.size());
45  };
46  //! constructor from a pickle
47  SparseIntVect(const char *pkl,const unsigned int len){
48  initFromText(pkl,len);
49  };
50 
51  //! destructor (doesn't need to do anything)
53 
54 #ifdef __clang__
55 #pragma clang diagnostic push
56 #pragma clang diagnostic ignored "-Wtautological-compare"
57 #endif
58  //! return the value at an index
59  int getVal(IndexType idx) const {
60  if(idx<0||idx>=d_length){
61  throw IndexErrorException(static_cast<int>(idx));
62  }
63  int res=0;
64  typename StorageType::const_iterator iter=d_data.find(idx);
65  if(iter!=d_data.end()){
66  res=iter->second;
67  }
68  return res;
69  };
70 
71  //! set the value at an index
72  void setVal(IndexType idx, int val){
73  if(idx<0||idx>=d_length){
74  throw IndexErrorException(static_cast<int>(idx));
75  }
76  if(val!=0){
77  d_data[idx]=val;
78  } else {
79  d_data.erase(idx);
80  }
81  };
82 #ifdef __clang__
83 #pragma clang diagnostic pop
84 #endif
85  //! support indexing using []
86  int operator[] (IndexType idx) const { return getVal(idx); };
87 
88  //! returns the length
89  IndexType getLength() const { return d_length; };
90 
91  //! returns the sum of all the elements in the vect
92  //! the doAbs argument toggles summing the absolute values of the elements
93  int getTotalVal(bool doAbs=false) const {
94  int res=0;
95  typename StorageType::const_iterator iter;
96  for(iter=d_data.begin();iter!=d_data.end();++iter){
97  if(!doAbs) res+=iter->second;
98  else res+=abs(iter->second);
99  }
100  return res;
101  };
102  //! returns the length
103  unsigned int size() const { return getLength(); };
104 
105 
106  //! returns our nonzero elements as a map(IndexType->int)
107  const StorageType &getNonzeroElements() const {
108  return d_data;
109  }
110 
111 
112  //! this is a "fuzzy" intesection, the final value
113  //! of each element is equal to the minimum from
114  //! the two vects.
117  if(other.d_length!=d_length){
118  throw ValueErrorException("SparseIntVect size mismatch");
119  }
120 
121  typename StorageType::iterator iter=d_data.begin();
122  typename StorageType::const_iterator oIter=other.d_data.begin();
123  while(iter!=d_data.end()){
124  // we're relying on the fact that the maps are sorted:
125  while(oIter!=other.d_data.end() && oIter->first < iter->first){
126  ++oIter;
127  }
128  if(oIter!=other.d_data.end() && oIter->first==iter->first){
129  // found it:
130  if(oIter->second<iter->second){
131  iter->second=oIter->second;
132  }
133  ++oIter;
134  ++iter;
135  } else {
136  // not there; our value is zero, which means
137  // we should remove this value:
138  typename StorageType::iterator tmpIter=iter;
139  ++tmpIter;
140  d_data.erase(iter);
141  iter=tmpIter;
142  }
143  }
144  return *this;
145  };
147  operator& (const SparseIntVect<IndexType> &other) const {
148  SparseIntVect<IndexType> res(*this);
149  return res&=other;
150  }
151 
152  //! this is a "fuzzy" union, the final value
153  //! of each element is equal to the maximum from
154  //! the two vects.
157  if(other.d_length!=d_length){
158  throw ValueErrorException("SparseIntVect size mismatch");
159  }
160 
161  typename StorageType::iterator iter=d_data.begin();
162  typename StorageType::const_iterator oIter=other.d_data.begin();
163  while(iter!=d_data.end()){
164  // we're relying on the fact that the maps are sorted:
165  while(oIter!=other.d_data.end() &&
166  oIter->first < iter->first){
167  d_data[oIter->first]=oIter->second;
168  ++oIter;
169  }
170  if(oIter!=other.d_data.end() && oIter->first==iter->first){
171  // found it:
172  if(oIter->second>iter->second){
173  iter->second=oIter->second;
174  }
175  ++oIter;
176  }
177  ++iter;
178  }
179  // finish up the other vect:
180  while(oIter!=other.d_data.end()){
181  d_data[oIter->first]=oIter->second;
182  ++oIter;
183  }
184  return *this;
185  };
187  operator| (const SparseIntVect<IndexType> &other) const {
188  SparseIntVect<IndexType> res(*this);
189  return res|=other;
190  }
191 
194  if(other.d_length!=d_length){
195  throw ValueErrorException("SparseIntVect size mismatch");
196  }
197  typename StorageType::iterator iter=d_data.begin();
198  typename StorageType::const_iterator oIter=other.d_data.begin();
199  while(oIter!=other.d_data.end()){
200  while(iter!=d_data.end() &&
201  iter->first < oIter->first){
202  ++iter;
203  }
204  if(iter!=d_data.end() && oIter->first==iter->first){
205  // found it:
206  iter->second+=oIter->second;
207  if(!iter->second){
208  typename StorageType::iterator tIter=iter;
209  ++tIter;
210  d_data.erase(iter);
211  iter=tIter;
212  } else {
213  ++iter;
214  }
215  } else {
216  d_data[oIter->first]=oIter->second;
217  }
218  ++oIter;
219  }
220  return *this;
221  };
223  operator+ (const SparseIntVect<IndexType> &other) const {
224  SparseIntVect<IndexType> res(*this);
225  return res+=other;
226  }
227 
230  if(other.d_length!=d_length){
231  throw ValueErrorException("SparseIntVect size mismatch");
232  }
233  typename StorageType::iterator iter=d_data.begin();
234  typename StorageType::const_iterator oIter=other.d_data.begin();
235  while(oIter!=other.d_data.end()){
236  while(iter!=d_data.end() &&
237  iter->first < oIter->first){
238  ++iter;
239  }
240  if(iter!=d_data.end() && oIter->first==iter->first){
241  // found it:
242  iter->second-=oIter->second;
243  if(!iter->second){
244  typename StorageType::iterator tIter=iter;
245  ++tIter;
246  d_data.erase(iter);
247  iter=tIter;
248  } else {
249  ++iter;
250  }
251  } else {
252  d_data[oIter->first] = -oIter->second;
253  }
254  ++oIter;
255  }
256  return *this;
257  };
259  operator- (const SparseIntVect<IndexType> &other) const {
260  SparseIntVect<IndexType> res(*this);
261  return res-=other;
262  }
264  operator*= (int v) {
265  typename StorageType::iterator iter=d_data.begin();
266  while(iter!=d_data.end()){
267  iter->second *= v;
268  ++iter;
269  }
270  return *this;
271  };
273  operator* (int v) {
274  SparseIntVect<IndexType> res(*this);
275  return res*=v;
276  };
278  operator/= (int v) {
279  typename StorageType::iterator iter=d_data.begin();
280  while(iter!=d_data.end()){
281  iter->second /= v;
282  ++iter;
283  }
284  return *this;
285  };
287  operator/ (int v) {
288  SparseIntVect<IndexType> res(*this);
289  return res/=v;
290  };
292  operator+= (int v) {
293  typename StorageType::iterator iter=d_data.begin();
294  while(iter!=d_data.end()){
295  iter->second += v;
296  ++iter;
297  }
298  return *this;
299  };
301  operator+ (int v) {
302  SparseIntVect<IndexType> res(*this);
303  return res+=v;
304  };
306  operator-= (int v) {
307  typename StorageType::iterator iter=d_data.begin();
308  while(iter!=d_data.end()){
309  iter->second -= v;
310  ++iter;
311  }
312  return *this;
313  };
315  operator- (int v) {
316  SparseIntVect<IndexType> res(*this);
317  return res-=v;
318  };
319 
320  bool operator==(const SparseIntVect<IndexType> &v2) const{
321  if(d_length!=v2.d_length){
322  return false;
323  }
324  return d_data==v2.d_data;
325  }
326  bool operator!=(const SparseIntVect<IndexType> &v2) const {
327  return !(*this==v2);
328  }
329 
330  //! returns a binary string representation (pickle)
331  std::string toString() const {
332  std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
333  boost::uint32_t tInt;
335  streamWrite(ss,tInt);
336  tInt=sizeof(IndexType);
337  streamWrite(ss,tInt);
338  streamWrite(ss,d_length);
339  IndexType nEntries=d_data.size();
340  streamWrite(ss,nEntries);
341 
342  typename StorageType::const_iterator iter=d_data.begin();
343  while(iter!=d_data.end()){
344  streamWrite(ss,iter->first);
345  boost::int32_t tInt=iter->second;
346  streamWrite(ss,tInt);
347  ++iter;
348  }
349  return ss.str();
350  };
351 
352  void fromString(const std::string &txt) {
353  initFromText(txt.c_str(),txt.length());
354  }
355 
356  private:
357  IndexType d_length;
358  StorageType d_data;
359 
360  void initFromText(const char *pkl,const unsigned int len) {
361  d_data.clear();
362  std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
363  ss.write(pkl,len);
364 
365  boost::uint32_t vers;
366  streamRead(ss,vers);
367  if(vers==0x0001){
368  boost::uint32_t tInt;
369  streamRead(ss,tInt);
370  if(tInt>sizeof(IndexType)){
371  throw ValueErrorException("IndexType cannot accomodate index size in SparseIntVect pickle");
372  }
373  switch(tInt){
374  case sizeof(char):
375  readVals<unsigned char>(ss);break;
376  case sizeof(boost::int32_t):
377  readVals<boost::uint32_t>(ss);break;
378  case sizeof(boost::int64_t):
379  readVals<boost::uint64_t>(ss);break;
380  default:
381  throw ValueErrorException("unreadable format");
382  }
383  } else {
384  throw ValueErrorException("bad version in SparseIntVect pickle");
385  }
386  };
387  template <typename T>
388  void readVals(std::stringstream &ss){
389  PRECONDITION(sizeof(T)<=sizeof(IndexType),"invalid size");
390  T tVal;
391  streamRead(ss,tVal);
392  d_length=tVal;
393  T nEntries;
394  streamRead(ss,nEntries);
395  for(T i=0;i<nEntries;++i){
396  streamRead(ss,tVal);
397  boost::int32_t val;
398  streamRead(ss,val);
399  d_data[tVal]=val;
400  }
401  }
402  };
403 
404  template <typename IndexType, typename SequenceType>
406  const SequenceType &seq){
407  typename SequenceType::const_iterator seqIt;
408  for(seqIt=seq.begin();seqIt!=seq.end();++seqIt){
409  // EFF: probably not the most efficient approach
410  IndexType idx=*seqIt;
411  vect.setVal(idx,vect.getVal(idx)+1);
412  }
413  }
414 
415  namespace {
416  template <typename IndexType>
417  void calcVectParams(const SparseIntVect<IndexType> &v1,
418  const SparseIntVect<IndexType> &v2,
419  double &v1Sum,double &v2Sum,
420  double &andSum){
421  if(v1.getLength()!=v2.getLength()){
422  throw ValueErrorException("SparseIntVect size mismatch");
423  }
424  v1Sum=v2Sum=andSum=0.0;
425  // we're doing : (v1&v2).getTotalVal(), but w/o generating
426  // the other vector:
428  iter1=v1.getNonzeroElements().begin();
429  if(iter1!=v1.getNonzeroElements().end()) v1Sum+=abs(iter1->second);
430  iter2=v2.getNonzeroElements().begin();
431  if(iter2!=v2.getNonzeroElements().end()) v2Sum+=abs(iter2->second);
432  while(iter1 != v1.getNonzeroElements().end()){
433  while(iter2!=v2.getNonzeroElements().end() && iter2->first < iter1->first){
434  ++iter2;
435  if(iter2!=v2.getNonzeroElements().end()) v2Sum+=abs(iter2->second);
436  }
437  if(iter2!=v2.getNonzeroElements().end()){
438  if(iter2->first == iter1->first){
439  if(abs(iter2->second)<abs(iter1->second)){
440  andSum += abs(iter2->second);
441  } else {
442  andSum += abs(iter1->second);
443  }
444  ++iter2;
445  if(iter2!=v2.getNonzeroElements().end()) v2Sum+=abs(iter2->second);
446  }
447  ++iter1;
448  if(iter1!=v1.getNonzeroElements().end()) v1Sum+=abs(iter1->second);
449  } else {
450  break;
451  }
452  }
453  if(iter1 != v1.getNonzeroElements().end()){
454  ++iter1;
455  while(iter1!=v1.getNonzeroElements().end()){
456  v1Sum+=abs(iter1->second);
457  ++iter1;
458  }
459  }
460  if(iter2!=v2.getNonzeroElements().end()){
461  ++iter2;
462  while(iter2!=v2.getNonzeroElements().end()){
463  v2Sum+=abs(iter2->second);
464  ++iter2;
465  }
466  }
467  }
468  }
469 
470  template <typename IndexType>
472  const SparseIntVect<IndexType> &v2,
473  bool returnDistance=false,
474  double bounds=0.0){
475  if(v1.getLength()!=v2.getLength()){
476  throw ValueErrorException("SparseIntVect size mismatch");
477  }
478  double v1Sum=0.0;
479  double v2Sum=0.0;
480  if(!returnDistance && bounds>0.0){
481  v1Sum=v1.getTotalVal(true);
482  v2Sum=v2.getTotalVal(true);
483  double denom=v1Sum+v2Sum;
484  if(fabs(denom)<1e-6){
485  if(returnDistance){
486  return 1.0;
487  } else {
488  return 0.0;
489  }
490  }
491  double minV=v1Sum<v2Sum?v1Sum:v2Sum;
492  if(2.*minV/denom<bounds){
493  return 0.0;
494  }
495  v1Sum=0.0;
496  v2Sum=0.0;
497  }
498 
499  double numer=0.0;
500 
501  calcVectParams(v1,v2,v1Sum,v2Sum,numer);
502 
503  double denom=v1Sum+v2Sum;
504  double sim;
505  if(fabs(denom)<1e-6){
506  sim=0.0;
507  } else {
508  sim=2.*numer/denom;
509  }
510  if(returnDistance) sim = 1.-sim;
511  //std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
512  return sim;
513  }
514 
515 
516  template <typename IndexType>
518  const SparseIntVect<IndexType> &v2,
519  double a, double b,
520  bool returnDistance=false,
521  double bounds=0.0){
522  if(v1.getLength()!=v2.getLength()){
523  throw ValueErrorException("SparseIntVect size mismatch");
524  }
525  double v1Sum=0.0;
526  double v2Sum=0.0;
527  double andSum=0.0;
528 
529  calcVectParams(v1,v2,v1Sum,v2Sum,andSum);
530 
531  double denom=a*v1Sum+b*v2Sum+(1-a-b)*andSum;
532  double sim;
533 
534  if(fabs(denom)<1e-6){
535  sim=0.0;
536  } else {
537  sim=andSum/denom;
538  }
539  if(returnDistance) sim = 1.-sim;
540  //std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
541  return sim;
542  }
543 
544  template <typename IndexType>
546  const SparseIntVect<IndexType> &v2,
547  bool returnDistance=false,
548  double bounds=0.0){
549  return TverskySimilarity(v1,v2,1.0,1.0,returnDistance,bounds);
550  }
551 
552 }
553 
554 
555 
556 #endif
std::string toString() const
returns a binary string representation (pickle)
double DiceSimilarity(const SparseIntVect< IndexType > &v1, const SparseIntVect< IndexType > &v2, bool returnDistance=false, double bounds=0.0)
SparseIntVect(const char *pkl, const unsigned int len)
constructor from a pickle
Definition: SparseIntVect.h:47
void updateFromSequence(SparseIntVect< IndexType > &vect, const SequenceType &seq)
const SparseIntVect< IndexType > operator+(const SparseIntVect< IndexType > &other) const
const int ci_SPARSEINTVECT_VERSION
version number to use in pickles
Definition: SparseIntVect.h:23
std::map< IndexType, int > StorageType
Definition: SparseIntVect.h:29
void streamRead(std::istream &ss, T &loc)
does a binary read of an object from a stream
Definition: StreamOps.h:241
SparseIntVect< IndexType > & operator&=(const SparseIntVect< IndexType > &other)
~SparseIntVect()
destructor (doesn&#39;t need to do anything)
Definition: SparseIntVect.h:52
int getVal(IndexType idx) const
return the value at an index
Definition: SparseIntVect.h:59
const SparseIntVect< IndexType > operator&(const SparseIntVect< IndexType > &other) const
SparseIntVect< IndexType > & operator*(int v)
SparseIntVect(const SparseIntVect< IndexType > &other)
Copy constructor.
Definition: SparseIntVect.h:37
unsigned int size() const
returns the length
double TanimotoSimilarity(const SparseIntVect< IndexType > &v1, const SparseIntVect< IndexType > &v2, bool returnDistance=false, double bounds=0.0)
const SparseIntVect< IndexType > operator-(const SparseIntVect< IndexType > &other) const
bool operator!=(const SparseIntVect< IndexType > &v2) const
SparseIntVect< IndexType > & operator/=(int v)
SparseIntVect< IndexType > & operator/(int v)
Includes a bunch of functionality for handling Atom and Bond queries.
Definition: Atom.h:28
bool operator==(const SparseIntVect< IndexType > &v2) const
SparseIntVect< IndexType > & operator-=(const SparseIntVect< IndexType > &other)
Class to allow us to throw an IndexError from C++ and have it make it back to Python.
Definition: Exceptions.h:18
const StorageType & getNonzeroElements() const
returns our nonzero elements as a map(IndexType->int)
SparseIntVect(const std::string pkl)
constructor from a pickle
Definition: SparseIntVect.h:43
SparseIntVect(IndexType length)
initialize with a particular length
Definition: SparseIntVect.h:34
SparseIntVect< IndexType > & operator*=(int v)
const SparseIntVect< IndexType > operator|(const SparseIntVect< IndexType > &other) const
double TverskySimilarity(const SparseIntVect< IndexType > &v1, const SparseIntVect< IndexType > &v2, double a, double b, bool returnDistance=false, double bounds=0.0)
int getTotalVal(bool doAbs=false) const
Definition: SparseIntVect.h:93
void streamWrite(std::ostream &ss, const T &val)
does a binary write of an object to a stream
Definition: StreamOps.h:235
#define PRECONDITION(expr, mess)
Definition: Invariant.h:119
a class for efficiently storing sparse vectors of ints
Definition: SparseIntVect.h:27
void setVal(IndexType idx, int val)
set the value at an index
Definition: SparseIntVect.h:72
SparseIntVect< IndexType > & operator+=(const SparseIntVect< IndexType > &other)
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition: Exceptions.h:31
SparseIntVect< IndexType > & operator|=(const SparseIntVect< IndexType > &other)
IndexType getLength() const
returns the length
Definition: SparseIntVect.h:89
int operator[](IndexType idx) const
support indexing using []
Definition: SparseIntVect.h:86
void fromString(const std::string &txt)