casacore
ClassicalStatistics.h
Go to the documentation of this file.
1 //# Copyright (C) 2000,2001
2 //# Associated Universities, Inc. Washington DC, USA.
3 //#
4 //# This library is free software; you can redistribute it and/or modify it
5 //# under the terms of the GNU Library General Public License as published by
6 //# the Free Software Foundation; either version 2 of the License, or (at your
7 //# option) any later version.
8 //#
9 //# This library is distributed in the hope that it will be useful, but WITHOUT
10 //# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 //# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
12 //# License for more details.
13 //#
14 //# You should have received a copy of the GNU Library General Public License
15 //# along with this library; if not, write to the Free Software Foundation,
16 //# Inc., 675 Massachusetts Ave, Cambridge, MA 02139, USA.
17 //#
18 //# Correspondence concerning AIPS++ should be addressed as follows:
19 //# Internet email: aips2-request@nrao.edu.
20 //# Postal address: AIPS++ Project Office
21 //# National Radio Astronomy Observatory
22 //# 520 Edgemont Road
23 //# Charlottesville, VA 22903-2475 USA
24 //#
25 //# $Id: Array.h 21545 2015-01-22 19:36:35Z gervandiepen $
26 
27 #ifndef SCIMATH_CLASSICALSTATS_H
28 #define SCIMATH_CLASSICALSTATS_H
29 
30 #include <casacore/casa/aips.h>
31 
32 #include <casacore/scimath/Mathematics/StatisticsAlgorithm.h>
33 
34 #include <casacore/scimath/Mathematics/StatisticsTypes.h>
35 #include <casacore/scimath/Mathematics/StatisticsUtilities.h>
36 
37 #include <set>
38 #include <vector>
39 #include <utility>
40 
41 namespace casacore {
42 
43 template <class T> class PtrHolder;
44 
45 // Class to calculate statistics in a "classical" sense, ie using accumulators with no
46 // special filtering beyond optional range filtering etc.
47 //
48 // setCalculateAsAdded() allows one to specify if statistics should be calculated and updated
49 // on upon each call to set/addData(). If False, statistics will be calculated only when
50 // getStatistic(), getStatistics(), or similar methods are called. Setting this value to True
51 // allows the caller to not have to keep all the data accessible at once. Note however, that all
52 // data must be simultaneously accessible if quantile (eg median) calculations are desired.
53 
54 // I attempted to write this class using the Composite design pattern, with eg the
55 // _unweightedStats() and _weightedStats() methods in their own class, but for reasons I
56 // don't understand, that impacted performance significantly. So I'm using the current
57 // architecture, which I know is a bit a maintenance nightmare.
58 
59 template <class AccumType, class DataIterator, class MaskIterator=const Bool*, class WeightsIterator=DataIterator>
61  : public StatisticsAlgorithm<AccumType, DataIterator, MaskIterator, WeightsIterator> {
62 public:
63 
65 
66  // copy semantics
68 
69  virtual ~ClassicalStatistics();
70 
71  // copy semantics
74  );
75 
76  // get the algorithm that this object uses for computing stats
79  };
80 
81  // <group>
82  // In the following group of methods, if the size of the composite dataset
83  // is smaller than
84  // <src>binningThreshholdSizeBytes</src>, the composite dataset
85  // will be (perhaps partially) sorted and persisted in memory during the
86  // call. In that case, and if <src>persistSortedArray</src> is True, this
87  // sorted array will remain in memory after the call and will be used on
88  // subsequent calls of this method when <src>binningThreshholdSizeBytes</src>
89  // is greater than the size of the composite dataset. If
90  // <src>persistSortedArray</src> is False, the sorted array will not be
91  // stored after this call completes and so any subsequent calls for which the
92  // dataset size is less than <src>binningThreshholdSizeBytes</src>, the
93  // dataset will be sorted from scratch. Values which are not included due to
94  // non-unity strides, are not included in any specified ranges, are masked,
95  // or have associated weights of zero are not considered as dataset members
96  // for quantile computations.
97  // If one has a priori information regarding the number of points (npts) and/or
98  // the minimum and maximum values of the data set, these can be supplied to
99  // improve performance. Note however, that if these values are not correct, the
100  // resulting median
101  // and/or quantile values will also not be correct (although see the following notes regarding
102  // max/min). Note that if this object has already had getStatistics()
103  // called, and the min and max were calculated, there is no need to pass these values in
104  // as they have been stored internally and used (although passing them in shouldn't hurt
105  // anything). If provided, npts, the number of points falling in the specified ranges which are
106  // not masked and have weights > 0, should be exactly correct. <src>min</src> can be less than
107  // the true minimum, and <src>max</src> can be greater than the True maximum, but for best
108  // performance, these should be as close to the actual min and max as possible.
109  // In order for quantile computations to occur over multiple datasets, all datasets
110  // must be available. This means that if setCalculateAsAdded()
111  // was previously called by passing in a value of True, these methods will throw
112  // an exception as the previous call indicates that there is no guarantee that
113  // all datasets will be available. If one uses a data provider (by having called
114  // setDataProvider()), then this should not be an issue.
115 
116  // get the median of the distribution.
117  // For a dataset with an odd number of good points, the median is just the value
118  // at index int(N/2) in the equivalent sorted dataset, where N is the number of points.
119  // For a dataset with an even number of points, the median is the mean of the values at
120  // indices int(N/2)-1 and int(N/2) in the sorted dataset.
121  // <src>nBins</src> is the number of bins, per histogram, to use to bin the data. More
122  // bins decrease the likelihood that multiple passes of the data set will be necessary, but
123  // also increase the amount of memory used. If nBins is set to less than 1,000, it is
124  // automatically increased to 1,000; there should be no reason to ever set nBins to be
125  // this small.
126  virtual AccumType getMedian(
127  CountedPtr<uInt64> knownNpts=NULL, CountedPtr<AccumType> knownMin=NULL,
128  CountedPtr<AccumType> knownMax=NULL, uInt binningThreshholdSizeBytes=4096*4096,
129  Bool persistSortedArray=False, uInt64 nBins=10000
130  );
131 
132  // If one needs to compute both the median and quantile values, it is better to call
133  // getMedianAndQuantiles() rather than getMedian() and getQuantiles() separately, as the
134  // first will scan large data sets fewer times than calling the separate methods.
135  // The return value is the median; the quantiles are returned in the <src>quantiles</src> map.
136  // Values in the <src>fractions</src> set represent the locations in the CDF and should be
137  // between 0 and 1, exclusive.
138  virtual AccumType getMedianAndQuantiles(
139  std::map<Double, AccumType>& quantiles, const std::set<Double>& fractions,
140  CountedPtr<uInt64> knownNpts=NULL, CountedPtr<AccumType> knownMin=NULL,
141  CountedPtr<AccumType> knownMax=NULL,
142  uInt binningThreshholdSizeBytes=4096*4096, Bool persistSortedArray=False,
143  uInt64 nBins=10000
144  );
145 
146  // get the median of the absolute deviation about the median of the data.
147  virtual AccumType getMedianAbsDevMed(
148  CountedPtr<uInt64> knownNpts=NULL,
149  CountedPtr<AccumType> knownMin=NULL, CountedPtr<AccumType> knownMax=NULL,
150  uInt binningThreshholdSizeBytes=4096*4096, Bool persistSortedArray=False,
151  uInt64 nBins=10000
152  );
153 
154  // Get the specified quantiles. <src>fractions</src> must be between 0 and 1,
155  // noninclusive.
156  virtual std::map<Double, AccumType> getQuantiles(
157  const std::set<Double>& fractions, CountedPtr<uInt64> knownNpts=NULL,
158  CountedPtr<AccumType> knownMin=NULL, CountedPtr<AccumType> knownMax=NULL,
159  uInt binningThreshholdSizeBytes=4096*4096, Bool persistSortedArray=False,
160  uInt64 nBins=10000
161  );
162 
163  // </group>
164 
165  // scan the dataset(s) that have been added, and find the min and max.
166  // This method may be called even if setStatsToCaclulate has been called and
167  // MAX and MIN has been excluded. If setCalculateAsAdded(True) has previously been
168  // called after this object has been (re)initialized, an exception will be thrown.
169  virtual void getMinMax(AccumType& mymin, AccumType& mymax);
170 
171  // scan the dataset(s) that have been added, and find the number of good points.
172  // This method may be called even if setStatsToCaclulate has been called and
173  // NPTS has been excluded. If setCalculateAsAdded(True) has previously been
174  // called after this object has been (re)initialized, an exception will be thrown.
175  virtual uInt64 getNPts();
176 
177  // see base class description
178  virtual std::pair<Int64, Int64> getStatisticIndex(StatisticsData::STATS stat);
179 
180  // Has any data been added to this object? Will return False if the object has
181  // been reset and no data have been added afterward.
182  Bool hasData() const { return _hasData; }
183 
184  // reset object to initial state. Clears all private fields including data,
185  // accumulators, etc.
186  virtual void reset();
187 
188  // Should statistics be updated with calls to addData or should they only be calculated
189  // upon calls to getStatistics etc? Beware that calling this will automatically reinitialize
190  // the object, so that it will contain no references to data et al. after this method has
191  // been called.
192  virtual void setCalculateAsAdded(Bool c);
193 
194  // An exception will be thrown if setCalculateAsAdded(True) has been called.
196 
197  void setStatsToCalculate(std::set<StatisticsData::STATS>& stats);
198 
199 protected:
200 
201  // <group>
202  // scan through the data set to determine the number of good (unmasked, weight > 0,
203  // within range) points. The first with no mask, no
204  // ranges, and no weights is trivial with npts = nr in this class, but is implemented here
205  // so that derived classes may override it.
206  inline virtual void _accumNpts(
207  uInt64& npts,
208  const DataIterator& dataBegin, Int64 nr, uInt dataStride
209  ) const;
210 
211  virtual void _accumNpts(
212  uInt64& npts,
213  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
214  const DataRanges& ranges, Bool isInclude
215  ) const;
216 
217  virtual void _accumNpts(
218  uInt64& npts,
219  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
220  const MaskIterator& maskBegin, uInt maskStride
221  ) const;
222 
223  virtual void _accumNpts(
224  uInt64& npts,
225  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
226  const MaskIterator& maskBegin, uInt maskStride, const DataRanges& ranges,
227  Bool isInclude
228  ) const;
229 
230  virtual void _accumNpts(
231  uInt64& npts,
232  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
233  Int64 nr, uInt dataStride
234  ) const;
235 
236  virtual void _accumNpts(
237  uInt64& npts,
238  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
239  Int64 nr, uInt dataStride, const DataRanges& ranges, Bool isInclude
240  ) const;
241 
242  virtual void _accumNpts(
243  uInt64& npts,
244  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
245  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
246  const DataRanges& ranges, Bool isInclude
247  ) const;
248 
249  virtual void _accumNpts(
250  uInt64& npts,
251  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
252  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride
253  ) const;
254  // </group>
255 
256  // <group>
257  inline void _accumulate(
258  StatsData<AccumType>& stats, const AccumType& datum,
259  const LocationType& location
260  );
261 
262  inline void _accumulate(
263  StatsData<AccumType>& stats, const AccumType& datum,
264  const AccumType& weight, const LocationType& location
265  );
266  // </group>
267 
268  void _addData();
269 
270  void _clearStats();
271 
272  // scan dataset(s) to find min and max
273  void _doMinMax(AccumType& vmin, AccumType& vmax);
274 
275  // <group>
276  // Get the counts of data within the specified histogram bins. The number of
277  // arrays within binCounts will be equal to the number of histograms in <src>binDesc</src>.
278  // Each array within <src>binCounts</src> will have the same number of elements as the
279  // number of bins in its corresponding histogram in <src>binDesc</src>.
280  virtual void _findBins(
281  vector<vector<uInt64> >& binCounts,
282  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
283  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
284  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc,
285  const vector<AccumType>& maxLimit
286  ) const;
287 
288  virtual void _findBins(
289  vector<vector<uInt64> >& binCounts,
290  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
291  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
292  const DataRanges& ranges, Bool isInclude,
293  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
294  ) const;
295 
296  virtual void _findBins(
297  vector<vector<uInt64> >& binCounts,
298  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
299  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
300  const MaskIterator& maskBegin, uInt maskStride,
301  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
302  ) const;
303 
304  virtual void _findBins(
305  vector<vector<uInt64> >& binCounts,
306  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
307  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
308  const MaskIterator& maskBegin, uInt maskStride, const DataRanges& ranges,
309  Bool isInclude,
310  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
311  ) const;
312 
313  virtual void _findBins(
314  vector<vector<uInt64> >& binCounts,
315  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
316  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
317  Int64 nr, uInt dataStride,
318  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
319  ) const ;
320 
321  virtual void _findBins(
322  vector<vector<uInt64> >& binCounts,
323  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
324  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
325  Int64 nr, uInt dataStride, const DataRanges& ranges, Bool isInclude,
326  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
327  ) const;
328 
329  virtual void _findBins(
330  vector<vector<uInt64> >& binCounts,
331  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
332  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
333  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
334  const DataRanges& ranges, Bool isInclude,
335  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
336  ) const;
337 
338  virtual void _findBins(
339  vector<vector<uInt64> >& binCounts,
340  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
341  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
342  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
343  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
344  ) const;
345  // </group>
346 
347  Bool _getDoMaxMin() const { return _doMaxMin; }
348 
349  Int64 _getIDataset() const { return _idataset; }
350 
351  virtual StatsData<AccumType> _getInitialStats() const;
352 
353  AccumType _getStatistic(StatisticsData::STATS stat);
354 
356 
357  // retreive stats structure. Allows derived classes to maintain their own
358  // StatsData structs.
359  inline virtual StatsData<AccumType>& _getStatsData() { return _statsData; }
360 
361  inline virtual const StatsData<AccumType>& _getStatsData() const { return _statsData; }
362 
363  // <group>
364  virtual void _minMax(
366  const DataIterator& dataBegin, Int64 nr, uInt dataStride
367  ) const;
368 
369  virtual void _minMax(
371  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
372  const DataRanges& ranges, Bool isInclude
373  ) const;
374 
375  virtual void _minMax(
377  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
378  const MaskIterator& maskBegin, uInt maskStride
379  ) const;
380 
381  virtual void _minMax(
383  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
384  const MaskIterator& maskBegin, uInt maskStride, const DataRanges& ranges,
385  Bool isInclude
386  ) const;
387 
388  virtual void _minMax(
390  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
391  Int64 nr, uInt dataStride
392  ) const;
393 
394  virtual void _minMax(
396  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
397  Int64 nr, uInt dataStride, const DataRanges& ranges, Bool isInclude
398  ) const;
399 
400  virtual void _minMax(
402  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
403  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
404  const DataRanges& ranges, Bool isInclude
405  ) const;
406 
407  virtual void _minMax(
409  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
410  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride
411  ) const;
412  // </group>
413 
414  //<group>
415  // populate an unsorted array with valid data.
416  // no weights, no mask, no ranges
417  virtual void _populateArray(
418  vector<AccumType>& ary, const DataIterator& dataBegin, Int64 nr, uInt dataStride
419  ) const;
420 
421  // ranges
422  virtual void _populateArray(
423  vector<AccumType>& ary, const DataIterator& dataBegin, Int64 nr,
424  uInt dataStride, const DataRanges& ranges, Bool isInclude
425  ) const;
426 
427  virtual void _populateArray(
428  vector<AccumType>& ary, const DataIterator& dataBegin,
429  Int64 nr, uInt dataStride, const MaskIterator& maskBegin,
430  uInt maskStride
431  ) const;
432 
433  // mask and ranges
434  virtual void _populateArray(
435  vector<AccumType>& ary, const DataIterator& dataBegin, Int64 nr,
436  uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
437  const DataRanges& ranges, Bool isInclude
438  ) const;
439 
440  // weights
441  virtual void _populateArray(
442  vector<AccumType>& ary, const DataIterator& dataBegin,
443  const WeightsIterator& weightsBegin, Int64 nr, uInt dataStride
444  ) const;
445 
446  // weights and ranges
447  virtual void _populateArray(
448  vector<AccumType>& ary, const DataIterator& dataBegin,
449  const WeightsIterator& weightsBegin, Int64 nr, uInt dataStride,
450  const DataRanges& ranges, Bool isInclude
451  ) const;
452 
453  // weights and mask
454  virtual void _populateArray(
455  vector<AccumType>& ary, const DataIterator& dataBegin,
456  const WeightsIterator& weightBegin, Int64 nr, uInt dataStride,
457  const MaskIterator& maskBegin, uInt maskStride
458  ) const;
459 
460  // weights, mask, ranges
461  virtual void _populateArray(
462  vector<AccumType>& ary, const DataIterator& dataBegin, const WeightsIterator& weightBegin,
463  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
464  const DataRanges& ranges, Bool isInclude
465  ) const;
466  // </group>
467 
468  // <group>
469  // Create a vector of unsorted arrays, one array for each bin defined by <src>includeLimits</src>.
470  // <src>includeLimits</src> should be non-overlapping and should be given in ascending order (the
471  // algorithm used assumes this). Once the sum of the lengths of all arrays equals <src>maxCount</src>
472  // the method will return with no further processing.
473  // no weights, no mask, no ranges
474  virtual void _populateArrays(
475  vector<vector<AccumType> >& arys, uInt64& currentCount, const DataIterator& dataBegin, Int64 nr, uInt dataStride,
476  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt64 maxCount
477  ) const;
478 
479  // ranges
480  virtual void _populateArrays(
481  vector<vector<AccumType> >& arys, uInt64& currentCount, const DataIterator& dataBegin, Int64 nr,
482  uInt dataStride, const DataRanges& ranges, Bool isInclude,
483  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt64 maxCount
484  ) const;
485 
486  virtual void _populateArrays(
487  vector<vector<AccumType> >& arys, uInt64& currentCount, const DataIterator& dataBegin,
488  Int64 nr, uInt dataStride, const MaskIterator& maskBegin,
489  uInt maskStride,
490  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt64 maxCount
491  ) const;
492 
493  // mask and ranges
494  virtual void _populateArrays(
495  vector<vector<AccumType> >& arys, uInt64& currentCount, const DataIterator& dataBegin, Int64 nr,
496  uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
497  const DataRanges& ranges, Bool isInclude,
498  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt64 maxCount
499  ) const;
500 
501  // weights
502  virtual void _populateArrays(
503  vector<vector<AccumType> >& arys, uInt64& currentCount, const DataIterator& dataBegin,
504  const WeightsIterator& weightsBegin, Int64 nr, uInt dataStride,
505  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt64 maxCount
506  ) const;
507 
508  // weights and ranges
509  virtual void _populateArrays(
510  vector<vector<AccumType> >& arys, uInt64& currentCount, const DataIterator& dataBegin,
511  const WeightsIterator& weightsBegin, Int64 nr, uInt dataStride,
512  const DataRanges& ranges, Bool isInclude,
513  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt64 maxCount
514  ) const;
515 
516  // weights and mask
517  virtual void _populateArrays(
518  vector<vector<AccumType> >& arys, uInt64& currentCount, const DataIterator& dataBegin,
519  const WeightsIterator& weightBegin, Int64 nr, uInt dataStride,
520  const MaskIterator& maskBegin, uInt maskStride,
521  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt64 maxCount
522  ) const;
523 
524  // weights, mask, ranges
525  virtual void _populateArrays(
526  vector<vector<AccumType> >& arys, uInt64& currentCount, const DataIterator& dataBegin, const WeightsIterator& weightBegin,
527  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
528  const DataRanges& ranges, Bool isInclude,
529  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt64 maxCount
530  ) const;
531  // </group>
532 
533  // <group>
534  // no weights, no mask, no ranges
535  virtual Bool _populateTestArray(
536  vector<AccumType>& ary, const DataIterator& dataBegin,
537  Int64 nr, uInt dataStride, uInt maxElements
538  ) const;
539 
540  // ranges
541  virtual Bool _populateTestArray(
542  vector<AccumType>& ary, const DataIterator& dataBegin, Int64 nr,
543  uInt dataStride, const DataRanges& ranges, Bool isInclude,
544  uInt maxElements
545  ) const;
546 
547  // mask
548  virtual Bool _populateTestArray(
549  vector<AccumType>& ary, const DataIterator& dataBegin,
550  Int64 nr, uInt dataStride, const MaskIterator& maskBegin,
551  uInt maskStride, uInt maxElements
552  ) const;
553 
554  // mask and ranges
555  virtual Bool _populateTestArray(
556  vector<AccumType>& ary, const DataIterator& dataBegin, Int64 nr,
557  uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
558  const DataRanges& ranges, Bool isInclude, uInt maxElements
559  ) const;
560 
561  // weights
562  virtual Bool _populateTestArray(
563  vector<AccumType>& ary, const DataIterator& dataBegin,
564  const WeightsIterator& weightBegin, Int64 nr, uInt dataStride,
565  uInt maxElements
566  ) const;
567 
568  // weights and ranges
569  virtual Bool _populateTestArray(
570  vector<AccumType>& ary, const DataIterator& dataBegin,
571  const WeightsIterator& weightsBegin, Int64 nr, uInt dataStride,
572  const DataRanges& ranges, Bool isInclude, uInt maxElements
573  ) const;
574 
575  // weights and mask
576  virtual Bool _populateTestArray(
577  vector<AccumType>& ary, const DataIterator& dataBegin,
578  const WeightsIterator& weightBegin, Int64 nr,
579  uInt dataStride, const MaskIterator& maskBegin,
580  uInt maskStride, uInt maxElements
581  ) const;
582 
583  // weights, mask, ranges
584  virtual Bool _populateTestArray(
585  vector<AccumType>& ary, const DataIterator& dataBegin, const WeightsIterator& weightBegin,
586  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
587  const DataRanges& ranges, Bool isInclude,
588  uInt maxElements
589  ) const;
590  // </group>
591 
592  // <group>
593  // no weights, no mask, no ranges
594  virtual void _unweightedStats(
595  StatsData<AccumType>& stats, uInt64& ngood, LocationType& location,
596  const DataIterator& dataBegin, Int64 nr, uInt dataStride
597  );
598 
599  // no weights, no mask
600  virtual void _unweightedStats(
601  StatsData<AccumType>& stats, uInt64& ngood, LocationType& location,
602  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
603  const DataRanges& ranges, Bool isInclude
604  );
605 
606  virtual void _unweightedStats(
607  StatsData<AccumType>& stats, uInt64& ngood, LocationType& location,
608  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
609  const MaskIterator& maskBegin, uInt maskStride
610  );
611 
612  virtual void _unweightedStats(
613  StatsData<AccumType>& stats, uInt64& ngood, LocationType& location,
614  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
615  const MaskIterator& maskBegin, uInt maskStride,
616  const DataRanges& ranges, Bool isInclude
617  );
618 
619  // </group>
620  virtual void _updateDataProviderMaxMin(
621  const StatsData<AccumType>& threadStats
622  );
623 
624  // <group>
625  // has weights, but no mask, no ranges
626  virtual void _weightedStats(
627  StatsData<AccumType>& stats, LocationType& location,
628  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
629  Int64 nr, uInt dataStride
630  );
631 
632  virtual void _weightedStats(
633  StatsData<AccumType>& stats, LocationType& location,
634  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
635  Int64 nr, uInt dataStride, const DataRanges& ranges, Bool isInclude
636  );
637 
638  virtual void _weightedStats(
639  StatsData<AccumType>& stats, LocationType& location,
640  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
641  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride
642  );
643 
644  virtual void _weightedStats(
645  StatsData<AccumType>& stats, LocationType& location,
646  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
647  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
648  const DataRanges& ranges, Bool isInclude
649  );
650  // </group>
651 
652 private:
656  _hasData;
657 
658  // mutables, used to mitigate repeated code
659  mutable typename vector<DataIterator>::const_iterator _dend, _diter;
660  mutable vector<Int64>::const_iterator _citer;
661  mutable vector<uInt>::const_iterator _dsiter;
662  mutable std::map<uInt, MaskIterator> _masks;
663  mutable uInt _maskStride;
664  mutable std::map<uInt, WeightsIterator> _weights;
665  mutable std::map<uInt, DataRanges> _ranges;
666  mutable std::map<uInt, Bool> _isIncludeRanges;
669  mutable MaskIterator _myMask;
670  mutable DataIterator _myData;
671  mutable WeightsIterator _myWeights;
673  mutable uInt64 _myCount;
674 
675  static const uInt CACHE_PADDING;
676  static const uInt BLOCK_SIZE;
677 
678  // tally the number of data points that fall into each bin provided by <src>binDesc</src>
679  // Any points that are less than binDesc.minLimit or greater than
680  // binDesc.minLimit + binDesc.nBins*binDesc.binWidth are not included in the counts. A data
681  // point that falls exactly on a bin boundary is considered to be in the higher index bin.
682  // <src>sameVal</src> will be non-null if all the good values in the histogram range are the
683  // same. In that case, the value held will be the value of each of those data points.
684  vector<vector<uInt64> > _binCounts(
685  vector<CountedPtr<AccumType> >& sameVal,
686  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc
687  );
688 
689  void _computeBins(
690  vector<vector<uInt64> >& bins, vector<CountedPtr<AccumType> >& sameVal,
691  vector<Bool>& allSame, DataIterator dataIter, MaskIterator maskIter,
692  WeightsIterator weightsIter, uInt64 count,
693  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc,
694  const vector<AccumType>& maxLimit
695  );
696 
697  void _computeDataArray(
698  vector<AccumType>& ary, DataIterator dataIter,
699  MaskIterator maskIter, WeightsIterator weightsIter,
700  uInt64 dataCount
701  );
702 
703  void _computeDataArrays(
704  vector<vector<AccumType> >& arys, uInt64& currentCount,
705  DataIterator dataIter, MaskIterator maskIter,
706  WeightsIterator weightsIter, uInt64 dataCount,
707  const vector<std::pair<AccumType, AccumType> >& includeLimits,
708  uInt64 maxCount
709  );
710 
711  void _computeMinMax(
713  DataIterator dataIter, MaskIterator maskIter,
714  WeightsIterator weightsIter, uInt64 dataCount
715  );
716 
717  void _computeStats(
718  StatsData<AccumType>& stats, uInt64& ngood, LocationType& location,
719  DataIterator dataIter, MaskIterator maskIter,
720  WeightsIterator weightsIter, uInt64 count
721  );
722 
723  // convert in place by taking the absolute value of the difference of the vector and the median
724  static void _convertToAbsDevMedArray(vector<AccumType>& myArray, AccumType median);
725 
726  // Create an unsorted array of the complete data set. If <src>includeLimits</src> is specified,
727  // only points within those limits (including min but excluding max, as per definition of bins),
728  // are included.
729  void _createDataArray(
730  vector<AccumType>& array
731  );
732 
733  void _createDataArrays(
734  vector<vector<AccumType> >& arrays,
735  const vector<std::pair<AccumType, AccumType> > &includeLimits,
736  uInt64 maxCount
737  );
738  // extract data from multiple histograms given by <src>binDesc</src>.
739  // <src>dataIndices</src> represent the indices of the sorted arrays of values to
740  // extract. There should be exactly one set of data indices to extract for each
741  // supplied histogram. The data indices are relative to the minimum value of the minimum
742  // bin in their repsective histograms. The ordering of the maps in the returned vector represent
743  // the ordering of histograms in <src>binDesc</src>. <src>binDesc</src> should contain
744  // non-overlapping histograms and the histograms should be specified in ascending order.
745  vector<std::map<uInt64, AccumType> > _dataFromMultipleBins(
746  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, uInt64 maxArraySize,
747  const vector<std::set<uInt64> >& dataIndices, uInt64 nBins
748  );
749 
750  vector<std::map<uInt64, AccumType> > _dataFromSingleBins(
751  const vector<uInt64>& binNpts, uInt64 maxArraySize,
752  const vector<std::pair<AccumType, AccumType> >& binLimits,
753  const vector<std::set<uInt64> >& dataIndices, uInt64 nBins
754  );
755 
756  Int64 _doNpts();
757 
758  // increment the relevant loop counters
759  Bool _increment(Bool includeIDataset);
760 
761  // increment thread-based iterators
763  DataIterator& dataIter, MaskIterator& maskIter,
764  WeightsIterator& weightsIter, uInt64& offset, uInt nthreads
765  ) const;
766 
767  // get the values for the specified indices in the sorted array of all good data
768  std::map<uInt64, AccumType> _indicesToValues(
769  CountedPtr<uInt64> knownNpts, CountedPtr<AccumType> knownMin,
770  CountedPtr<AccumType> knownMax, uInt64 maxArraySize,
771  const std::set<uInt64>& dataIndices, Bool persistSortedArray,
772  uInt64 nBins
773  );
774 
775  void _initIterators();
776 
777  void _initLoopVars();
778 
779  void _initThreadVars(
780  uInt& nBlocks, uInt64& extra, uInt& nthreads, PtrHolder<DataIterator>& dataIter,
781  PtrHolder<MaskIterator>& maskIter, PtrHolder<WeightsIterator>& weightsIter,
782  PtrHolder<uInt64>& offset, uInt nThreadsMax
783  ) const;
784 
785  // Determine by scanning the dataset if the number of good points is smaller than
786  // <src>maxArraySize</src>. If so, <src>arrayToSort</src> will contain the unsorted
787  // data values. If not, this vector will be empty.
788  Bool _isNptsSmallerThan(vector<AccumType>& arrayToSort, uInt maxArraySize);
789 
790  // If <src>allowPad</src> is True, then pad the lower side of the lowest bin and the
791  // higher side of the highest bin so that minData and maxData do not fall on the edge
792  // of their respective bins. If false, no padding so that minData and maxData are also
793  // exactly the histogram abscissa limits.
794  static void _makeBins(
795  typename StatisticsUtilities<AccumType>::BinDesc& bins, AccumType minData, AccumType maxData, uInt maxBins,
796  Bool allowPad
797  );
798 
799  static void _mergeResults(
800  vector<vector<uInt64> >& bins, vector<CountedPtr<AccumType> >& sameVal,
801  vector<Bool>& allSame, const PtrHolder<vector<vector<uInt64> > >& tBins,
802  const PtrHolder<vector<CountedPtr<AccumType> > >& tSameVal,
803  const PtrHolder<vector<Bool> >& tAllSame, uInt nThreadsMax
804  );
805 
806  // get the index (for odd npts) or indices (for even npts) of the median of the sorted array.
807  // If knownNpts is not null, it will be used and must be correct. If it is null, the value of
808  // _npts will be used if it has been previously calculated. If not, the data sets will
809  // be scanned to determine npts.
810  std::set<uInt64> _medianIndices(CountedPtr<uInt64> knownNpts);
811 
812  uInt _nThreadsMax() const;
813 
814  uInt _threadIdx() const;
815 
816  // get values from sorted array if the array is small enough to be held in
817  // memory. Note that this is the array containing all good data, not data in
818  // just a single bin representing a subset of good data.
819  // Returns True if the data were successfully retrieved.
820  // If True is returned, the values map will contain a map of index to value.
822  std::map<uInt64, AccumType>& values, CountedPtr<uInt64> knownNpts,
823  const std::set<uInt64>& indices, uInt64 maxArraySize,
824  Bool persistSortedArray
825  );
826 };
827 
828 }
829 
830 #ifndef CASACORE_NO_AUTO_TEMPLATES
831 #include <casacore/scimath/Mathematics/ClassicalStatistics.tcc>
832 #endif //# CASACORE_NO_AUTO_TEMPLATES
833 
834 #endif
void _doMinMax(AccumType &vmin, AccumType &vmax)
scan dataset(s) to find min and max
vector< std::map< uInt64, AccumType > > _dataFromMultipleBins(const vector< typename StatisticsUtilities< AccumType >::BinDesc > &binDesc, uInt64 maxArraySize, const vector< std::set< uInt64 > > &dataIndices, uInt64 nBins)
extract data from multiple histograms given by binDesc.
Bool _valuesFromSortedArray(std::map< uInt64, AccumType > &values, CountedPtr< uInt64 > knownNpts, const std::set< uInt64 > &indices, uInt64 maxArraySize, Bool persistSortedArray)
get values from sorted array if the array is small enough to be held in memory.
vector< DataIterator >::const_iterator _dend
mutables, used to mitigate repeated code
long long Int64
Define the extra non-standard types used by Casacore (like proposed uSize, Size)
Definition: aipsxtype.h:38
void _computeMinMax(CountedPtr< AccumType > &mymax, CountedPtr< AccumType > &mymin, DataIterator dataIter, MaskIterator maskIter, WeightsIterator weightsIter, uInt64 dataCount)
LatticeExprNode median(const LatticeExprNode &expr)
AccumType _getStatistic(StatisticsData::STATS stat)
ClassicalStatistics< AccumType, DataIterator, MaskIterator, WeightsIterator > & operator=(const ClassicalStatistics< AccumType, DataIterator, MaskIterator, WeightsIterator > &other)
copy semantics
vector< DataIterator >::const_iterator _diter
virtual void _minMax(CountedPtr< AccumType > &mymin, CountedPtr< AccumType > &mymax, const DataIterator &dataBegin, Int64 nr, uInt dataStride) const
StatsData< AccumType > _getStatistics()
virtual StatsData< AccumType > & _getStatsData()
retreive stats structure.
TableExprNode array(const TableExprNode &values, const TableExprNodeSet &shape)
Create an array of the given shape and fill it with the values.
Definition: ExprNode.h:2093
void _createDataArray(vector< AccumType > &array)
Create an unsorted array of the complete data set.
unsigned long long uInt64
Definition: aipsxtype.h:39
std::set< uInt64 > _medianIndices(CountedPtr< uInt64 > knownNpts)
get the index (for odd npts) or indices (for even npts) of the median of the sorted array...
PtrHolder(const PtrHolder< T > &other)
virtual std::pair< Int64, Int64 > getStatisticIndex(StatisticsData::STATS stat)
see base class description
std::map< uInt64, AccumType > _indicesToValues(CountedPtr< uInt64 > knownNpts, CountedPtr< AccumType > knownMin, CountedPtr< AccumType > knownMax, uInt64 maxArraySize, const std::set< uInt64 > &dataIndices, Bool persistSortedArray, uInt64 nBins)
get the values for the specified indices in the sorted array of all good data
std::pair< Int64, Int64 > LocationType
std::map< uInt, DataRanges > _ranges
void setStatsToCalculate(std::set< StatisticsData::STATS > &stats)
Provide guidance to algorithms by specifying a priori which statistics the caller would like calculat...
Abstract base class which defines interface for providing "datasets" to the statistics framework when...
Class to calculate statistics in a "classical" sense, ie using accumulators with no special filtering...
void _computeBins(vector< vector< uInt64 > > &bins, vector< CountedPtr< AccumType > > &sameVal, vector< Bool > &allSame, DataIterator dataIter, MaskIterator maskIter, WeightsIterator weightsIter, uInt64 count, const vector< typename StatisticsUtilities< AccumType >::BinDesc > &binDesc, const vector< AccumType > &maxLimit)
virtual uInt64 getNPts()
scan the dataset(s) that have been added, and find the number of good points.
Hold and delete pointers not deleted by object destructors.
Definition: PtrHolder.h:81
virtual void getMinMax(AccumType &mymin, AccumType &mymax)
scan the dataset(s) that have been added, and find the min and max.
vector< vector< uInt64 > > _binCounts(vector< CountedPtr< AccumType > > &sameVal, const vector< typename StatisticsUtilities< AccumType >::BinDesc > &binDesc)
tally the number of data points that fall into each bin provided by binDesc Any points that are less ...
std::map< uInt, WeightsIterator > _weights
void _computeDataArray(vector< AccumType > &ary, DataIterator dataIter, MaskIterator maskIter, WeightsIterator weightsIter, uInt64 dataCount)
ALGORITHM
implemented algorithms
virtual const StatsData< AccumType > & _getStatsData() const
virtual void _populateArrays(vector< vector< AccumType > > &arys, uInt64 &currentCount, const DataIterator &dataBegin, Int64 nr, uInt dataStride, const vector< std::pair< AccumType, AccumType > > &includeLimits, uInt64 maxCount) const
Create a vector of unsorted arrays, one array for each bin defined by includeLimits.
void _addData()
Allows derived classes to do things after data is set or added.
Referenced counted pointer for constant data.
Definition: CountedPtr.h:86
virtual void _accumNpts(uInt64 &npts, const DataIterator &dataBegin, Int64 nr, uInt dataStride) const
scan through the data set to determine the number of good (unmasked, weight > 0, within range) points...
virtual AccumType getMedianAbsDevMed(CountedPtr< uInt64 > knownNpts=NULL, CountedPtr< AccumType > knownMin=NULL, CountedPtr< AccumType > knownMax=NULL, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt64 nBins=10000)
get the median of the absolute deviation about the median of the data.
std::map< uInt, Bool > _isIncludeRanges
Bool hasData() const
Has any data been added to this object? Will return False if the object has been reset and no data ha...
void _accumulate(StatsData< AccumType > &stats, const AccumType &datum, const LocationType &location)
virtual void _findBins(vector< vector< uInt64 > > &binCounts, vector< CountedPtr< AccumType > > &sameVal, vector< Bool > &allSame, const DataIterator &dataBegin, Int64 nr, uInt dataStride, const vector< typename StatisticsUtilities< AccumType >::BinDesc > &binDesc, const vector< AccumType > &maxLimit) const
Get the counts of data within the specified histogram bins.
static void _convertToAbsDevMedArray(vector< AccumType > &myArray, AccumType median)
convert in place by taking the absolute value of the difference of the vector and the median ...
void _computeDataArrays(vector< vector< AccumType > > &arys, uInt64 &currentCount, DataIterator dataIter, MaskIterator maskIter, WeightsIterator weightsIter, uInt64 dataCount, const vector< std::pair< AccumType, AccumType > > &includeLimits, uInt64 maxCount)
void _initThreadVars(uInt &nBlocks, uInt64 &extra, uInt &nthreads, PtrHolder< DataIterator > &dataIter, PtrHolder< MaskIterator > &maskIter, PtrHolder< WeightsIterator > &weightsIter, PtrHolder< uInt64 > &offset, uInt nThreadsMax) const
#define DataRanges
Commonly used types in statistics framework.
bool Bool
Define the standard types used by Casacore.
Definition: aipstype.h:42
virtual Bool _populateTestArray(vector< AccumType > &ary, const DataIterator &dataBegin, Int64 nr, uInt dataStride, uInt maxElements) const
no weights, no mask, no ranges
virtual void setCalculateAsAdded(Bool c)
Should statistics be updated with calls to addData or should they only be calculated upon calls to ge...
void setDataProvider(StatsDataProvider< AccumType, DataIterator, MaskIterator, WeightsIterator > *dataProvider)
An exception will be thrown if setCalculateAsAdded(True) has been called.
virtual AccumType getMedianAndQuantiles(std::map< Double, AccumType > &quantiles, const std::set< Double > &fractions, CountedPtr< uInt64 > knownNpts=NULL, CountedPtr< AccumType > knownMin=NULL, CountedPtr< AccumType > knownMax=NULL, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt64 nBins=10000)
If one needs to compute both the median and quantile values, it is better to call getMedianAndQuantil...
virtual std::map< Double, AccumType > getQuantiles(const std::set< Double > &fractions, CountedPtr< uInt64 > knownNpts=NULL, CountedPtr< AccumType > knownMin=NULL, CountedPtr< AccumType > knownMax=NULL, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt64 nBins=10000)
Get the specified quantiles.
std::map< uInt, MaskIterator > _masks
void _createDataArrays(vector< vector< AccumType > > &arrays, const vector< std::pair< AccumType, AccumType > > &includeLimits, uInt64 maxCount)
static void _makeBins(typename StatisticsUtilities< AccumType >::BinDesc &bins, AccumType minData, AccumType maxData, uInt maxBins, Bool allowPad)
If allowPad is True, then pad the lower side of the lowest bin and the higher side of the highest bin...
vector< uInt >::const_iterator _dsiter
const Bool False
Definition: aipstype.h:44
virtual void _unweightedStats(StatsData< AccumType > &stats, uInt64 &ngood, LocationType &location, const DataIterator &dataBegin, Int64 nr, uInt dataStride)
no weights, no mask, no ranges
StatsData< AccumType > _statsData
vector< std::map< uInt64, AccumType > > _dataFromSingleBins(const vector< uInt64 > &binNpts, uInt64 maxArraySize, const vector< std::pair< AccumType, AccumType > > &binLimits, const vector< std::set< uInt64 > > &dataIndices, uInt64 nBins)
Bool _increment(Bool includeIDataset)
increment the relevant loop counters
virtual void reset()
reset object to initial state.
virtual StatisticsData::ALGORITHM algorithm() const
get the algorithm that this object uses for computing stats
vector< Int64 >::const_iterator _citer
const Double c
Fundamental physical constants (SI units):
virtual AccumType getMedian(CountedPtr< uInt64 > knownNpts=NULL, CountedPtr< AccumType > knownMin=NULL, CountedPtr< AccumType > knownMax=NULL, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt64 nBins=10000)
In the following group of methods, if the size of the composite dataset is smaller than binningThresh...
static void _mergeResults(vector< vector< uInt64 > > &bins, vector< CountedPtr< AccumType > > &sameVal, vector< Bool > &allSame, const PtrHolder< vector< vector< uInt64 > > > &tBins, const PtrHolder< vector< CountedPtr< AccumType > > > &tSameVal, const PtrHolder< vector< Bool > > &tAllSame, uInt nThreadsMax)
virtual void _weightedStats(StatsData< AccumType > &stats, LocationType &location, const DataIterator &dataBegin, const WeightsIterator &weightsBegin, Int64 nr, uInt dataStride)
has weights, but no mask, no ranges
void _incrementThreadIters(DataIterator &dataIter, MaskIterator &maskIter, WeightsIterator &weightsIter, uInt64 &offset, uInt nthreads) const
increment thread-based iterators
Bool _isNptsSmallerThan(vector< AccumType > &arrayToSort, uInt maxArraySize)
Determine by scanning the dataset if the number of good points is smaller than maxArraySize.
virtual void _updateDataProviderMaxMin(const StatsData< AccumType > &threadStats)
virtual void _populateArray(vector< AccumType > &ary, const DataIterator &dataBegin, Int64 nr, uInt dataStride) const
populate an unsorted array with valid data.
virtual StatsData< AccumType > _getInitialStats() const
void _computeStats(StatsData< AccumType > &stats, uInt64 &ngood, LocationType &location, DataIterator dataIter, MaskIterator maskIter, WeightsIterator weightsIter, uInt64 count)
Base class of statistics algorithm class hierarchy.
this file contains all the compiler specific defines
Definition: mainpage.dox:28
unsigned int uInt
Definition: aipstype.h:51
description of a regularly spaced bins with the first bin having lower limit of minLimit and having n...