VTK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Public Types | Public Member Functions | Static Public Member Functions | Protected Member Functions | Protected Attributes | List of all members
vtkKMeansStatistics Class Reference

A class for KMeans clustering. More...

#include <vtkKMeansStatistics.h>

Inheritance diagram for vtkKMeansStatistics:
[legend]
Collaboration diagram for vtkKMeansStatistics:
[legend]

Public Types

typedef vtkStatisticsAlgorithm Superclass
 
- Public Types inherited from vtkStatisticsAlgorithm
typedef vtkTableAlgorithm Superclass
 
enum  InputPorts { INPUT_DATA = 0, LEARN_PARAMETERS = 1, INPUT_MODEL = 2 }
 
enum  OutputIndices { OUTPUT_DATA = 0, OUTPUT_MODEL = 1, ASSESSMENT = 2, OUTPUT_TEST = 2 }
 

Public Member Functions

virtual int IsA (const char *type)
 
vtkKMeansStatistics * NewInstance () const
 
virtual void PrintSelf (ostream &os, vtkIndent indent)
 
virtual void SetDistanceFunctor (vtkKMeansDistanceFunctor *)
 
virtual vtkKMeansDistanceFunctor * GetDistanceFunctor ()
 
virtual void SetDefaultNumberOfClusters (int)
 
virtual int GetDefaultNumberOfClusters ()
 
virtual void SetKValuesArrayName (const char *)
 
virtual char * GetKValuesArrayName ()
 
virtual void SetMaxNumIterations (int)
 
virtual int GetMaxNumIterations ()
 
virtual void SetTolerance (double)
 
virtual double GetTolerance ()
 
virtual void Aggregate (vtkDataObjectCollection *, vtkMultiBlockDataSet *)
 
virtual bool SetParameter (const char *parameter, int index, vtkVariant value)
 
- Public Member Functions inherited from vtkStatisticsAlgorithm
vtkStatisticsAlgorithm * NewInstance () const
 
void PrintSelf (ostream &os, vtkIndent indent)
 
virtual void SetColumnStatus (const char *namCol, int status)
 
virtual void ResetAllColumnStates ()
 
virtual int RequestSelectedColumns ()
 
virtual void ResetRequests ()
 
virtual vtkIdType GetNumberOfRequests ()
 
virtual vtkIdType GetNumberOfColumnsForRequest (vtkIdType request)
 
void AddColumn (const char *namCol)
 
void AddColumnPair (const char *namColX, const char *namColY)
 
virtual void SetLearnOptionParameterConnection (vtkAlgorithmOutput *params)
 
virtual void SetLearnOptionParameters (vtkDataObject *params)
 
virtual void SetInputModelConnection (vtkAlgorithmOutput *model)
 
virtual void SetInputModel (vtkDataObject *model)
 
virtual void SetLearnOption (bool)
 
virtual bool GetLearnOption ()
 
virtual void SetDeriveOption (bool)
 
virtual bool GetDeriveOption ()
 
virtual void SetAssessOption (bool)
 
virtual bool GetAssessOption ()
 
virtual void SetTestOption (bool)
 
virtual bool GetTestOption ()
 
virtual void SetNumberOfPrimaryTables (vtkIdType)
 
virtual vtkIdType GetNumberOfPrimaryTables ()
 
virtual void SetAssessNames (vtkStringArray *)
 
virtual vtkStringArray * GetAssessNames ()
 
virtual const char * GetColumnForRequest (vtkIdType r, vtkIdType c)
 
virtual int GetColumnForRequest (vtkIdType r, vtkIdType c, vtkStdString &columnName)
 

Static Public Member Functions

static int IsTypeOf (const char *type)
 
static vtkKMeansStatistics * SafeDownCast (vtkObjectBase *o)
 
static vtkKMeansStatistics * New ()
 
- Static Public Member Functions inherited from vtkStatisticsAlgorithm
static int IsTypeOf (const char *type)
 
static vtkStatisticsAlgorithm * SafeDownCast (vtkObjectBase *o)
 

Protected Member Functions

virtual vtkObjectBase * NewInstanceInternal () const
 
 vtkKMeansStatistics ()
 
 ~vtkKMeansStatistics ()
 
virtual void Derive (vtkMultiBlockDataSet *)
 
virtual vtkIdType GetTotalNumberOfObservations (vtkIdType numObservations)
 
virtual void Learn (vtkTable *, vtkTable *, vtkMultiBlockDataSet *)
 
virtual void Assess (vtkTable *, vtkMultiBlockDataSet *, vtkTable *)
 
virtual void Test (vtkTable *, vtkMultiBlockDataSet *, vtkTable *)
 
virtual void SelectAssessFunctor (vtkTable *inData, vtkDataObject *inMeta, vtkStringArray *rowNames, AssessFunctor *&dfunc)
 
virtual void UpdateClusterCenters (vtkTable *newClusterElements, vtkTable *curClusterElements, vtkIdTypeArray *numMembershipChanges, vtkIdTypeArray *numElementsInCluster, vtkDoubleArray *error, vtkIdTypeArray *startRunID, vtkIdTypeArray *endRunID, vtkIntArray *computeRun)
 
int InitializeDataAndClusterCenters (vtkTable *inParameters, vtkTable *inData, vtkTable *dataElements, vtkIdTypeArray *numberOfClusters, vtkTable *curClusterElements, vtkTable *newClusterElements, vtkIdTypeArray *startRunID, vtkIdTypeArray *endRunID)
 
virtual void CreateInitialClusterCenters (vtkIdType numToAllocate, vtkIdTypeArray *numberOfClusters, vtkTable *inData, vtkTable *curClusterElements, vtkTable *newClusterElements)
 
- Protected Member Functions inherited from vtkStatisticsAlgorithm
 vtkStatisticsAlgorithm ()
 
 ~vtkStatisticsAlgorithm ()
 
virtual int FillInputPortInformation (int port, vtkInformation *info)
 
virtual int FillOutputPortInformation (int port, vtkInformation *info)
 
virtual int RequestData (vtkInformation *, vtkInformationVector **, vtkInformationVector *)
 
void Assess (vtkTable *, vtkMultiBlockDataSet *, vtkTable *, int)
 

Protected Attributes

int DefaultNumberOfClusters
 
char * KValuesArrayName
 
int MaxNumIterations
 
double Tolerance
 
vtkKMeansDistanceFunctor * DistanceFunctor
 
- Protected Attributes inherited from vtkStatisticsAlgorithm
int NumberOfPrimaryTables
 
bool LearnOption
 
bool DeriveOption
 
bool AssessOption
 
bool TestOption
 
vtkStringArray * AssessNames
 
vtkStatisticsAlgorithmPrivate * Internals
 

Detailed Description

A class for KMeans clustering.

This class takes as input an optional vtkTable on port LEARN_PARAMETERS specifying initial set(s) of cluster values of the following form:

          K     | Col1            |  ...    | ColN
     -----------+-----------------+---------+---------------
          M     |clustCoord(1, 1) |  ...    | clustCoord(1, N)
          M     |clustCoord(2, 1) |  ...    | clustCoord(2, N)
          .     |       .         |   .     |        .
          .     |       .         |   .     |        .
          .     |       .         |   .     |        .
          M     |clustCoord(M, 1) |  ...    | clustCoord(M, N)
          L     |clustCoord(1, 1) |  ...    | clustCoord(1, N)
          L     |clustCoord(2, 1) |  ...    | clustCoord(2, N)
          .     |       .         |   .     |        .
          .     |       .         |   .     |        .
          .     |       .         |   .     |        .
          L     |clustCoord(L, 1) |  ...    | clustCoord(L, N)

Because the desired value of K is often not known in advance and the results of the algorithm are dependent on the initial cluster centers, we provide a mechanism for the user to test multiple runs or sets of cluster centers within a single call to the Learn phase. The first column of the table identifies the number of clusters K in the particular run (the entries in this column should be of type vtkIdType), while the remaining columns are a subset of the columns contained in the table on port INPUT_DATA. We require that all user specified clusters be of the same dimension N and consequently, that the LEARN_PARAMETERS table have N+1 columns. Due to this restriction, only one request can be processed for each call to the Learn phase and subsequent requests are silently ignored. Note that, if the first column of the LEARN_PARAMETERS table is not of type vtkIdType, then the table will be ignored and a single run will be performed using the first DefaultNumberOfClusters input data observations as initial cluster centers.

When the user does not supply an initial set of clusters, then the first DefaultNumberOfClusters input data observations are used as initial cluster centers and a single run is performed.

This class provides the following functionalities, depending on the operation in which it is executed. Learn: Calculates new cluster centers for each run. The output metadata on port OUTPUT_MODEL is a multiblock dataset containing, at a minimum, one vtkTable with columns specifying the following for each run: the run ID, the number of clusters, the number of iterations required for convergence, the total error associated with the cluster (sum of squared Euclidean distances from each observation to its nearest cluster center), the cardinality of the cluster, and the new cluster coordinates.

Derive: An additional vtkTable is stored in the multiblock dataset output on port OUTPUT_MODEL. This table contains columns that store for each run: the runID, number of clusters, total error for all clusters in the run, local rank, and global rank. The local rank is computed by comparing squared Euclidean errors of all runs with the same number of clusters. The global rank is computed analogously across all runs.

Assess: This requires a multiblock dataset (as computed from Learn and Derive) on input port INPUT_MODEL and tabular data on input port INPUT_DATA that contains column names matching those of the tables on input port INPUT_MODEL. The assess mode reports the closest cluster center and associated squared Euclidean distance of each observation in port INPUT_DATA's table to the cluster centers for each run in the multiblock dataset provided on port INPUT_MODEL.

The code can handle a wide variety of data types as it operates on vtkAbstractArrays and is not limited to vtkDataArrays. A default distance functor that computes the sum of the squares of the Euclidean distance between two objects is provided (vtkKMeansDistanceFunctor). The default distance functor can be overridden to use alternative distance metrics.

Thanks:
Thanks to Janine Bennett, David Thompson, and Philippe Pebay of Sandia National Laboratories for implementing this class. Updated by Philippe Pebay, Kitware SAS 2012
Examples:
vtkKMeansStatistics (Examples)
Tests:
vtkKMeansStatistics (Tests)

Definition at line 113 of file vtkKMeansStatistics.h.

Member Typedef Documentation

typedef vtkStatisticsAlgorithm vtkKMeansStatistics::Superclass

Definition at line 116 of file vtkKMeansStatistics.h.

Constructor & Destructor Documentation

vtkKMeansStatistics::vtkKMeansStatistics ( )
protected
vtkKMeansStatistics::~vtkKMeansStatistics ( )
protected

Member Function Documentation

static int vtkKMeansStatistics::IsTypeOf ( const char *  type)
static
virtual int vtkKMeansStatistics::IsA ( const char *  type)
virtual

Reimplemented from vtkStatisticsAlgorithm.

Reimplemented in vtkPKMeansStatistics.

static vtkKMeansStatistics* vtkKMeansStatistics::SafeDownCast ( vtkObjectBase *  o)
static
virtual vtkObjectBase* vtkKMeansStatistics::NewInstanceInternal ( ) const
protected virtual

Reimplemented from vtkStatisticsAlgorithm.

Reimplemented in vtkPKMeansStatistics.

vtkKMeansStatistics* vtkKMeansStatistics::NewInstance ( ) const
virtual void vtkKMeansStatistics::PrintSelf ( ostream &  os,
vtkIndent  indent 
)
virtual

Reimplemented in vtkPKMeansStatistics.

static vtkKMeansStatistics* vtkKMeansStatistics::New ( )
static
virtual void vtkKMeansStatistics::SetDistanceFunctor ( vtkKMeansDistanceFunctor *  )
virtual

Set the DistanceFunctor.

virtual vtkKMeansDistanceFunctor* vtkKMeansStatistics::GetDistanceFunctor ( )
virtual

Set/get the DistanceFunctor.

virtual void vtkKMeansStatistics::SetDefaultNumberOfClusters ( int  )
virtual

Set/get the DefaultNumberOfClusters, used when no initial cluster coordinates are specified.

virtual int vtkKMeansStatistics::GetDefaultNumberOfClusters ( )
virtual

Set/get the DefaultNumberOfClusters, used when no initial cluster coordinates are specified.

virtual void vtkKMeansStatistics::SetKValuesArrayName ( const char *  )
virtual

Set/get the KValuesArrayName.

virtual char* vtkKMeansStatistics::GetKValuesArrayName ( )
virtual

Set/get the KValuesArrayName.

virtual void vtkKMeansStatistics::SetMaxNumIterations ( int  )
virtual

Set/get the MaxNumIterations used to terminate iterations on cluster center coordinates when the relative tolerance cannot be met.

virtual int vtkKMeansStatistics::GetMaxNumIterations ( )
virtual

Set/get the MaxNumIterations used to terminate iterations on cluster center coordinates when the relative tolerance cannot be met.

virtual void vtkKMeansStatistics::SetTolerance ( double  )
virtual

Set/get the relative Tolerance used to terminate iterations on cluster center coordinates.

virtual double vtkKMeansStatistics::GetTolerance ( )
virtual

Set/get the relative Tolerance used to terminate iterations on cluster center coordinates.

virtual void vtkKMeansStatistics::Aggregate ( vtkDataObjectCollection *  ,
vtkMultiBlockDataSet *   
)
inline virtual

Given a collection of models, calculate aggregate model NB: not implemented

Implements vtkStatisticsAlgorithm.

Definition at line 156 of file vtkKMeansStatistics.h.

virtual bool vtkKMeansStatistics::SetParameter ( const char *  parameter,
int  index,
vtkVariant  value 
)
virtual

A convenience method for setting properties by name.

Reimplemented from vtkStatisticsAlgorithm.

virtual void vtkKMeansStatistics::Learn ( vtkTable *  ,
vtkTable *  ,
vtkMultiBlockDataSet *   
)
protected virtual

Execute the calculations required by the Learn option.

Implements vtkStatisticsAlgorithm.

virtual void vtkKMeansStatistics::Derive ( vtkMultiBlockDataSet *  )
protected virtual

Execute the calculations required by the Derive option.

Implements vtkStatisticsAlgorithm.

virtual void vtkKMeansStatistics::Assess ( vtkTable *  ,
vtkMultiBlockDataSet *  ,
vtkTable *   
)
protected virtual

Execute the calculations required by the Assess option.

Implements vtkStatisticsAlgorithm.

virtual void vtkKMeansStatistics::Test ( vtkTable *  ,
vtkMultiBlockDataSet *  ,
vtkTable *   
)
inline protected virtual

Execute the calculations required by the Test option.

Implements vtkStatisticsAlgorithm.

Definition at line 191 of file vtkKMeansStatistics.h.

virtual void vtkKMeansStatistics::SelectAssessFunctor ( vtkTable *  inData,
vtkDataObject *  inMeta,
vtkStringArray *  rowNames,
AssessFunctor *&  dfunc 
)
protected virtual

Provide the appropriate assessment functor.

Implements vtkStatisticsAlgorithm.

virtual void vtkKMeansStatistics::UpdateClusterCenters ( vtkTable *  newClusterElements,
vtkTable *  curClusterElements,
vtkIdTypeArray *  numMembershipChanges,
vtkIdTypeArray *  numElementsInCluster,
vtkDoubleArray *  error,
vtkIdTypeArray *  startRunID,
vtkIdTypeArray *  endRunID,
vtkIntArray *  computeRun 
)
protected virtual

Subroutine to update new cluster centers from the old centers. Called from within Learn (and will be overridden by vtkPKMeansStatistics to handle distributed datasets).

Reimplemented in vtkPKMeansStatistics.

virtual vtkIdType vtkKMeansStatistics::GetTotalNumberOfObservations ( vtkIdType  numObservations)
protected virtual

Subroutine to get the total number of observations. Called from within Learn (and will be overridden by vtkPKMeansStatistics to handle distributed datasets).

Reimplemented in vtkPKMeansStatistics.

int vtkKMeansStatistics::InitializeDataAndClusterCenters ( vtkTable *  inParameters,
vtkTable *  inData,
vtkTable *  dataElements,
vtkIdTypeArray *  numberOfClusters,
vtkTable *  curClusterElements,
vtkTable *  newClusterElements,
vtkIdTypeArray *  startRunID,
vtkIdTypeArray *  endRunID 
)
protected

Subroutine to initialize the cluster centers using those provided by the user in input port LEARN_PARAMETERS. If no cluster centers are provided, the subroutine uses the first DefaultNumberOfClusters input data points as initial cluster centers. Called from within Learn.

virtual void vtkKMeansStatistics::CreateInitialClusterCenters ( vtkIdType  numToAllocate,
vtkIdTypeArray *  numberOfClusters,
vtkTable *  inData,
vtkTable *  curClusterElements,
vtkTable *  newClusterElements 
)
protected virtual

Subroutine to initialize cluster centers if not provided by the user. Called from within Learn (and will be overridden by vtkPKMeansStatistics to handle distributed datasets).

Reimplemented in vtkPKMeansStatistics.

Member Data Documentation

int vtkKMeansStatistics::DefaultNumberOfClusters
protected

This is the default number of clusters used when the user does not provide initial cluster centers.

Definition at line 253 of file vtkKMeansStatistics.h.

char* vtkKMeansStatistics::KValuesArrayName
protected

This is the name of the column that specifies the number of clusters in each run. This is only used if the user has not specified initial clusters.

Definition at line 257 of file vtkKMeansStatistics.h.

int vtkKMeansStatistics::MaxNumIterations
protected

This is the maximum number of iterations allowed if the new cluster centers have not yet converged.

Definition at line 260 of file vtkKMeansStatistics.h.

double vtkKMeansStatistics::Tolerance
protected

This is the percentage of data elements that swap cluster IDs.

Definition at line 262 of file vtkKMeansStatistics.h.

vtkKMeansDistanceFunctor* vtkKMeansStatistics::DistanceFunctor
protected

This is the Distance functor. The default is Euclidean distance, however this can be overridden.

Definition at line 265 of file vtkKMeansStatistics.h.


The documentation for this class was generated from the following file: