Package rdkit :: Package ML :: Package Cluster :: Module ClusterUtils
[hide private]
[frames] | no frames]

Source Code for Module rdkit.ML.Cluster.ClusterUtils

  1  # $Id$ 
  2  # 
  3  # Copyright (C) 2001-2008  greg Landrum 
  4  # 
  5  #   @@ All Rights Reserved @@ 
  6  #  This file is part of the RDKit. 
  7  #  The contents are covered by the terms of the BSD license 
  8  #  which is included in the file license.txt, found at the root 
  9  #  of the RDKit source tree. 
 10  # 
 11  """utility functions for clustering 
 12   
 13  """ 
 14   
15 -def GetNodeList(cluster):
16 """returns an ordered list of all nodes below cluster 17 18 the ordering is done using the lengths of the child nodes 19 20 **Arguments** 21 22 - cluster: the cluster in question 23 24 **Returns** 25 26 - a list of the leaves below this cluster 27 28 """ 29 if len(cluster) == 1: 30 return [cluster] 31 else: 32 children = cluster.GetChildren() 33 children.sort(key=lambda x:len(x), reverse=True) 34 res = [] 35 for child in children: 36 res += GetNodeList(child) 37 res += [cluster] 38 return res
39
40 -def GetNodesDownToCentroids(cluster,above=1):
41 """returns an ordered list of all nodes below cluster 42 43 44 """ 45 if hasattr(cluster,'_isCentroid'): 46 cluster._aboveCentroid = 0 47 above = -1 48 else: 49 cluster._aboveCentroid = above 50 if len(cluster) == 1: 51 return [cluster] 52 else: 53 res = [] 54 children = cluster.GetChildren() 55 children.sort(lambda x,y:cmp(len(y),len(x))) 56 for child in children: 57 res = res + GetNodesDownToCentroids(child,above) 58 res = res + [cluster] 59 return res
60
61 -def FindClusterCentroidFromDists(cluster,dists):
62 """ find the point in a cluster which has the smallest summed 63 Euclidean distance to all others 64 65 **Arguments** 66 67 - cluster: the cluster to work with 68 69 - dists: the distance matrix to use for the points 70 71 **Returns** 72 73 - the index of the centroid point 74 75 """ 76 children = cluster.GetPoints() 77 pts = [x.GetData() for x in children] 78 79 best = 1e24 80 bestIdx = -1 81 for pt in pts: 82 dAccum = 0.0 83 # loop over others and add'em up 84 for other in pts: 85 if other != pt: 86 if other > pt: row,col = pt,other 87 else: row,col = other,pt 88 dAccum += dists[col*(col-1)/2+row] 89 if dAccum >= best: 90 # minor efficiency hack 91 break 92 if dAccum < best: 93 best = dAccum 94 bestIdx = pt 95 for i in range(len(pts)): 96 pt = pts[i] 97 if pt != bestIdx: 98 if pt > bestIdx: row,col = bestIdx,pt 99 else: row,col = pt,bestIdx 100 children[i]._distToCenter = dists[col*(col-1)/2+row] 101 else: 102 children[i]._distToCenter = 0.0 103 children[i]._clustCenter = bestIdx 104 cluster._clustCenter=bestIdx 105 cluster._distToCenter=0.0 106 107 return bestIdx
108
109 -def _BreadthFirstSplit(cluster,n):
110 """ *Internal Use Only* 111 112 """ 113 if len(cluster)<n: 114 raise ValueError('Cannot split cluster of length %d into %d pieces'%(len(cluster),n)) 115 if len(cluster)==n: 116 return cluster.GetPoints() 117 clusters = [cluster] 118 nxtIdx = 0 119 for i in range(n-1): 120 while nxtIdx<len(clusters) and len(clusters[nxtIdx])==1: 121 nxtIdx+=1 122 assert nxtIdx<len(clusters) 123 124 children = clusters[nxtIdx].GetChildren() 125 children.sort(key=lambda x: x.GetMetric(), reverse=True) 126 for child in children: 127 clusters.append(child) 128 del clusters[nxtIdx] 129 return clusters
130
131 -def _HeightFirstSplit(cluster,n):
132 """ *Internal Use Only* 133 134 """ 135 if len(cluster)<n: 136 raise ValueError('Cannot split cluster of length %d into %d pieces'%(len(cluster),n)) 137 if len(cluster)==n: 138 return cluster.GetPoints() 139 clusters = [cluster] 140 for i in range(n-1): 141 nxtIdx = 0 142 while nxtIdx<len(clusters) and len(clusters[nxtIdx])==1: 143 nxtIdx+=1 144 assert nxtIdx<len(clusters) 145 146 children = clusters[nxtIdx].GetChildren() 147 for child in children: 148 clusters.append(child) 149 del clusters[nxtIdx] 150 clusters.sort(key=lambda x: x.GetMetric(), reverse=True) 151 return clusters
152 153
154 -def SplitIntoNClusters(cluster,n,breadthFirst=1):
155 """ splits a cluster tree into a set of branches 156 157 **Arguments** 158 159 - cluster: the root of the cluster tree 160 161 - n: the number of clusters to include in the split 162 163 - breadthFirst: toggles breadth first (vs depth first) cleavage 164 of the cluster tree. 165 166 **Returns** 167 168 - a list of sub clusters 169 170 """ 171 if breadthFirst: 172 return _BreadthFirstSplit(cluster,n) 173 else: 174 return _HeightFirstSplit(cluster,n)
175