Package rdkit :: Package ML :: Package Cluster :: Module Butina
[hide private]
[frames] | no frames]

Source Code for Module rdkit.ML.Cluster.Butina

  1  # $Id$ 
  2  # 
  3  # Copyright (C) 2007-2008 Greg Landrum 
  4  #   All Rights Reserved 
  5  # 
  6  """ Implementation of the clustering algorithm published in: 
  7    Butina JCICS 39 747-750 (1999) 
  8   
  9  """ 
 10  import numpy 
 11  from rdkit import RDLogger 
 12  logger=RDLogger.logger() 
 13   
14 -def EuclideanDist(pi,pj):
15 dv = numpy.array(pi)- numpy.array(pj) 16 return numpy.sqrt(dv*dv)
17 18
19 -def ClusterData(data,nPts,distThresh,isDistData=False,distFunc=EuclideanDist,reordering=False):
20 """ clusters the data points passed in and returns the list of clusters 21 22 **Arguments** 23 24 - data: a list of items with the input data 25 (see discussion of _isDistData_ argument for the exception) 26 27 - nPts: the number of points to be used 28 29 - distThresh: elements within this range of each other are considered 30 to be neighbors 31 32 - isDistData: set this toggle when the data passed in is a 33 distance matrix. The distance matrix should be stored 34 symmetrically. An example of how to do this: 35 36 dists = [] 37 for i in range(nPts): 38 for j in range(i): 39 dists.append( distfunc(i,j) ) 40 41 - distFunc: a function to calculate distances between points. 42 Receives 2 points as arguments, should return a float 43 44 - reodering: if this toggle is set, the number of neighbors is updated 45 for the unassigned molecules after a new cluster is created such 46 that always the molecule with the largest number of unassigned 47 neighbors is selected as the next cluster center. 48 49 **Returns** 50 51 - a tuple of tuples containing information about the clusters: 52 ( (cluster1_elem1, cluster1_elem2, ...), 53 (cluster2_elem1, cluster2_elem2, ...), 54 ... 55 ) 56 The first element for each cluster is its centroid. 57 58 """ 59 if isDistData and len(data)>(nPts*(nPts-1)/2): 60 logger.warning("Distance matrix is too long") 61 nbrLists = [None]*nPts 62 for i in range(nPts): nbrLists[i] = [] 63 64 dmIdx=0 65 for i in range(nPts): 66 for j in range(i): 67 if not isDistData: 68 dij = distFunc(data[i],data[j]) 69 else: 70 dij = data[dmIdx] 71 dmIdx+=1 72 if dij<=distThresh: 73 nbrLists[i].append(j) 74 nbrLists[j].append(i) 75 #print nbrLists 76 # sort by the number of neighbors: 77 tLists = [(len(y),x) for x,y in enumerate(nbrLists)] 78 tLists.sort(reverse=True) 79 80 res = [] 81 seen = [0]*nPts 82 while tLists: 83 nNbrs,idx = tLists.pop(0) 84 if seen[idx]: 85 continue 86 tRes = [idx] 87 for nbr in nbrLists[idx]: 88 if not seen[nbr]: 89 tRes.append(nbr) 90 seen[nbr]=1 91 # update the number of neighbors: 92 # remove all members of the new cluster from the list of 93 # neighbors and reorder the tLists 94 if reordering: 95 # get the list of affected molecules, i.e. all molecules 96 # which have at least one of the members of the new cluster 97 # as a neighbor 98 nbrNbr = [nbrLists[t] for t in tRes] 99 nbrNbr = frozenset().union(*nbrNbr) 100 # loop over all remaining molecules in tLists but only 101 # consider unassigned and affected compounds 102 for x,y in enumerate(tLists): 103 y1 = y[1] 104 if seen[y1] or (y1 not in nbrNbr): continue 105 # update the number of neighbors 106 nbrLists[y1] = set(nbrLists[y1]).difference(tRes) 107 tLists[x] = (len(nbrLists[y1]), y1) 108 # now reorder the list 109 tLists.sort(reverse=True) 110 res.append(tuple(tRes)) 111 return tuple(res)
112