1
2
3
4
5
6 """ Implementation of the clustering algorithm published in:
7 Butina JCICS 39 747-750 (1999)
8
9 """
10 import numpy
11 from rdkit import RDLogger
12 logger=RDLogger.logger()
13
15 dv = numpy.array(pi)- numpy.array(pj)
16 return numpy.sqrt(dv*dv)
17
18
20 """ clusters the data points passed in and returns the list of clusters
21
22 **Arguments**
23
24 - data: a list of items with the input data
25 (see discussion of _isDistData_ argument for the exception)
26
27 - nPts: the number of points to be used
28
29 - distThresh: elements within this range of each other are considered
30 to be neighbors
31
32 - isDistData: set this toggle when the data passed in is a
33 distance matrix. The distance matrix should be stored
34 symmetrically. An example of how to do this:
35
36 dists = []
37 for i in range(nPts):
38 for j in range(i):
39 dists.append( distfunc(i,j) )
40
41 - distFunc: a function to calculate distances between points.
42 Receives 2 points as arguments, should return a float
43
44 - reodering: if this toggle is set, the number of neighbors is updated
45 for the unassigned molecules after a new cluster is created such
46 that always the molecule with the largest number of unassigned
47 neighbors is selected as the next cluster center.
48
49 **Returns**
50
51 - a tuple of tuples containing information about the clusters:
52 ( (cluster1_elem1, cluster1_elem2, ...),
53 (cluster2_elem1, cluster2_elem2, ...),
54 ...
55 )
56 The first element for each cluster is its centroid.
57
58 """
59 if isDistData and len(data)>(nPts*(nPts-1)/2):
60 logger.warning("Distance matrix is too long")
61 nbrLists = [None]*nPts
62 for i in range(nPts): nbrLists[i] = []
63
64 dmIdx=0
65 for i in range(nPts):
66 for j in range(i):
67 if not isDistData:
68 dij = distFunc(data[i],data[j])
69 else:
70 dij = data[dmIdx]
71 dmIdx+=1
72 if dij<=distThresh:
73 nbrLists[i].append(j)
74 nbrLists[j].append(i)
75
76
77 tLists = [(len(y),x) for x,y in enumerate(nbrLists)]
78 tLists.sort(reverse=True)
79
80 res = []
81 seen = [0]*nPts
82 while tLists:
83 nNbrs,idx = tLists.pop(0)
84 if seen[idx]:
85 continue
86 tRes = [idx]
87 for nbr in nbrLists[idx]:
88 if not seen[nbr]:
89 tRes.append(nbr)
90 seen[nbr]=1
91
92
93
94 if reordering:
95
96
97
98 nbrNbr = [nbrLists[t] for t in tRes]
99 nbrNbr = frozenset().union(*nbrNbr)
100
101
102 for x,y in enumerate(tLists):
103 y1 = y[1]
104 if seen[y1] or (y1 not in nbrNbr): continue
105
106 nbrLists[y1] = set(nbrLists[y1]).difference(tRes)
107 tLists[x] = (len(nbrLists[y1]), y1)
108
109 tLists.sort(reverse=True)
110 res.append(tuple(tRes))
111 return tuple(res)
112