1
2
3
4
5
6
7
8
9
10
11 """ utility functionality for clustering molecules using fingerprints
12 includes a command line app for clustering
13
14
15 Sample Usage:
16 python ClusterMols.py -d data.gdb -t daylight_sig \
17 --idName="CAS_TF" -o clust1.pkl \
18 --actTable="dop_test" --actName="moa_quant"
19
20 """
21 from rdkit.Dbase.DbConnection import DbConnect
22 from rdkit.Dbase import DbInfo,DbUtils
23 from rdkit.ML.Data import DataUtils
24 from rdkit.ML.Cluster import Clusters
25 from rdkit.ML.Cluster import Murtagh
26 import sys
27 from rdkit.six.moves import cPickle
28 from rdkit.Chem.Fingerprints import FingerprintMols,MolSimilarity
29 from rdkit import DataStructs
30 import numpy
# Revision bookkeeping: the version text is whatever sits between the first
# ':' and the last '$' of the version-control keyword string.
_cvsVersion = "$Id$"
idx1 = 1 + _cvsVersion.find(':')
idx2 = _cvsVersion.rfind('$')
__VERSION_STRING = "%s" % (_cvsVersion[idx1:idx2],)

# Module-level aliases for the FingerprintMols message/error helpers.
message = FingerprintMols.message
error = FingerprintMols.error
38
40 """ data should be a list of tuples with fingerprints in position 1
41 (the rest of the elements of the tuple are not important)
42
43 Returns the symmetric distance matrix
44 (see ML.Cluster.Resemblance for layout documentation)
45
46 """
47 nPts = len(data)
48 res = numpy.zeros((nPts*(nPts-1)/2),numpy.float)
49 nSoFar=0
50 for col in xrange(1,nPts):
51 for row in xrange(col):
52 fp1 = data[col][1]
53 fp2 = data[row][1]
54 if fp1.GetNumBits()>fp2.GetNumBits():
55 fp1 = DataStructs.FoldFingerprint(fp1,fp1.GetNumBits()/fp2.GetNumBits())
56 elif fp2.GetNumBits()>fp1.GetNumBits():
57 fp2 = DataStructs.FoldFingerprint(fp2,fp2.GetNumBits()/fp1.GetNumBits())
58 sim = metric(fp1,fp2)
59 if isSimilarity:
60 sim = 1.-sim
61 res[nSoFar] = sim
62 nSoFar += 1
63 return res
64
def ClusterPoints(data,metric,algorithmId,haveLabels=False,haveActs=True,returnDistances=False):
    """ builds a cluster tree for a set of points

      **Arguments**

        - data: a sequence of tuples.  Element 0 of each tuple is used as
          the point's id/label, element 1 is consumed by
          GetDistanceMatrix(), and element 2 (when present and _haveActs_
          is true) is converted to an integer activity value.

        - metric: passed through unchanged to GetDistanceMatrix()

        - algorithmId: passed through unchanged to Murtagh.ClusterData()

        - haveLabels: if true, element 0 of each tuple is used directly as
          the point name; otherwise the name is 'Mol: <id>'.

        - haveActs: if true and the first tuple has more than two elements,
          activities are read from element 2 of every tuple.

        - returnDistances: if true, the distance matrix is returned
          along with the tree.

      **Returns**

        the cluster tree, or a 2-tuple (cluster tree, distance matrix)
        when _returnDistances_ is true.

    """
    message('Generating distance matrix.\n')
    dMat = GetDistanceMatrix(data,metric)
    message('Clustering\n')
    clustTree = Murtagh.ClusterData(dMat,len(data),algorithmId,
                                    isDistData=1)[0]
    acts = []
    if haveActs and len(data[0])>2:
        # we've got activities... use them:
        acts = [int(x[2]) for x in data]

    if not haveLabels:
        labels = ['Mol: %s'%str(x[0]) for x in data]
    else:
        labels = [x[0] for x in data]
    clustTree._ptLabels = labels
    if acts:
        clustTree._ptValues = acts
    for pt in clustTree.GetPoints():
        # point indices are shifted down by one to index into data
        idx = pt.GetIndex()-1
        pt.SetName(labels[idx])
        if acts:
            try:
                pt.SetData(int(acts[idx]))
            except Exception:
                # attaching the activity is best-effort: a failure here must
                # not abort clustering, but KeyboardInterrupt/SystemExit
                # are no longer swallowed.
                pass
    if not returnDistances:
        return clustTree
    else:
        return clustTree,dMat
95
120
# Usage text for the command line app; installed on FingerprintMols
# (as FingerprintMols._usageDoc) when this module is run as a script.
_usageDoc="""
Usage: ClusterMols.py [args] <fName>

If <fName> is provided and no tableName is specified (see below),
data will be read from the text file <fName>. Text files delimited
with either commas (extension .csv) or tabs (extension .txt) are
supported.

Command line arguments are:

- -d _dbName_: set the name of the database from which
to pull input fingerprint information.

- -t _tableName_: set the name of the database table
from which to pull input fingerprint information

- --idName=val: sets the name of the id column in the input
database. Default is *ID*.

- -o _outFileName_: name of the output file (output will
be a pickle (.pkl) file with the cluster tree)

- --actTable=val: name of table containing activity values
(used to color points in the cluster tree).

- --actName=val: name of column with activities in the activity
table. The values in this column should either be integers or
convertible into integers.

- --SLINK: use the single-linkage clustering algorithm
(default is Ward's minimum variance)

- --CLINK: use the complete-linkage clustering algorithm
(default is Ward's minimum variance)

- --UPGMA: use the group-average clustering algorithm
(default is Ward's minimum variance)

- --dice: use the DICE similarity metric instead of Tanimoto

- --cosine: use the cosine similarity metric instead of Tanimoto

- --fpColName=val: name to use for the column which stores
fingerprints (in pickled format) in the input db table.
Default is *AutoFragmentFP*

- --minPath=val: minimum path length to be included in
fragment-based fingerprints. Default is *2*.

- --maxPath=val: maximum path length to be included in
fragment-based fingerprints. Default is *7*.

- --nBitsPerHash: number of bits to be set in the output
fingerprint for each fragment. Default is *4*.

- --discrim: use of path-based discriminators to hash bits.
Default is *false*.

- -V: include valence information in the fingerprints
Default is *false*.

- -H: include Hs in the fingerprint
Default is *false*.

- --useMACCS: use the public MACCS keys to do the fingerprinting
(instead of a daylight-type fingerprint)


"""
if __name__ == '__main__':
    # Announce the tool version first.
    banner = "This is ClusterMols version %s\n\n" % (__VERSION_STRING,)
    message(banner)
    # Install this module's usage text on FingerprintMols before it parses
    # the command line (presumably so its help output describes this tool --
    # confirm against FingerprintMols.ParseArgs).
    FingerprintMols._usageDoc = _usageDoc
    ClusterFromDetails(FingerprintMols.ParseArgs())
195