1
2
3
4
5
6
7
8
9
10
""" SMARTS definitions for the publicly available MACCS keys
12 and a MACCS fingerprinter
13
14 I compared the MACCS fingerprints generated here with those from two
15 other packages (not MDL, unfortunately). Of course there are
16 disagreements between the various fingerprints still, but I think
17 these definitions work pretty well. Some notes:
18
19 1) most of the differences have to do with aromaticity
20 2) there's a discrepancy sometimes because the current RDKit
21 definitions do not require multiple matches to be distinct. e.g. the
22 SMILES C(=O)CC(=O) can match the (hypothetical) key O=CC twice in my
23 definition. It's not clear to me what the correct behavior is.
24 3) Some keys are not fully defined in the MDL documentation
25 4) Two keys, 125 and 166, have to be done outside of SMARTS.
26 5) Key 1 (ISOTOPE) isn't defined
27
28 Rev history:
29 2006 (gl): Original open-source release
30 May 2011 (gl): Update some definitions based on feedback from Andrew Dalke
31
32 """
33 from __future__ import print_function
34 from rdkit import Chem
35 from rdkit.Chem import rdMolDescriptors
36 from rdkit import DataStructs
37
# MACCS key definitions: key number -> (SMARTS pattern, count).
#
# The fingerprinter in this module reads each entry as follows:
#   - count == 0: the bit is set when the pattern matches at least once
#   - count  > 0: the bit is set when the pattern matches more than
#                 `count` times
#   - pattern '?': no SMARTS query is built for the key; keys 125 and 166
#                  are computed in code, key 1 (ISOTOPE) is never set
smartsPatts = {
    1: ('?', 0),  # ISOTOPE: not defined

    2: ('[#104]', 0),
    3: ('[#32,#33,#34,#50,#51,#52,#82,#83,#84]', 0),
    4: ('[Ac,Th,Pa,U,Np,Pu,Am,Cm,Bk,Cf,Es,Fm,Md,No,Lr]', 0),
    5: ('[Sc,Ti,Y,Zr,Hf]', 0),
    6: ('[La,Ce,Pr,Nd,Pm,Sm,Eu,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu]', 0),
    7: ('[V,Cr,Mn,Nb,Mo,Tc,Ta,W,Re]', 0),
    8: ('[!#6;!#1]1~*~*~*~1', 0),
    9: ('[Fe,Co,Ni,Ru,Rh,Pd,Os,Ir,Pt]', 0),
    10: ('[Be,Mg,Ca,Sr,Ba,Ra]', 0),
    11: ('*1~*~*~*~1', 0),
    12: ('[Cu,Zn,Ag,Cd,Au,Hg]', 0),
    13: ('[#8]~[#7](~[#6])~[#6]', 0),
    14: ('[#16]-[#16]', 0),
    15: ('[#8]~[#6](~[#8])~[#8]', 0),
    16: ('[!#6;!#1]1~*~*~1', 0),
    17: ('[#6]#[#6]', 0),
    18: ('[#5,#13,#31,#49,#81]', 0),
    19: ('*1~*~*~*~*~*~*~1', 0),
    20: ('[#14]', 0),
    21: ('[#6]=[#6](~[!#6;!#1])~[!#6;!#1]', 0),
    22: ('*1~*~*~1', 0),
    23: ('[#7]~[#6](~[#8])~[#8]', 0),
    24: ('[#7]-[#8]', 0),
    25: ('[#7]~[#6](~[#7])~[#7]', 0),
    26: ('[#6]=;@[#6](@*)@*', 0),
    27: ('[I]', 0),
    28: ('[!#6;!#1]~[CH2]~[!#6;!#1]', 0),
    29: ('[#15]', 0),
    30: ('[#6]~[!#6;!#1](~[#6])(~[#6])~*', 0),
    31: ('[!#6;!#1]~[F,Cl,Br,I]', 0),
    32: ('[#6]~[#16]~[#7]', 0),
    33: ('[#7]~[#16]', 0),
    34: ('[CH2]=*', 0),
    35: ('[Li,Na,K,Rb,Cs,Fr]', 0),
    36: ('[#16R]', 0),
    37: ('[#7]~[#6](~[#8])~[#7]', 0),
    38: ('[#7]~[#6](~[#6])~[#7]', 0),
    39: ('[#8]~[#16](~[#8])~[#8]', 0),
    40: ('[#16]-[#8]', 0),
    41: ('[#6]#[#7]', 0),
    42: ('F', 0),
    43: ('[!#6;!#1;!H0]~*~[!#6;!#1;!H0]', 0),
    44: ('[!#1;!#6;!#7;!#8;!#9;!#14;!#15;!#16;!#17;!#35;!#53]', 0),
    45: ('[#6]=[#6]~[#7]', 0),
    46: ('Br', 0),
    47: ('[#16]~*~[#7]', 0),
    48: ('[#8]~[!#6;!#1](~[#8])(~[#8])', 0),
    49: ('[!+0]', 0),
    50: ('[#6]=[#6](~[#6])~[#6]', 0),
    51: ('[#6]~[#16]~[#8]', 0),
    52: ('[#7]~[#7]', 0),
    53: ('[!#6;!#1;!H0]~*~*~*~[!#6;!#1;!H0]', 0),
    54: ('[!#6;!#1;!H0]~*~*~[!#6;!#1;!H0]', 0),
    55: ('[#8]~[#16]~[#8]', 0),
    56: ('[#8]~[#7](~[#8])~[#6]', 0),
    57: ('[#8R]', 0),
    58: ('[!#6;!#1]~[#16]~[!#6;!#1]', 0),
    59: ('[#16]!:*:*', 0),
    60: ('[#16]=[#8]', 0),
    61: ('*~[#16](~*)~*', 0),
    62: ('*@*!@*@*', 0),
    63: ('[#7]=[#8]', 0),
    64: ('*@*!@[#16]', 0),
    65: ('c:n', 0),
    66: ('[#6]~[#6](~[#6])(~[#6])~*', 0),
    67: ('[!#6;!#1]~[#16]', 0),
    68: ('[!#6;!#1;!H0]~[!#6;!#1;!H0]', 0),
    69: ('[!#6;!#1]~[!#6;!#1;!H0]', 0),
    70: ('[!#6;!#1]~[#7]~[!#6;!#1]', 0),
    71: ('[#7]~[#8]', 0),
    72: ('[#8]~*~*~[#8]', 0),
    73: ('[#16]=*', 0),
    74: ('[CH3]~*~[CH3]', 0),
    75: ('*!@[#7]@*', 0),
    76: ('[#6]=[#6](~*)~*', 0),
    77: ('[#7]~*~[#7]', 0),
    78: ('[#6]=[#7]', 0),
    79: ('[#7]~*~*~[#7]', 0),
    80: ('[#7]~*~*~*~[#7]', 0),
    81: ('[#16]~*(~*)~*', 0),
    82: ('*~[CH2]~[!#6;!#1;!H0]', 0),
    83: ('[!#6;!#1]1~*~*~*~*~1', 0),
    84: ('[NH2]', 0),
    85: ('[#6]~[#7](~[#6])~[#6]', 0),
    86: ('[C;H2,H3][!#6;!#1][C;H2,H3]', 0),
    87: ('[F,Cl,Br,I]!@*@*', 0),
    88: ('[#16]', 0),
    89: ('[#8]~*~*~*~[#8]', 0),
    90: ('[$([!#6;!#1;!H0]~*~*~[CH2]~*),'
         '$([!#6;!#1;!H0;R]1@[R]@[R]@[CH2;R]1),'
         '$([!#6;!#1;!H0]~[R]1@[R]@[CH2;R]1)]', 0),
    91: ('[$([!#6;!#1;!H0]~*~*~*~[CH2]~*),'
         '$([!#6;!#1;!H0;R]1@[R]@[R]@[R]@[CH2;R]1),'
         '$([!#6;!#1;!H0]~[R]1@[R]@[R]@[CH2;R]1),'
         '$([!#6;!#1;!H0]~*~[R]1@[R]@[CH2;R]1)]', 0),
    92: ('[#8]~[#6](~[#7])~[#6]', 0),
    93: ('[!#6;!#1]~[CH3]', 0),
    94: ('[!#6;!#1]~[#7]', 0),
    95: ('[#7]~*~*~[#8]', 0),
    96: ('*1~*~*~*~*~1', 0),
    97: ('[#7]~*~*~*~[#8]', 0),
    98: ('[!#6;!#1]1~*~*~*~*~*~1', 0),
    99: ('[#6]=[#6]', 0),
    100: ('*~[CH2]~[#7]', 0),
    # one alternative per ring size, from 8 up to 14 ring atoms
    101: ('[$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),'
          '$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),'
          '$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),'
          '$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),'
          '$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),'
          '$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),'
          '$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1)]', 0),
    102: ('[!#6;!#1]~[#8]', 0),
    103: ('Cl', 0),
    104: ('[!#6;!#1;!H0]~*~[CH2]~*', 0),
    105: ('*@*(@*)@*', 0),
    106: ('[!#6;!#1]~*(~[!#6;!#1])~[!#6;!#1]', 0),
    107: ('[F,Cl,Br,I]~*(~*)~*', 0),
    108: ('[CH3]~*~*~*~[CH2]~*', 0),
    109: ('*~[CH2]~[#8]', 0),
    110: ('[#7]~[#6]~[#8]', 0),
    111: ('[#7]~*~[CH2]~*', 0),
    112: ('*~*(~*)(~*)~*', 0),
    113: ('[#8]!:*:*', 0),
    114: ('[CH3]~[CH2]~*', 0),
    115: ('[CH3]~*~[CH2]~*', 0),
    116: ('[$([CH3]~*~*~[CH2]~*),$([CH3]~*1~*~[CH2]1)]', 0),
    117: ('[#7]~*~[#8]', 0),
    118: ('[$(*~[CH2]~[CH2]~*),$(*1~[CH2]~[CH2]1)]', 1),
    119: ('[#7]=*', 0),
    120: ('[!#6;R]', 1),
    121: ('[#7;R]', 0),
    122: ('*~[#7](~*)~*', 0),
    123: ('[#8]~[#6]~[#8]', 0),
    124: ('[!#6;!#1]~[!#6;!#1]', 0),
    125: ('?', 0),  # computed in code from the aromatic ring count
    126: ('*!@[#8]!@*', 0),
    127: ('*@*!@[#8]', 1),
    128: ('[$(*~[CH2]~*~*~*~[CH2]~*),'
          '$([R]1@[CH2;R]@[R]@[R]@[R]@[CH2;R]1),'
          '$(*~[CH2]~[R]1@[R]@[R]@[CH2;R]1),'
          '$(*~[CH2]~*~[R]1@[R]@[CH2;R]1)]', 0),
    129: ('[$(*~[CH2]~*~*~[CH2]~*),'
          '$([R]1@[CH2]@[R]@[R]@[CH2;R]1),'
          '$(*~[CH2]~[R]1@[R]@[CH2;R]1)]', 0),
    130: ('[!#6;!#1]~[!#6;!#1]', 1),
    131: ('[!#6;!#1;!H0]', 1),
    132: ('[#8]~*~[CH2]~*', 0),
    133: ('*@*!@[#7]', 0),
    134: ('[F,Cl,Br,I]', 0),
    135: ('[#7]!:*:*', 0),
    136: ('[#8]=*', 1),
    137: ('[!C;!c;R]', 0),
    138: ('[!#6;!#1]~[CH2]~*', 1),
    139: ('[O;!H0]', 0),
    140: ('[#8]', 3),
    141: ('[CH3]', 2),
    142: ('[#7]', 1),
    143: ('*@*!@[#8]', 0),
    144: ('*!:*:*!:*', 0),
    145: ('*1~*~*~*~*~*~1', 1),
    146: ('[#8]', 2),
    147: ('[$(*~[CH2]~[CH2]~*),$([R]1@[CH2;R]@[CH2;R]1)]', 0),
    148: ('*~[!#6;!#1](~*)~*', 0),
    149: ('[C;H3,H4]', 1),
    150: ('*!@*@*!@*', 0),
    151: ('[#7;!H0]', 0),
    152: ('[#8]~[#6](~[#6])~[#6]', 0),
    153: ('[!#6;!#1]~[CH2]~*', 0),
    154: ('[#6]=[#8]', 0),
    155: ('*!@[CH2]!@*', 0),
    156: ('[#7]~*(~*)~*', 0),
    157: ('[#6]-[#8]', 0),
    158: ('[#6]-[#7]', 0),
    159: ('[#8]', 1),
    160: ('[C;H3,H4]', 0),
    161: ('[#7]', 0),
    162: ('a', 0),
    163: ('*1~*~*~*~*~*~1', 0),
    164: ('[#8]', 0),
    165: ('[R]', 0),
    166: ('?', 0),  # computed in code from the fragment count
}

# Cache of (compiled query, count) pairs; filled on first fingerprint request.
maccsKeys = None
209
def _InitKeys(keyList, keyDict):
    """ *Internal Use Only*

    generates SMARTS patterns for the keys, run once

    **Arguments**

      - keyList: a mutable sequence with one slot per entry of keyDict;
        slot ``key-1`` is overwritten with a (query, count) tuple for
        every key whose pattern could be compiled

      - keyDict: a dictionary mapping key number -> (SMARTS string, count)

    **Notes**

      - entries whose pattern is '?' are skipped and their slot in keyList
        is left untouched

      - patterns that cannot be compiled are reported on stdout and their
        slot is left untouched

      - an AssertionError is raised if keyList and keyDict differ in length

    """
    # Kept as an assert so callers see the same exception type as before.
    assert len(keyList) == len(keyDict), 'length mismatch'
    for key, (patt, count) in keyDict.items():
        if patt == '?':
            # placeholder: this key is not defined via SMARTS
            continue
        try:
            sma = Chem.MolFromSmarts(patt)
        except Exception:
            # A parser failure is reported below; unlike a bare ``except``
            # this lets KeyboardInterrupt and SystemExit propagate.
            sma = None
        if not sma:
            print('SMARTS parser error for key #%d: %s' % (key, patt))
        else:
            keyList[key - 1] = sma, count
228
def _pyGenMACCSKeys(mol, **kwargs):
    """ generates the MACCS fingerprint for a molecule

    **Arguments**

      - mol: the molecule to be fingerprinted

      - ctor: (optional keyword) callable used to build the result; it is
        called with the number of bits and defaults to
        DataStructs.SparseBitVect

      - any other keyword arguments are ignored

    **Returns**

      the object built by ctor (by default a _DataStructs.SparseBitVect_)
      containing the fingerprint. Bit i holds MACCS key i; bit 0 is never
      set here.

    >>> m = Chem.MolFromSmiles('CNO')
    >>> bv = GenMACCSKeys(m)
    >>> tuple(bv.GetOnBits())
    (24, 68, 69, 71, 93, 94, 102, 124, 131, 139, 151, 158, 160, 161, 164)
    >>> bv = GenMACCSKeys(Chem.MolFromSmiles('CCC'))
    >>> tuple(bv.GetOnBits())
    (74, 114, 149, 155, 160)

    """
    global maccsKeys
    if maccsKeys is None:
        # first call: compile the SMARTS table into the module-level cache
        maccsKeys = [(None, 0)] * len(smartsPatts.keys())
        _InitKeys(maccsKeys, smartsPatts)
    make_vect = kwargs.get('ctor', DataStructs.SparseBitVect)

    # one extra bit so that key numbers can be used directly as bit indices
    res = make_vect(len(maccsKeys) + 1)
    for bit, (query, threshold) in enumerate(maccsKeys, 1):
        if query is not None:
            if threshold == 0:
                # presence/absence key
                res[bit] = mol.HasSubstructMatch(query)
            elif len(mol.GetSubstructMatches(query)) > threshold:
                # counted key: needs more than `threshold` matches
                res[bit] = 1
            continue

        if bit == 125:
            # special case: more than one ring made up solely of aromatic bonds
            ring_info = mol.GetRingInfo()
            res[125] = 0
            aromatic_rings = 0
            for ring in ring_info.BondRings():
                if all(mol.GetBondWithIdx(idx).GetIsAromatic() for idx in ring):
                    aromatic_rings += 1
                    if aromatic_rings > 1:
                        res[125] = 1
                        break
        elif bit == 166:
            # special case: more than one fragment
            res[166] = 0
            if len(Chem.GetMolFrags(mol)) > 1:
                res[166] = 1

    return res
# Both public entry points are bound to the fingerprinter supplied by
# rdMolDescriptors.
GenMACCSKeys = FingerprintMol = rdMolDescriptors.GetMACCSKeysFingerprint
291
292
293
294
295
def _test():
    """ *Internal Use Only*

    runs the doctests of the module currently executing as ``__main__``
    and returns the result of doctest.testmod

    """
    import sys
    import doctest
    main_module = sys.modules["__main__"]
    return doctest.testmod(main_module)
299
if __name__ == '__main__':
    import sys
    # Run the doctests and use the number of failures as the exit status,
    # so a clean run exits with 0.
    num_failed, num_tried = _test()
    sys.exit(num_failed)
304