Package rdkit :: Package Chem :: Module MACCSkeys
[hide private]
[frames] | no frames]

Source Code for Module rdkit.Chem.MACCSkeys

  1  # $Id$ 
  2  # 
  3  # Copyright (C) 2001-2011 greg Landrum and Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved @@ 
  6  #  This file is part of the RDKit. 
  7  #  The contents are covered by the terms of the BSD license 
  8  #  which is included in the file license.txt, found at the root 
  9  #  of the RDKit source tree. 
 10  # 
 11  """ SMARTS definitions for the publicly available MACCS keys 
 12  and a MACCS fingerprinter 
 13   
 14  I compared the MACCS fingerprints generated here with those from two 
 15  other packages (not MDL, unfortunately). Of course there are 
 16  disagreements between the various fingerprints still, but I think 
 17  these definitions work pretty well. Some notes: 
 18   
 19  1) most of the differences have to do with aromaticity 
 20  2) there's a discrepancy sometimes because the current RDKit 
 21  definitions do not require multiple matches to be distinct. e.g. the 
 22  SMILES C(=O)CC(=O) can match the (hypothetical) key O=CC twice in my 
 23  definition. It's not clear to me what the correct behavior is. 
 24  3) Some keys are not fully defined in the MDL documentation 
 25  4) Two keys, 125 and 166, have to be done outside of SMARTS. 
 26  5) Key 1 (ISOTOPE) isn't defined 
 27   
 28  Rev history: 
 29  2006 (gl): Original open-source release 
 30  May 2011 (gl): Update some definitions based on feedback from Andrew Dalke 
 31   
 32  """ 
 33  from __future__ import print_function 
 34  from rdkit import Chem 
 35  from rdkit.Chem import rdMolDescriptors 
 36  from rdkit import DataStructs 
# these are SMARTS patterns corresponding to the MDL MACCS keys
#
# Each entry maps a key number to a (pattern, count) tuple:
#   - pattern: the SMARTS string for the key. The placeholder '?' marks keys
#     that are not compiled as SMARTS (_InitKeys skips them); of those,
#     _pyGenMACCSKeys computes keys 125 and 166 in Python and never sets key 1.
#   - count: 0 means the bit is set as soon as the pattern matches at all;
#     a value n > 0 means the bit is set only when the pattern has more
#     than n matches.
smartsPatts={
  1:('?',0), # ISOTOPE
  #2:('[#104,#105,#106,#107,#106,#109,#110,#111,#112]',0),  # atomic num >103 Not complete
  2:('[#104]',0),  # limit the above def'n since the RDKit only accepts up to #104
  3:('[#32,#33,#34,#50,#51,#52,#82,#83,#84]',0), # Group IVa,Va,VIa Rows 4-6
  4:('[Ac,Th,Pa,U,Np,Pu,Am,Cm,Bk,Cf,Es,Fm,Md,No,Lr]',0), # actinide
  5:('[Sc,Ti,Y,Zr,Hf]',0), # Group IIIB,IVB (Sc...)
  6:('[La,Ce,Pr,Nd,Pm,Sm,Eu,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu]',0), # Lanthanide
  7:('[V,Cr,Mn,Nb,Mo,Tc,Ta,W,Re]',0), # Group VB,VIB,VIIB
  8:('[!#6;!#1]1~*~*~*~1',0), # QAAA@1
  9:('[Fe,Co,Ni,Ru,Rh,Pd,Os,Ir,Pt]',0), # Group VIII (Fe...)
  10:('[Be,Mg,Ca,Sr,Ba,Ra]',0), # Group IIa (Alkaline earth)
  11:('*1~*~*~*~1',0), # 4M Ring
  12:('[Cu,Zn,Ag,Cd,Au,Hg]',0), # Group IB,IIB (Cu..)
  13:('[#8]~[#7](~[#6])~[#6]',0), # ON(C)C
  14:('[#16]-[#16]',0), # S-S
  15:('[#8]~[#6](~[#8])~[#8]',0), # OC(O)O
  16:('[!#6;!#1]1~*~*~1',0), # QAA@1
  17:('[#6]#[#6]',0), #CTC
  18:('[#5,#13,#31,#49,#81]',0), # Group IIIA (B...)
  19:('*1~*~*~*~*~*~*~1',0), # 7M Ring
  20:('[#14]',0), #Si
  21:('[#6]=[#6](~[!#6;!#1])~[!#6;!#1]',0), # C=C(Q)Q
  22:('*1~*~*~1',0), # 3M Ring
  23:('[#7]~[#6](~[#8])~[#8]',0), # NC(O)O
  24:('[#7]-[#8]',0), # N-O
  25:('[#7]~[#6](~[#7])~[#7]',0), # NC(N)N
  26:('[#6]=;@[#6](@*)@*',0), # C$=C($A)$A
  27:('[I]',0), # I
  28:('[!#6;!#1]~[CH2]~[!#6;!#1]',0), # QCH2Q
  29:('[#15]',0),# P
  30:('[#6]~[!#6;!#1](~[#6])(~[#6])~*',0), # CQ(C)(C)A
  31:('[!#6;!#1]~[F,Cl,Br,I]',0), # QX
  32:('[#6]~[#16]~[#7]',0), # CSN
  33:('[#7]~[#16]',0), # NS
  34:('[CH2]=*',0), # CH2=A
  35:('[Li,Na,K,Rb,Cs,Fr]',0), # Group IA (Alkali Metal)
  36:('[#16R]',0), # S Heterocycle
  37:('[#7]~[#6](~[#8])~[#7]',0), # NC(O)N
  38:('[#7]~[#6](~[#6])~[#7]',0), # NC(C)N
  39:('[#8]~[#16](~[#8])~[#8]',0), # OS(O)O
  40:('[#16]-[#8]',0), # S-O
  41:('[#6]#[#7]',0), # CTN
  42:('F',0), # F
  43:('[!#6;!#1;!H0]~*~[!#6;!#1;!H0]',0), # QHAQH
  44:('[!#1;!#6;!#7;!#8;!#9;!#14;!#15;!#16;!#17;!#35;!#53]',0), # OTHER
  45:('[#6]=[#6]~[#7]',0), # C=CN
  46:('Br',0), # BR
  47:('[#16]~*~[#7]',0), # SAN
  48:('[#8]~[!#6;!#1](~[#8])(~[#8])',0), # OQ(O)O
  49:('[!+0]',0), # CHARGE
  50:('[#6]=[#6](~[#6])~[#6]',0), # C=C(C)C
  51:('[#6]~[#16]~[#8]',0), # CSO
  52:('[#7]~[#7]',0), # NN
  53:('[!#6;!#1;!H0]~*~*~*~[!#6;!#1;!H0]',0), # QHAAAQH
  54:('[!#6;!#1;!H0]~*~*~[!#6;!#1;!H0]',0), # QHAAQH
  55:('[#8]~[#16]~[#8]',0), #OSO
  56:('[#8]~[#7](~[#8])~[#6]',0), # ON(O)C
  57:('[#8R]',0), # O Heterocycle
  58:('[!#6;!#1]~[#16]~[!#6;!#1]',0), # QSQ
  59:('[#16]!:*:*',0), # Snot%A%A
  60:('[#16]=[#8]',0), # S=O
  61:('*~[#16](~*)~*',0), # AS(A)A
  62:('*@*!@*@*',0), # A$!A$A
  63:('[#7]=[#8]',0), # N=O
  64:('*@*!@[#16]',0), # A$A!S
  65:('c:n',0), # C%N
  66:('[#6]~[#6](~[#6])(~[#6])~*',0), # CC(C)(C)A
  67:('[!#6;!#1]~[#16]',0), # QS
  68:('[!#6;!#1;!H0]~[!#6;!#1;!H0]',0), # QHQH (&...) SPEC Incomplete
  69:('[!#6;!#1]~[!#6;!#1;!H0]',0), # QQH
  70:('[!#6;!#1]~[#7]~[!#6;!#1]',0), # QNQ
  71:('[#7]~[#8]',0), # NO
  72:('[#8]~*~*~[#8]',0), # OAAO
  73:('[#16]=*',0), # S=A
  74:('[CH3]~*~[CH3]',0), # CH3ACH3
  75:('*!@[#7]@*',0), # A!N$A
  76:('[#6]=[#6](~*)~*',0), # C=C(A)A
  77:('[#7]~*~[#7]',0), # NAN
  78:('[#6]=[#7]',0), # C=N
  79:('[#7]~*~*~[#7]',0), # NAAN
  80:('[#7]~*~*~*~[#7]',0), # NAAAN
  81:('[#16]~*(~*)~*',0), # SA(A)A
  82:('*~[CH2]~[!#6;!#1;!H0]',0), # ACH2QH
  83:('[!#6;!#1]1~*~*~*~*~1',0), # QAAAA@1
  84:('[NH2]',0), #NH2
  85:('[#6]~[#7](~[#6])~[#6]',0), # CN(C)C
  86:('[C;H2,H3][!#6;!#1][C;H2,H3]',0), # CH2QCH2
  87:('[F,Cl,Br,I]!@*@*',0), # X!A$A
  88:('[#16]',0), # S
  89:('[#8]~*~*~*~[#8]',0), # OAAAO
  90:('[$([!#6;!#1;!H0]~*~*~[CH2]~*),$([!#6;!#1;!H0;R]1@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~[R]1@[R]@[CH2;R]1)]',0), # QHAACH2A
  91:('[$([!#6;!#1;!H0]~*~*~*~[CH2]~*),$([!#6;!#1;!H0;R]1@[R]@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~[R]1@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~*~[R]1@[R]@[CH2;R]1)]',0), # QHAAACH2A
  92:('[#8]~[#6](~[#7])~[#6]',0), # OC(N)C
  93:('[!#6;!#1]~[CH3]',0), # QCH3
  94:('[!#6;!#1]~[#7]',0), # QN
  95:('[#7]~*~*~[#8]',0), # NAAO
  96:('*1~*~*~*~*~1',0), # 5 M ring
  97:('[#7]~*~*~*~[#8]',0), # NAAAO
  98:('[!#6;!#1]1~*~*~*~*~*~1',0), # QAAAAA@1
  99:('[#6]=[#6]',0), # C=C
  100:('*~[CH2]~[#7]',0), # ACH2N
  101:('[$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1)]',0), # 8M Ring or larger. This only handles up to ring sizes of 14
  102:('[!#6;!#1]~[#8]',0), # QO
  103:('Cl',0), # CL
  104:('[!#6;!#1;!H0]~*~[CH2]~*',0), # QHACH2A
  105:('*@*(@*)@*',0), # A$A($A)$A
  106:('[!#6;!#1]~*(~[!#6;!#1])~[!#6;!#1]',0), # QA(Q)Q
  107:('[F,Cl,Br,I]~*(~*)~*',0), # XA(A)A
  108:('[CH3]~*~*~*~[CH2]~*',0), # CH3AAACH2A
  109:('*~[CH2]~[#8]',0), # ACH2O
  110:('[#7]~[#6]~[#8]',0), # NCO
  111:('[#7]~*~[CH2]~*',0), # NACH2A
  112:('*~*(~*)(~*)~*',0), # AA(A)(A)A
  113:('[#8]!:*:*',0), # Onot%A%A
  114:('[CH3]~[CH2]~*',0), # CH3CH2A
  115:('[CH3]~*~[CH2]~*',0), # CH3ACH2A
  116:('[$([CH3]~*~*~[CH2]~*),$([CH3]~*1~*~[CH2]1)]',0), # CH3AACH2A
  117:('[#7]~*~[#8]',0), # NAO
  118:('[$(*~[CH2]~[CH2]~*),$(*1~[CH2]~[CH2]1)]',1), # ACH2CH2A > 1
  119:('[#7]=*',0), # N=A
  120:('[!#6;R]',1), # Heterocyclic atom > 1 (&...) Spec Incomplete
  121:('[#7;R]',0), # N Heterocycle
  122:('*~[#7](~*)~*',0), # AN(A)A
  123:('[#8]~[#6]~[#8]',0), # OCO
  124:('[!#6;!#1]~[!#6;!#1]',0), # QQ
  125:('?',0), # Aromatic Ring > 1
  126:('*!@[#8]!@*',0), # A!O!A
  127:('*@*!@[#8]',1), # A$A!O > 1 (&...) Spec Incomplete
  128:('[$(*~[CH2]~*~*~*~[CH2]~*),$([R]1@[CH2;R]@[R]@[R]@[R]@[CH2;R]1),$(*~[CH2]~[R]1@[R]@[R]@[CH2;R]1),$(*~[CH2]~*~[R]1@[R]@[CH2;R]1)]',0), # ACH2AAACH2A
  129:('[$(*~[CH2]~*~*~[CH2]~*),$([R]1@[CH2]@[R]@[R]@[CH2;R]1),$(*~[CH2]~[R]1@[R]@[CH2;R]1)]',0), # ACH2AACH2A
  130:('[!#6;!#1]~[!#6;!#1]',1), # QQ > 1 (&...)  Spec Incomplete
  131:('[!#6;!#1;!H0]',1), # QH > 1
  132:('[#8]~*~[CH2]~*',0), # OACH2A
  133:('*@*!@[#7]',0), # A$A!N
  134:('[F,Cl,Br,I]',0), # X (HALOGEN)
  135:('[#7]!:*:*',0), # Nnot%A%A
  136:('[#8]=*',1), # O=A>1
  137:('[!C;!c;R]',0), # Heterocycle
  138:('[!#6;!#1]~[CH2]~*',1), # QCH2A>1 (&...) Spec Incomplete
  139:('[O;!H0]',0), # OH
  140:('[#8]',3), # O > 3 (&...) Spec Incomplete
  141:('[CH3]',2), # CH3 > 2  (&...) Spec Incomplete
  142:('[#7]',1), # N > 1
  143:('*@*!@[#8]',0), # A$A!O
  144:('*!:*:*!:*',0), # Anot%A%Anot%A
  145:('*1~*~*~*~*~*~1',1), # 6M ring > 1
  146:('[#8]',2), # O > 2
  147:('[$(*~[CH2]~[CH2]~*),$([R]1@[CH2;R]@[CH2;R]1)]',0), # ACH2CH2A
  148:('*~[!#6;!#1](~*)~*',0), # AQ(A)A
  149:('[C;H3,H4]',1), # CH3 > 1
  150:('*!@*@*!@*',0), # A!A$A!A
  151:('[#7;!H0]',0), # NH
  152:('[#8]~[#6](~[#6])~[#6]',0), # OC(C)C
  153:('[!#6;!#1]~[CH2]~*',0), # QCH2A
  154:('[#6]=[#8]',0), # C=O
  155:('*!@[CH2]!@*',0), # A!CH2!A
  156:('[#7]~*(~*)~*',0), # NA(A)A
  157:('[#6]-[#8]',0), # C-O
  158:('[#6]-[#7]',0), # C-N
  159:('[#8]',1), # O>1
  160:('[C;H3,H4]',0), #CH3
  161:('[#7]',0), # N
  162:('a',0), # Aromatic
  163:('*1~*~*~*~*~*~1',0), # 6M Ring
  164:('[#8]',0), # O
  165:('[R]',0), # Ring
  166:('?',0), # Fragments  FIX: this can't be done in SMARTS
  }

# cache of (compiled pattern, count) tuples, indexed by key number - 1;
# stays None until the first call to _pyGenMACCSKeys() fills it in
maccsKeys = None
209   
210 -def _InitKeys(keyList,keyDict):
211 """ *Internal Use Only* 212 213 generates SMARTS patterns for the keys, run once 214 215 """ 216 assert len(keyList) == len(keyDict.keys()),'length mismatch' 217 for key in keyDict.keys(): 218 patt,count = keyDict[key] 219 if patt != '?': 220 try: 221 sma = Chem.MolFromSmarts(patt) 222 except: 223 sma = None 224 if not sma: 225 print('SMARTS parser error for key #%d: %s'%(key,patt)) 226 else: 227 keyList[key-1] = sma,count
228
def _pyGenMACCSKeys(mol,**kwargs):
  """ generates the MACCS fingerprint for a molecule (Python implementation)

  **Arguments**

    - mol: the molecule to be fingerprinted

    - ctor: (optional keyword) callable used to construct the result; it
      is called with the number of bits and defaults to
      DataStructs.SparseBitVect

    - any other keyword arguments are ignored

  **Returns**

    a _DataStructs.SparseBitVect_ (or whatever _ctor_ constructs)
    containing the fingerprint.

  **Notes**

    - the compiled key patterns are built on the first call and cached in
      the module-level maccsKeys list

    - bit 0 is never set here; key N is stored in bit N

  >>> m = Chem.MolFromSmiles('CNO')
  >>> bv = GenMACCSKeys(m)
  >>> tuple(bv.GetOnBits())
  (24, 68, 69, 71, 93, 94, 102, 124, 131, 139, 151, 158, 160, 161, 164)
  >>> bv = GenMACCSKeys(Chem.MolFromSmiles('CCC'))
  >>> tuple(bv.GetOnBits())
  (74, 114, 149, 155, 160)

  """
  global maccsKeys
  if maccsKeys is None:
    # first call: compile the SMARTS table once and keep it around
    maccsKeys = [(None,0)]*len(smartsPatts.keys())
    _InitKeys(maccsKeys,smartsPatts)
  ctor=kwargs.get('ctor',DataStructs.SparseBitVect)

  fp = ctor(len(maccsKeys)+1)
  for bitId,(query,threshold) in enumerate(maccsKeys,1):
    if query is not None:
      if threshold==0:
        # presence/absence key
        fp[bitId] = mol.HasSubstructMatch(query)
      elif len(mol.GetSubstructMatches(query)) > threshold:
        # counted key: needs more than `threshold` matches
        fp[bitId] = 1
    elif bitId==125:
      # special case: num aromatic rings > 1
      ringInfo = mol.GetRingInfo()
      fp[125]=0
      aromaticRings=0
      for ring in ringInfo.BondRings():
        # a ring counts as aromatic only if every one of its bonds is
        if all(mol.GetBondWithIdx(idx).GetIsAromatic() for idx in ring):
          aromaticRings+=1
          if aromaticRings>1:
            fp[125]=1
            break
    elif bitId==166:
      # special case: num frags > 1
      fp[166]=0
      if len(Chem.GetMolFrags(mol))>1:
        fp[166]=1

  return fp
# GenMACCSKeys and FingerprintMol are two names for the same callable,
# rdMolDescriptors.GetMACCSKeysFingerprint; _pyGenMACCSKeys is not bound
# to either public name
GenMACCSKeys = FingerprintMol = rdMolDescriptors.GetMACCSKeysFingerprint

#------------------------------------
#
#  doctest boilerplate
#
296 -def _test():
297 import doctest,sys 298 return doctest.testmod(sys.modules["__main__"])
if __name__ == '__main__':
  import sys
  # _test() returns doctest's (failed, attempted) pair; the number of
  # failures becomes the process exit status, so a clean run exits with 0
  testResults = _test()
  failed,tried = testResults
  sys.exit(failed)