Package rdkit :: Package Chem :: Package Fraggle :: Module FraggleSim
[hide private]
[frames] | no frames]

Source Code for Module rdkit.Chem.Fraggle.FraggleSim

  1  # Copyright (c) 2013, GlaxoSmithKline Research & Development Ltd. 
  2  # All rights reserved. 
  3  # 
  4  # Redistribution and use in source and binary forms, with or without 
  5  # modification, are permitted provided that the following conditions are 
  6  # met: 
  7  # 
  8  #     * Redistributions of source code must retain the above copyright 
  9  #       notice, this list of conditions and the following disclaimer. 
 10  #     * Redistributions in binary form must reproduce the above 
 11  #       copyright notice, this list of conditions and the following 
 12  #       disclaimer in the documentation and/or other materials provided 
 13  #       with the distribution. 
 14  #     * Neither the name of GlaxoSmithKline Research & Development Ltd. 
 15  #       nor the names of its contributors may be used to endorse or promote 
 16  #       products derived from this software without specific prior written 
 17  #       permission. 
 18  # 
 19  # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
 20  # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
 21  # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
 22  # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 
 23  # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
 24  # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
 25  # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
 26  # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
 27  # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
 28  # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
 29  # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 30  # 
 31  # Created by Jameed Hussain, May 2013 
 32  import sys 
 33  from rdkit import Chem,DataStructs 
 34   
 35  # our default rdkit fingerprinter parameters: 
 36  rdkitFpParams=dict(maxPath=5,fpSize=1024,nBitsPerHash=2) 
 37   
 38   
 39  #Fragmentation algorithm 
 40  #----------------------- 
 41   
 42  #identify acyclic bonds 
 43  #enumerate all single cuts 
 44      #make sure you chop off more that 1 atom 
 45      #keeps bits which are >60% query mol 
 46  #enumerate all double cuts 
 47      #keeps bits with 1 attachment point (i.e throw middle bit away) 
 48      #need to be >60% query mol 
 49   
 50  #identify exocyclic bonds 
 51  #enumerate all single "ring" cuts 
 52      #Check if it results in more that one component 
 53      #keep correct bit if >40% query mol 
 54   
 55  #enumerate successful "rings" cuts with an acyclic cut 
 56      #Check if it results in more that one component 
 57      #keep correct if >60% query mol 
 58   
 59  #start 
60 -def delete_bonds(mol,bonds,ftype,hac):
61 62 #use the same parent mol object and create editable mol 63 em = Chem.EditableMol(mol) 64 65 #loop through the bonds to delete 66 #print "Breaking bonds between atoms: ",bonds 67 68 for b in bonds: 69 #remove the bond 70 em.RemoveBond(b[0],b[1]) 71 72 #now add attachement points 73 newAtomA = em.AddAtom(Chem.Atom(0)) 74 em.AddBond(b[0],newAtomA,Chem.BondType.SINGLE) 75 76 newAtomB = em.AddAtom(Chem.Atom(0)) 77 em.AddBond(b[1],newAtomB,Chem.BondType.SINGLE) 78 79 #should be able to get away without sanitising mol 80 #as the valencies should be okay 81 modifiedMol = em.GetMol() 82 83 #do not do a full sanitization, but do find rings and calculate valences: 84 Chem.SanitizeMol(modifiedMol,Chem.SanitizeFlags.SANITIZE_PROPERTIES|Chem.SanitizeFlags.SANITIZE_SYMMRINGS) 85 86 fragmented_smi = Chem.MolToSmiles(modifiedMol,True) 87 88 #print fragmented_smi 89 fraggle_framentation = select_fragments(fragmented_smi,ftype,hac) 90 91 return fraggle_framentation
92
93 -def is_ring_cut_valid(smi):
94 #to check is a fragment is a valid ring cut, it needs to match the 95 #smarts: [$([#0][r].[r][#0]),$([#0][r][#0])] 96 atom_count = 0 97 valid = False 98 99 m = Chem.MolFromSmiles(smi) 100 if m is not None: 101 #use gobal smarts 102 if(m.HasSubstructMatch(cSma1) or m.HasSubstructMatch(cSma2)): 103 atom_count = m.GetNumAtoms() 104 valid = True 105 106 return valid,atom_count
107 108
109 -def select_fragments(f_smi,ftype,hac):
110 111 result = "" 112 result_hcount = 0 113 fragments = f_smi.split(".") 114 115 if(ftype == "acyclic"): 116 for f in fragments: 117 attachments = f.count("*") 118 119 #check if terminal fragment 120 if(attachments == 1): 121 fMol = Chem.MolFromSmiles(f) 122 fhac = fMol.GetNumAtoms() 123 124 #if the fragment is 2 atoms (or less - includes attachement) it is too small 125 #to be interesting. This check has the additional benefit 126 #of pulling out the relevant single cuts as it discards 127 #fragments where we only chop off a small part of the input cmpd 128 if(fhac > 3): 129 result = "%s.%s" % (result,f) 130 result_hcount = result_hcount + fhac 131 132 #needs to be greater than 60% of parent mol 133 if( (result != "") and (result_hcount > 0.6*hac) ): 134 #remove first character as it will always be "." 135 result = result[1:] 136 else: 137 result = None 138 139 elif(ftype == "cyclic"): 140 result = None 141 #make sure it is 2 components 142 if( len(fragments) == 2): 143 for f in fragments: 144 #check if a valid cut 145 valid,result_hcount = is_ring_cut_valid(f) 146 147 if(valid): 148 #needs to be greater 3 heavy atoms and greater than 40% of parent mol 149 if((result_hcount > 3) and (result_hcount > 0.4*hac)): 150 result = f 151 152 153 elif(ftype == "cyclic_and_acyclic"): 154 #print f_smi 155 result = "" 156 157 #need to find the fragments which are valid which means they must be: 158 # Terminal (one attachment point) or valid ring cut 159 for f in fragments: 160 attachments = f.count("*") 161 162 if(attachments >= 3): 163 continue 164 165 if(attachments == 2): 166 #check if a valid cut 167 valid,result_hcount = is_ring_cut_valid(f) 168 if(valid): 169 #needs to be greater 3 heavy atoms 170 if(result_hcount > 3): 171 result = "%s.%s" % (result,f) 172 173 elif(attachments == 1): 174 fMol = Chem.MolFromSmiles(f) 175 fhac = fMol.GetNumAtoms() 176 #needs to be greater 3 heavy atoms 177 if(fhac > 3): 178 result = "%s.%s" % (result,f) 179 result_hcount = result_hcount + fhac 180 181 #print "F: %s" % (result) 182 183 #appropriate fragmentations must have 2 components 184 #result will always start with . because of the way it is constructed 185 #hence 2 component result wil contain 2 dots 186 if( (result != "") and (result.count(".") == 2) ): 187 #take off the starting dot when building smiles 188 fMol = Chem.MolFromSmiles(result[1:]) 189 result_hcount = fMol.GetNumAtoms() 190 #needs to be greater 3 heavy atoms and greater than 60% of parent mol 191 if((result_hcount > 3) and (result_hcount > 0.6*hac)): 192 #take off the starting dot 193 result = result[1:] 194 else: 195 result = None 196 else: 197 result = None 198 199 return result
200 201 #Global smarts used by the program 202 #acyclic bond smarts 203 acyc_smarts = Chem.MolFromSmarts("[*]!@!=!#[*]") 204 205 #exocyclic/fused exocyclic bond smarts 206 cyc_smarts = Chem.MolFromSmarts("[R1,R2]@[r;!R1]") 207 208 #smarts used to find appropriate fragment for 209 #would use SMARTS: [$([#0][r].[r][#0]),$([#0][r][#0])] 210 #but rdkit doesn't support component SMARTS in recursive one - $([#0][r].[r][#0]) 211 #hence split into two 212 cSma1 = Chem.MolFromSmarts("[#0][r].[r][#0]") 213 cSma2 = Chem.MolFromSmarts("[#0][r][#0]") 214
215 -def generate_fraggle_fragmentation(mol):
216 """ 217 >>> q = Chem.MolFromSmiles('COc1cc(CN2CCC(NC(=O)c3cncc(C)c3)CC2)c(OC)c2ccccc12') 218 >>> list(generate_fraggle_fragmentation(q)) 219 ['[*]C(=O)NC1CCN(Cc2cc(OC)c3ccccc3c2OC)CC1', '[*]C(=O)c1cncc(C)c1.[*]C1CCN(Cc2cc(OC)c3ccccc3c2OC)CC1', '[*]C(=O)c1cncc(C)c1.[*]Cc1cc(OC)c2ccccc2c1OC', '[*]C(=O)c1cncc(C)c1.[*]c1cc(OC)c2ccccc2c1OC', '[*]C1CCN(Cc2cc(OC)c3ccccc3c2OC)CC1', '[*]C1CCN(Cc2cc(OC)c3ccccc3c2OC)CC1.[*]c1cncc(C)c1', '[*]Cc1cc(OC)c2ccccc2c1OC.[*]NC(=O)c1cncc(C)c1', '[*]Cc1cc(OC)c2ccccc2c1OC.[*]c1cncc(C)c1', '[*]N1CCC(NC(=O)c2cncc(C)c2)CC1.[*]c1cc(OC)c2ccccc2c1OC', '[*]NC(=O)c1cncc(C)c1.[*]c1cc(OC)c2ccccc2c1OC', '[*]NC1CCN(Cc2cc(OC)c3ccccc3c2OC)CC1', '[*]NC1CCN(Cc2cc(OC)c3ccccc3c2OC)CC1.[*]c1cncc(C)c1', '[*]c1c(CN2CCC(NC(=O)c3cncc(C)c3)CC2)cc(OC)c2ccccc12', '[*]c1c(OC)cc(CN2CCC(NC(=O)c3cncc(C)c3)CC2)c(OC)c1[*]', '[*]c1cc(CN2CCC(NC(=O)c3cncc(C)c3)CC2)c(OC)c2ccccc12', '[*]c1cc(OC)c2ccccc2c1OC.[*]c1cncc(C)c1'] 220 """ 221 #query mol heavy atom count 222 hac = mol.GetNumAtoms() 223 224 #different cuts can give the same fragments 225 #to use out_fragments to remove them 226 out_fragments = set() 227 228 ###################### 229 # Single acyclic Cuts 230 ###################### 231 232 #find the relevant bonds to break 233 acyclic_matching_atoms = mol.GetSubstructMatches(acyc_smarts) 234 #print "Matching Atoms:" 235 #print("acyclic matching atoms: ",acyclic_matching_atoms) 236 237 total_acyclic = len(acyclic_matching_atoms) 238 bonds_selected = [] 239 240 #loop to generate every single and double cut in the molecule 241 for x in range( total_acyclic ): 242 #single cuts are not required 243 #relevant single cut fragments can be found from the double cuts 244 #for explanation see check_fragments method 245 for y in range(x+1,total_acyclic): 246 #print matching_atoms[x],matching_atoms[y] 247 bonds_selected.append(acyclic_matching_atoms[x]) 248 bonds_selected.append(acyclic_matching_atoms[y]) 249 fragment = delete_bonds(mol,bonds_selected,"acyclic",hac) 250 if fragment is not None: 251 #print(fragment) 252 out_fragments.add(fragment) 253 bonds_selected = [] 254 255 #print(out_fragments) 256 ################################## 257 # Fused/Spiro exocyclic bond Cuts 258 ################################## 259 260 #find the relevant bonds to break 261 cyclic_matching_atoms = mol.GetSubstructMatches(cyc_smarts) 262 #print("cyclic matching atoms: ",cyclic_matching_atoms) 263 #print "Matching Atoms:" 264 #print matching_atoms 265 266 total_cyclic = len(cyclic_matching_atoms) 267 bonds_selected = [] 268 269 #loop to generate every double cut of relevant bonds 270 for x in range( total_cyclic ): 271 for y in range(x+1,total_cyclic): 272 #print matching_atoms[x],matching_atoms[y] 273 bonds_selected.append(cyclic_matching_atoms[x]) 274 bonds_selected.append(cyclic_matching_atoms[y]) 275 fragment = delete_bonds(mol,bonds_selected,"cyclic",hac) 276 bonds_selected = [] 277 278 if fragment is not None: 279 #print "%s" % (fragment) 280 out_fragments.add(fragment) 281 282 #now do an acyclic cut with the successful cyclic cut 283 for z in range(total_acyclic): 284 bonds_selected.append(cyclic_matching_atoms[x]) 285 bonds_selected.append(cyclic_matching_atoms[y]) 286 bonds_selected.append(acyclic_matching_atoms[z]) 287 fragment = delete_bonds(mol,bonds_selected,"cyclic_and_acyclic",hac) 288 if fragment is not None: 289 #print "%s" % (fragment) 290 out_fragments.add(fragment) 291 bonds_selected = [] 292 293 return sorted(list(out_fragments))
294 295 #atomcontrib algorithm 296 #generate fp of query_substructs (qfp) 297 # 298 #loop through atoms of smiles 299 # For each atom 300 # Generate partial fp of the atom (pfp) 301 # Find Tversky sim of pfp in qfp 302 # If Tversky < 0.8, mark atom in smiles 303 # 304 #Loop thru marked atoms 305 # If marked atom in ring - turn all atoms in that ring to * (aromatic) or Sc (aliphatic) 306 # For each marked atom 307 # If aromatic turn to a * 308 # If aliphatic turn to a Sc 309 # 310 # Return modified smiles
311 -def atomContrib(subs,mol,tverskyThresh=0.8):
312 marked = {} 313 314 def partialFP(atomID,tverskyThresh): 315 316 #create empty fp 317 modifiedFP = DataStructs.ExplicitBitVect(1024) 318 319 modifiedFP.SetBitsFromList(aBits[atomID]) 320 321 tverskySim = DataStructs.TverskySimilarity(subsFp,modifiedFP,0,1) 322 323 if(tverskySim < tverskyThresh): 324 #print "%i %s: %f" % (atomID+1, pMol.GetAtomWithIdx(atomID).GetSymbol(), tverskySim) 325 marked[atomID] = 1
326 327 #generate mol object & fp for input mol 328 aBits = []; 329 330 pMol = Chem.Mol(mol.ToBinary()) 331 pMolFp = Chem.RDKFingerprint(pMol, atomBits=aBits, **rdkitFpParams) 332 333 #generate fp of query_substructs 334 qsMol = Chem.MolFromSmiles(subs) 335 subsFp = Chem.RDKFingerprint(qsMol, **rdkitFpParams) 336 337 #loop through atoms of smiles and mark 338 for atom in pMol.GetAtoms(): 339 #store atoms to change 340 partialFP(atom.GetIdx(),tverskyThresh) 341 342 #get rings to change 343 ssr = pMol.GetRingInfo().AtomRings() 344 345 #loop thru rings and records rings to change 346 ringsToChange = {} 347 for ringList in range(len(ssr)): 348 #print "New ring" 349 for ringAtom in range(len(ssr[ringList])): 350 #print ssr[ringList][ringAtoms] 351 if ssr[ringList][ringAtom] in marked: 352 #print ssr[ringList][ringAtoms] 353 ringsToChange[ringList] = 1 354 355 #now add these ring atoms to marked 356 for ringList in ringsToChange: 357 for ringAtom in range(len(ssr[ringList])): 358 marked[ ssr[ringList][ringAtom] ] = 1 359 360 if(len(marked) > 0): 361 #now mutate the marked atoms 362 for key in marked: 363 #print key 364 if( pMol.GetAtomWithIdx(key).GetIsAromatic() ): 365 #pMol.GetAtomWithIdx(key).SetAtomicNum(91) 366 #this works everytime and causes far fewer problems 367 pMol.GetAtomWithIdx(key).SetAtomicNum(0) 368 pMol.GetAtomWithIdx(key).SetNoImplicit(True) 369 else: 370 #gives best sim 371 pMol.GetAtomWithIdx(key).SetAtomicNum(21) 372 #works better but when replace S it fails due to valency 373 #pMol.GetAtomWithIdx(key).SetAtomicNum(6) 374 375 try: 376 Chem.SanitizeMol(pMol) 377 except: 378 sys.stderr.write("Can't parse smiles: %s\n" % (Chem.MolToSmiles(pMol))) 379 pMol = Chem.Mol(mol.ToBinary()) 380 381 return pMol 382 383 384 modified_query_fps = {}
385 -def compute_fraggle_similarity_for_subs(inMol,qMol,qSmi,qSubs,tverskyThresh=0.8):
386 qFP = Chem.RDKFingerprint(qMol, **rdkitFpParams) 387 iFP = Chem.RDKFingerprint(inMol, **rdkitFpParams) 388 389 rdkit_sim = DataStructs.TanimotoSimilarity(qFP,iFP) 390 391 qm_key = "%s_%s" % (qSubs,qSmi) 392 if qm_key in modified_query_fps: 393 qmMolFp = modified_query_fps[qm_key] 394 else: 395 qmMol = atomContrib(qSubs,qMol,tverskyThresh) 396 qmMolFp = Chem.RDKFingerprint(qmMol, **rdkitFpParams) 397 modified_query_fps[qm_key] = qmMolFp 398 399 rmMol = atomContrib(qSubs,inMol,tverskyThresh) 400 401 #wrap in a try, catch 402 try: 403 rmMolFp = Chem.RDKFingerprint(rmMol, **rdkitFpParams) 404 405 fraggle_sim=max(DataStructs.FingerprintSimilarity(qmMolFp,rmMolFp), 406 rdkit_sim) 407 #print '\t',qSubs,fraggle_sim,rdkit_sim 408 409 except: 410 sys.stderr.write("Can't generate fp for: %s\n" % (Chem.MolToSmiles(rmMol,True))) 411 fraggle_sim = 0.0 412 413 return rdkit_sim,fraggle_sim
414
415 -def GetFraggleSimilarity(queryMol,refMol,tverskyThresh=0.8):
416 """ return the Fraggle similarity between two molecules 417 418 >>> q = Chem.MolFromSmiles('COc1cc(CN2CCC(NC(=O)c3cncc(C)c3)CC2)c(OC)c2ccccc12') 419 >>> m = Chem.MolFromSmiles('COc1cc(CN2CCC(NC(=O)c3ccccc3)CC2)c(OC)c2ccccc12') 420 >>> sim,match = GetFraggleSimilarity(q,m) 421 >>> sim 422 0.980... 423 >>> match 424 '[*]C1CCN(Cc2cc(OC)c3ccccc3c2OC)CC1' 425 426 >>> m = Chem.MolFromSmiles('COc1cc(CN2CCC(Nc3nc4ccccc4s3)CC2)c(OC)c2ccccc12') 427 >>> sim,match = GetFraggleSimilarity(q,m) 428 >>> sim 429 0.794... 430 >>> match 431 '[*]C1CCN(Cc2cc(OC)c3ccccc3c2OC)CC1' 432 433 >>> q = Chem.MolFromSmiles('COc1ccccc1') 434 >>> sim,match = GetFraggleSimilarity(q,m) 435 >>> sim 436 0.347... 437 >>> match 438 '[*]c1ccccc1' 439 440 """ 441 if hasattr(queryMol,'_fraggleDecomp'): 442 frags = queryMol._fraggleDecomp 443 else: 444 frags = generate_fraggle_fragmentation(queryMol) 445 queryMol._fraggleDecomp = frags 446 qSmi = Chem.MolToSmiles(queryMol,True) 447 result=0.0 448 bestMatch=None 449 for frag in frags: 450 rdksim,fragsim= compute_fraggle_similarity_for_subs(refMol,queryMol,qSmi,frag,tverskyThresh) 451 if fragsim>result: 452 result=fragsim 453 bestMatch=frag 454 return result,bestMatch
455 456 457 458 #------------------------------------ 459 # 460 # doctest boilerplate 461 #
462 -def _test():
463 import doctest,sys 464 return doctest.testmod(sys.modules["__main__"],optionflags=doctest.ELLIPSIS+doctest.NORMALIZE_WHITESPACE)
465 466 467 if __name__ == '__main__': 468 import sys 469 failed,tried = _test() 470 sys.exit(failed) 471