Package rdkit :: Package SimDivFilters :: Module SimilarityPickers
[hide private]
[frames | no frames]

Source Code for Module rdkit.SimDivFilters.SimilarityPickers

  1  # $Id$ 
  2  # 
  3  # Copyright (C) 2003-2008 Greg Landrum and Rational Discovery LLC 
  4  #  All Rights Reserved 
  5  # 
  6  from __future__ import print_function 
  7  from rdkit import RDConfig 
  8  from rdkit import DataStructs 
  9  from rdkit.DataStructs.TopNContainer import TopNContainer 
 10  import bisect 
 11   
class GenericPicker(object):
    """Base class for pickers whose results are computed on first access.

    Subclasses override MakePicks() so that it fills in self._picks.
    len(picker) and picker[i] call MakePicks() themselves whenever
    self._picks is still None, then answer from self._picks.
    """
    # None means "picks have not been computed yet"
    _picks = None

    def MakePicks(self, force=0):
        """Compute the picks; this base implementation always raises.

        Raises NotImplementedError because this class provides no
        picking strategy of its own.
        """
        raise NotImplementedError("GenericPicker is a virtual base class")

    def __len__(self):
        """Return the number of picks, computing them first if needed."""
        picks = self._picks
        if picks is None:
            self.MakePicks()
            # re-read: MakePicks() is expected to have replaced the attribute
            picks = self._picks
        return len(picks)

    def __getitem__(self, which):
        """Return the pick at position `which`, computing picks if needed."""
        picks = self._picks
        if picks is None:
            self.MakePicks()
            picks = self._picks
        return picks[which]
24
class TopNOverallPicker(GenericPicker):
    """  A class for picking the top N overall best matches across a library

    Connect to a database and build molecules:
    >>> from rdkit import Chem
    >>> import os.path
    >>> from rdkit.Dbase.DbConnection import DbConnect
    >>> dbName = RDConfig.RDTestDatabase
    >>> conn = DbConnect(dbName,'simple_mols1')
    >>> [x.upper() for x in conn.GetColumnNames()]
    ['SMILES', 'ID']
    >>> mols = []
    >>> for smi,id in conn.GetData():
    ...     mol = Chem.MolFromSmiles(str(smi))
    ...     mol.SetProp('_Name',str(id))
    ...     mols.append(mol)
    >>> len(mols)
    12

    Calculate fingerprints:
    >>> probefps = []
    >>> for mol in mols:
    ...     fp = Chem.RDKFingerprint(mol)
    ...     fp._id = mol.GetProp('_Name')
    ...     probefps.append(fp)

    Start by finding the top matches for a single probe.  This ether should pull
    other ethers from the db:
    >>> mol = Chem.MolFromSmiles('COC')
    >>> probeFp = Chem.RDKFingerprint(mol)
    >>> picker = TopNOverallPicker(numToPick=2,probeFps=[probeFp],dataSet=probefps)
    >>> len(picker)
    2
    >>> fp,score = picker[0]
    >>> id = fp._id
    >>> str(id)
    'ether-1'
    >>> score
    1.0

    The results come back in order:
    >>> fp,score = picker[1]
    >>> id = fp._id
    >>> str(id)
    'ether-2'

    Now find the top matches for 2 probes.  We'll get one ether and one acid:
    >>> fps = []
    >>> fps.append(Chem.RDKFingerprint(Chem.MolFromSmiles('COC')))
    >>> fps.append(Chem.RDKFingerprint(Chem.MolFromSmiles('CC(=O)O')))
    >>> picker = TopNOverallPicker(numToPick=3,probeFps=fps,dataSet=probefps)
    >>> len(picker)
    3
    >>> fp,score = picker[0]
    >>> id = fp._id
    >>> str(id)
    'acid-1'
    >>> fp,score = picker[1]
    >>> id = fp._id
    >>> str(id)
    'ether-1'
    >>> score
    1.0
    >>> fp,score = picker[2]
    >>> id = fp._id
    >>> str(id)
    'acid-2'

    """

    def __init__(self, numToPick=10, probeFps=None, dataSet=None,
                 simMetric=DataStructs.TanimotoSimilarity):
        """Record the picking parameters; nothing is scored here.

        dataSet should be a sequence of BitVectors

        Arguments:
          - numToPick: capacity handed to the TopNContainer in MakePicks()
          - probeFps: the probe fingerprints every data entry is scored against
          - dataSet: the fingerprints to pick from
          - simMetric: metric passed to DataStructs.FingerprintSimilarity

        """
        self._picks = None
        self.simMetric = simMetric
        self.data = dataSet
        self.probes = probeFps
        self.numToPick = numToPick

    def MakePicks(self, force=0):
        """Score every data entry by its best probe and keep the top ones.

        Each entry of self.data is compared with every probe; its score
        is the largest similarity found (-1.0 when there are no probes
        to compare against).  The entries retained by the TopNContainer
        are stored in self._picks as (fingerprint, score) tuples, in the
        reverse of the container's iteration order.

        Does nothing when picks already exist, unless `force` is true.
        """
        alreadyPicked = self._picks is not None
        if alreadyPicked and not force:
            return

        topN = TopNContainer(self.numToPick)
        similarity = DataStructs.FingerprintSimilarity
        for candidate in self.data:
            best = -1.0
            for probe in self.probes:
                best = max(similarity(candidate, probe, self.simMetric), best)
            topN.Insert(best, candidate)

        # the container yields (score, item); callers get (item, score)
        ranked = [(item, value) for value, item in topN]
        ranked.reverse()
        self._picks = ranked
123
class SpreadPicker(GenericPicker):
    """  A class for picking the best matches across a library

    Picks are made probe by probe in rotation: each probe in turn
    contributes its most similar fingerprint that has not already been
    picked.

    Connect to a database:
    >>> from rdkit import Chem
    >>> import os.path
    >>> from rdkit.Dbase.DbConnection import DbConnect
    >>> dbName = RDConfig.RDTestDatabase
    >>> conn = DbConnect(dbName,'simple_mols1')
    >>> [x.upper() for x in conn.GetColumnNames()]
    ['SMILES', 'ID']
    >>> mols = []
    >>> for smi,id in conn.GetData():
    ...     mol = Chem.MolFromSmiles(str(smi))
    ...     mol.SetProp('_Name',str(id))
    ...     mols.append(mol)
    >>> len(mols)
    12

    Calculate fingerprints:
    >>> probefps = []
    >>> for mol in mols:
    ...     fp = Chem.RDKFingerprint(mol)
    ...     fp._id = mol.GetProp('_Name')
    ...     probefps.append(fp)

    Start by finding the top matches for a single probe.  This ether should pull
    other ethers from the db:
    >>> mol = Chem.MolFromSmiles('COC')
    >>> probeFp = Chem.RDKFingerprint(mol)
    >>> picker = SpreadPicker(numToPick=2,probeFps=[probeFp],dataSet=probefps)
    >>> len(picker)
    2
    >>> fp,score = picker[0]
    >>> id = fp._id
    >>> str(id)
    'ether-1'
    >>> score
    1.0

    The results come back in order:
    >>> fp,score = picker[1]
    >>> id = fp._id
    >>> str(id)
    'ether-2'

    Now find the top matches for 2 probes.  We'll get one ether and one acid:
    >>> fps = []
    >>> fps.append(Chem.RDKFingerprint(Chem.MolFromSmiles('COC')))
    >>> fps.append(Chem.RDKFingerprint(Chem.MolFromSmiles('CC(=O)O')))
    >>> picker = SpreadPicker(numToPick=3,probeFps=fps,dataSet=probefps)
    >>> len(picker)
    3
    >>> fp,score = picker[0]
    >>> id = fp._id
    >>> str(id)
    'ether-1'
    >>> score
    1.0
    >>> fp,score = picker[1]
    >>> id = fp._id
    >>> str(id)
    'acid-1'
    >>> score
    1.0
    >>> fp,score = picker[2]
    >>> id = fp._id
    >>> str(id)
    'ether-2'

    """

    def __init__(self, numToPick=10, probeFps=None, dataSet=None,
                 simMetric=DataStructs.TanimotoSimilarity,
                 expectPickles=True, onlyNames=False):
        """

        dataSet should be a sequence of BitVectors or, if expectPickles
        is False, a set of strings that can be converted to bit vectors

        Arguments:
          - numToPick: maximum number of picks to make
          - probeFps: sequence of probe fingerprints
          - dataSet: the fingerprints to pick from
          - simMetric: metric passed to DataStructs.FingerprintSimilarity
          - expectPickles: stored on the instance; MakePicks() does not
            consult it
          - onlyNames: if true, MakePicks() stores
            fp._fieldsFromDb[0] instead of the fingerprint itself for
            every data entry that has a _fieldsFromDb attribute

        """
        self.numToPick = numToPick
        self.probes = probeFps
        self.data = dataSet
        self.simMetric = simMetric
        self.expectPickles = expectPickles
        self.onlyNames = onlyNames

        self._picks = None

    def MakePicks(self, force=0, silent=True):
        """Select up to numToPick entries, rotating through the probes.

        Every data entry is scored against every probe.  Then, starting
        with the first probe and cycling, each probe contributes its
        highest-scoring entry that no probe has contributed yet.  The
        result is stored in self._picks as a list of (entry, score)
        tuples in the order they were picked.

        Fewer than numToPick picks are produced when the data set is
        exhausted first; no picks are produced when there are no probes.

        Arguments:
          - force: recompute even if picks already exist
          - silent: if false, print a progress line every 1000 entries

        """
        if self._picks is not None and not force:
            return

        probes = self.probes if self.probes is not None else []
        data = self.data if self.data is not None else []
        nProbes = len(probes)
        numToPick = self.numToPick

        # one ascending list of (score, data index) per probe
        scores = [[] for _ in range(nProbes)]
        fps = []
        for j, origFp in enumerate(data):
            for probeFp, row in zip(probes, scores):
                score = DataStructs.FingerprintSimilarity(probeFp, origFp,
                                                          self.simMetric)
                bisect.insort(row, (score, j))
                # Only the numToPick best entries of a row can ever be
                # needed: fewer than numToPick entries are taken whenever
                # a row is consulted, so one of its top numToPick is
                # always still free.  Rows are ascending, so the excess
                # sits at the front.
                excess = len(row) - numToPick
                if excess > 0:
                    del row[:excess]
            if self.onlyNames and hasattr(origFp, '_fieldsFromDb'):
                fps.append(origFp._fieldsFromDb[0])
            else:
                fps.append(origFp)
            nScored = j + 1
            if not silent and not nScored % 1000:
                print('scored %d fps' % nScored)

        # now go probe by probe and select the current top entry until
        # we are finished:
        picks = []
        taken = [False] * len(fps)
        while scores and len(picks) < numToPick:
            row = scores[len(picks) % nProbes]
            # drop entries that were already picked via another row
            while row and taken[row[-1][1]]:
                row.pop()
            if not row:
                # every row is scored against the same entries, so an
                # exhausted row means every entry has been picked
                break
            score, idx = row.pop()
            taken[idx] = True
            picks.append((fps[idx], score))
        self._picks = picks
260 261 #------------------------------------ 262 # 263 # doctest boilerplate 264 #
265 -def _test():
266 import doctest,sys 267 return doctest.testmod(sys.modules["__main__"])
# When executed as a script: run the doctests and use the number of
# failures as the process exit status.
if __name__ == '__main__':
    import sys
    nFailed, nTried = _test()
    sys.exit(nFailed)