1
2
3
4
5
6
7
8
9
10 from rdkit import Chem
11 from rdkit.Chem import AllChem
12 from rdkit.Chem import Lipinski,Descriptors,Crippen
13 from rdkit.Dbase.DbConnection import DbConnect
14 from rdkit.Dbase import DbModule
15 import re
16
17
18 import rdkit.RDLogger as logging
19 logger = logging.logger()
20 logger.setLevel(logging.INFO)
21
22 -def ProcessMol(mol,typeConversions,globalProps,nDone,nameProp='_Name',nameCol='compound_id',
23 redraw=False,keepHs=False,
24 skipProps=False,addComputedProps=False,
25 skipSmiles=False,
26 uniqNames=None,namesSeen=None):
27 if not mol:
28 raise ValueError('no molecule')
29 if keepHs:
30 Chem.SanitizeMol(mol)
31 try:
32 nm = mol.GetProp(nameProp)
33 except KeyError:
34 nm = None
35 if not nm:
36 nm = 'Mol_%d'%nDone
37 if uniqNames and nm in namesSeen:
38 logger.error('duplicate compound id (%s) encountered. second instance skipped.'%nm)
39 return None
40 namesSeen.add(nm)
41 row = [nm]
42 if not skipProps:
43 if addComputedProps:
44 nHD=Lipinski.NumHDonors(mol)
45 mol.SetProp('DonorCount',str(nHD))
46 nHA=Lipinski.NumHAcceptors(mol)
47 mol.SetProp('AcceptorCount',str(nHA))
48 nRot=Lipinski.NumRotatableBonds(mol)
49 mol.SetProp('RotatableBondCount',str(nRot))
50 MW=Descriptors.MolWt(mol)
51 mol.SetProp('AMW',str(MW))
52 logp=Crippen.MolLogP(mol)
53 mol.SetProp('MolLogP',str(logp))
54
55 pns = list(mol.GetPropNames())
56 pD={}
57 for pi,pn in enumerate(pns):
58 if pn.lower()==nameCol.lower(): continue
59 pv = mol.GetProp(pn).strip()
60 if pv.find('>')<0 and pv.find('<')<0:
61 colTyp = globalProps.get(pn,2)
62 while colTyp>0:
63 try:
64 tpi = typeConversions[colTyp][1](pv)
65 except:
66 colTyp-=1
67 else:
68 break
69 globalProps[pn]=colTyp
70 pD[pn]=typeConversions[colTyp][1](pv)
71 else:
72 pD[pn]=pv
73 else:
74 pD={}
75 if redraw:
76 AllChem.Compute2DCoords(m)
77 if not skipSmiles:
78 row.append(Chem.MolToSmiles(mol,True))
79 row.append(DbModule.binaryHolder(mol.ToBinary()))
80 row.append(pD)
81 return row
82
83 -def ConvertRows(rows,globalProps,defaultVal,skipSmiles):
84 for i,row in enumerate(rows):
85 newRow = [row[0],row[1]]
86 pD=row[-1]
87 for pn in globalProps:
88 pv = pD.get(pn,defaultVal)
89 newRow.append(pv)
90 newRow.append(row[2])
91 if not skipSmiles:
92 newRow.append(row[3])
93 rows[i] = newRow
94
95 -def LoadDb(suppl,dbName,nameProp='_Name',nameCol='compound_id',silent=False,
96 redraw=False,errorsTo=None,keepHs=False,defaultVal='N/A',skipProps=False,
97 regName='molecules',skipSmiles=False,maxRowsCached=-1,
98 uniqNames=False,addComputedProps=False,lazySupplier=False,
99 startAnew=True):
100 if not lazySupplier:
101 nMols = len(suppl)
102 else:
103 nMols=-1
104 if not silent:
105 logger.info("Generating molecular database in file %s"%dbName)
106 if not lazySupplier:
107 logger.info(" Processing %d molecules"%nMols)
108 rows = []
109 globalProps = {}
110 namesSeen = set()
111 nDone = 0
112 typeConversions={0:('varchar',str),1:('float',float),2:('int',int)}
113 for m in suppl:
114 nDone +=1
115 if not m:
116 if errorsTo:
117 if hasattr(suppl,'GetItemText'):
118 d = suppl.GetItemText(nDone-1)
119 errorsTo.write(d)
120 else:
121 logger.warning('full error file support not complete')
122 continue
123
124 row=ProcessMol(m,typeConversions,globalProps,nDone,nameProp=nameProp,
125 nameCol=nameCol,redraw=redraw,
126 keepHs=keepHs,skipProps=skipProps,
127 addComputedProps=addComputedProps,skipSmiles=skipSmiles,
128 uniqNames=uniqNames,namesSeen=namesSeen)
129 if row is None: continue
130 rows.append([nDone]+row)
131 if not silent and not nDone%100:
132 logger.info(' done %d'%nDone)
133 if len(rows)==maxRowsCached:
134 break
135
136 nameDef='%s varchar not null'%nameCol
137 if uniqNames:
138 nameDef += ' unique'
139 typs = ['guid integer not null primary key',nameDef]
140 pns = []
141 for pn,v in globalProps.items():
142 addNm = re.sub(r'[\W]','_',pn)
143 typs.append('%s %s'%(addNm,typeConversions[v][0]))
144 pns.append(pn.lower())
145
146 if not skipSmiles:
147 if 'smiles' not in pns:
148 typs.append('smiles varchar')
149 else:
150 typs.append('cansmiles varchar')
151 typs.append('molpkl %s'%(DbModule.binaryTypeName))
152 conn = DbConnect(dbName)
153 curs = conn.GetCursor()
154 if startAnew:
155 try:
156 curs.execute('drop table %s'%regName)
157 except:
158 pass
159 curs.execute('create table %s (%s)'%(regName,','.join(typs)))
160 else:
161 curs.execute('select * from %s limit 1'%(regName,))
162 ocolns = set([x[0] for x in curs.description])
163 ncolns = set([x.split()[0] for x in typs])
164 if ncolns != ocolns:
165 raise ValueError('Column names do not match: %s != %s'%(ocolns,ncolns))
166 curs.execute('select max(guid) from %s'%(regName,))
167 offset = curs.fetchone()[0]
168 for row in rows:
169 row[0] += offset
170
171 qs = ','.join([DbModule.placeHolder for x in typs])
172
173
174 ConvertRows(rows,globalProps,defaultVal,skipSmiles)
175 curs.executemany('insert into %s values (%s)'%(regName,qs),rows)
176 conn.Commit()
177
178 rows = []
179 while 1:
180 nDone +=1
181 try:
182 m = next(suppl)
183 except StopIteration:
184 break
185 if not m:
186 if errorsTo:
187 if hasattr(suppl,'GetItemText'):
188 d = suppl.GetItemText(nDone-1)
189 errorsTo.write(d)
190 else:
191 logger.warning('full error file support not complete')
192 continue
193 tmpProps={}
194 row=ProcessMol(m,typeConversions,globalProps,nDone,nameProp=nameProp,
195 nameCol=nameCol,redraw=redraw,
196 keepHs=keepHs,skipProps=skipProps,
197 addComputedProps=addComputedProps,skipSmiles=skipSmiles,
198 uniqNames=uniqNames,namesSeen=namesSeen)
199 if not row: continue
200 rows.append([nDone]+row)
201 if not silent and not nDone%100:
202 logger.info(' done %d'%nDone)
203 if len(rows)==maxRowsCached:
204 ConvertRows(rows,globalProps,defaultVal,skipSmiles)
205 curs.executemany('insert into %s values (%s)'%(regName,qs),rows)
206 conn.Commit()
207 rows = []
208 if len(rows):
209 ConvertRows(rows,globalProps,defaultVal,skipSmiles)
210 curs.executemany('insert into %s values (%s)'%(regName,qs),rows)
211 conn.Commit()
212