1
2
3
4
5
6
7
8
9
10
11 """ Tools for doing PubMed searches and processing the results
12
13 NOTE: much of the example code in the documentation here uses XML
14 files from the test_data directory in order to avoid having to call
15 out to PubMed itself. Actual calls to the functions would not include
16 the _conn_ argument.
17
18 """
19 from __future__ import print_function
20 from rdkit import RDConfig
21 import QueryParams,Records
22 import urllib,urllib2
23 from xml.etree import ElementTree
24
def openURL(url, args):
  """ Opens a connection to PubMed and returns a file-like response object.

    **Arguments**
      - url: the base URL to connect to (presumably one of the E-utilities
        base URLs from QueryParams -- TODO confirm against callers)
      - args: the urlencoded query string to POST

    **Returns** a file-like object (the open connection)
  """
  proxy_support = urllib2.ProxyHandler({})
  opener = urllib2.build_opener(proxy_support)
  # Bug fix: the opener was previously built but never used (urllib2.urlopen
  # uses the default opener), so the proxy configuration was silently ignored.
  conn = opener.open(url, args)
  return conn
30
def GetNumHits(query, url=QueryParams.searchBase):
  """ returns the number of PubMed hits for the query provided

  To do a search, we need a query object:
  >>> query = QueryParams.details()

  set up the search parameters:
  >>> query['term'] = 'penzotti je AND grootenhuis pd'
  >>> query['field'] = 'auth'

  now get the search ids:
  >>> counts = GetNumHits(query)
  >>> counts
  2

  alternately, we can search using field specifiers:
  >>> query = QueryParams.details()
  >>> query['term'] = 'penzotti je[au] AND hydrogen bonding[mh]'
  >>> counts = GetNumHits(query)
  >>> counts
  3


  """
  # NOTE: this mutates the caller's query object (adds 'rettype');
  # kept for backward compatibility.
  query['rettype'] = 'count'
  conn = openURL(url, urllib.urlencode(query))
  pubmed = ElementTree.parse(conn)
  countText = pubmed.findtext('Count')
  # a missing/empty <Count> element is treated as zero hits
  if countText:
    res = int(countText)
  else:
    res = 0
  return res
64
65
def GetSearchIds(query, url=QueryParams.searchBase):
  """ returns a tuple of pubmed ids (strings) for the query provided

  To do a search, we need a query object:
  >>> query = QueryParams.details()

  set up the search parameters:
  >>> query['term'] = 'penzotti je AND grootenhuis pd'
  >>> query['field'] = 'auth'

  now get the search ids:
  >>> ids = GetSearchIds(query)
  >>> len(ids)
  2
  >>> ids[0]
  '11960484'
  >>> ids[1]
  '10893315'


  """
  conn = openURL(url, urllib.urlencode(query))
  pubmed = ElementTree.parse(conn)
  # every <Id> element in the result is one matching PubMed id
  res = [node.text for node in pubmed.getiterator('Id')]
  return tuple(res)
91
def GetSummaries(ids, query=None, conn=None, url=QueryParams.summaryBase):
  """ gets a set of document summary records for the ids provided

  >>> ids = ['11960484']
  >>> summs = GetSummaries(ids,conn=open(os.path.join(testDataDir,'summary.xml'),'r'))
  >>> len(summs)
  1
  >>> rec = summs[0]
  >>> isinstance(rec,Records.SummaryRecord)
  True
  >>> rec.PubMedId
  '11960484'
  >>> rec.Authors
  'Penzotti JE, Lamb ML, Evensen E, Grootenhuis PD'
  >>> rec.Title
  'A computational ensemble pharmacophore model for identifying substrates of P-glycoprotein.'
  >>> rec.Source
  'J Med Chem'
  >>> rec.Volume
  '45'
  >>> rec.Pages
  '1737-40'
  >>> rec.HasAbstract
  '1'

  """
  # normalize ids to our own list of strings.  Doing this unconditionally
  # (the original only did it when no conn was supplied) fixes two problems:
  #  - a scalar id now works even when conn is passed in
  #  - the remove() below no longer empties the *caller's* list
  try:
    iter(ids)
  except TypeError:
    ids = [ids]
  ids = [str(x) for x in ids]
  if not conn:
    if not query:
      query = QueryParams.details()
    query['id'] = ','.join(ids)
    conn = openURL(url, urllib.urlencode(query))
  pubmed = ElementTree.parse(conn)
  res = []
  for summary in pubmed.getiterator('DocSum'):
    rec = Records.SummaryRecord(summary)
    # only keep records we actually asked for; drop each id once matched
    # so duplicate DocSum entries can't be returned twice
    if rec.PubMedId in ids:
      res.append(rec)
      ids.remove(rec.PubMedId)

  return tuple(res)
137
def GetRecords(ids, query=None, conn=None, url=QueryParams.fetchBase):
  """ gets a set of full journal-article records for the ids provided

  >>> ids = ['11960484']
  >>> recs = GetRecords(ids,conn=open(os.path.join(testDataDir,'records.xml'),'r'))
  >>> len(recs)
  1
  >>> rec = recs[0]
  >>> rec.PubMedId
  '11960484'
  >>> rec.Authors
  u'Penzotti JE, Lamb ML, Evensen E, Grootenhuis PD'
  >>> rec.Title
  u'A computational ensemble pharmacophore model for identifying substrates of P-glycoprotein.'
  >>> rec.Source
  u'J Med Chem'
  >>> rec.Volume
  '45'
  >>> rec.Pages
  '1737-40'
  >>> rec.PubYear
  '2002'
  >>> rec.Abstract[:10]
  u'P-glycopro'

  We've also got access to keywords:
  >>> str(rec.keywords[0])
  'Combinatorial Chemistry Techniques'
  >>> str(rec.keywords[3])
  'Indinavir / chemistry'

  and chemicals:
  >>> rec.chemicals[0]
  'P-Glycoprotein'
  >>> rec.chemicals[2]
  'Nicardipine <55985-32-5>'


  """
  # normalize ids to strings up front so the membership test below works
  # even when the caller passed ints or a bare scalar together with conn
  try:
    iter(ids)
  except TypeError:
    ids = [ids]
  ids = [str(x) for x in ids]
  if not conn:
    if not query:
      query = QueryParams.details()
    query['id'] = ','.join(ids)
    conn = openURL(url, urllib.urlencode(query))

  pubmed = ElementTree.parse(conn)
  res = []
  for article in pubmed.getiterator('PubmedArticle'):
    rec = Records.JournalArticleRecord(article)
    # only keep articles that were actually requested
    if rec.PubMedId in ids:
      res.append(rec)
  return tuple(res)
194
def CheckForLinks(ids, query=None, conn=None, url=QueryParams.linkBase):
  """ checks whether the provided pubmed ids have "neighbor" links

    **Returns** a dict mapping each pubmed id (string) to its
      HasNeighbor value (as provided by Records.LinkRecord)
  """
  if not conn:
    try:
      iter(ids)
    except TypeError:
      ids = [ids]
    if not query:
      query = QueryParams.details()
    query['id'] = ','.join(map(str, ids))
    # Bug fix: 'cmd' must be in the query *before* the request is built;
    # it was previously assigned after openURL(), so it never reached the
    # server and the default elink command was used instead.
    query['cmd'] = 'ncheck'
    conn = openURL(url, urllib.urlencode(query))
  pubmed = ElementTree.parse(conn)

  checklist = pubmed.find('LinkSet/IdCheckList')
  recs = [Records.LinkRecord(x) for x in checklist.getiterator('Id')]

  res = {}
  for rec in recs:
    res[rec.PubMedId] = rec.HasNeighbor
  return res
216
def GetLinks(ids, query=None, conn=None, url=QueryParams.linkBase):
  """ gets the "neighbor" links (related articles) for the ids provided

    **Returns** a tuple of (pubmed id, score) 2-tuples; scores are
      normalized by the score of the first query id's own entry (when
      present), so the query article itself scores 1.0
  """
  # normalize ids to strings so the ids[0] comparison below matches the
  # string <Id> values coming back from the server
  try:
    iter(ids)
  except TypeError:
    ids = [ids]
  ids = [str(x) for x in ids]
  if not conn:
    if not query:
      query = QueryParams.details()
    query['id'] = ','.join(ids)
    # Bug fix: 'cmd' must be set *before* the request is built; it was
    # previously assigned after openURL(), so it never reached the server.
    query['cmd'] = 'neighbor'
    conn = openURL(url, urllib.urlencode(query))

  pubmed = ElementTree.parse(conn)
  linkset = pubmed.find('LinkSet/LinkSetDb')
  scores = []
  scoreNorm = 1.0
  for link in linkset.getiterator('Link'):
    linkId = link.findtext('Id')
    score = float(link.findtext('Score'))
    scores.append([linkId, score])
    # the query article usually appears in its own neighbor list; use its
    # score as the normalization factor
    if linkId == ids[0]:
      scoreNorm = score
  for i in range(len(scores)):
    linkId, score = scores[i]
    scores[i] = linkId, score / scoreNorm
  return tuple(scores)
244
245
246
247
248
249
def _test():
  """ runs this module's doctests; returns the (failed, attempted) result
    from doctest.testmod
  """
  import doctest, sys
  # run against __main__ so the doctests see testDataDir/os set up there
  return doctest.testmod(sys.modules["__main__"])
253
if __name__ == '__main__':
  import sys,os.path
  # testDataDir is referenced by the doctests above (summary.xml, records.xml)
  # so it must be a module-level global before _test() runs
  testDataDir = os.path.join(RDConfig.RDCodeDir,'Dbase','Pubmed','test_data')
  failed,tried = _test()
  # exit status = number of failed doctests (0 on success)
  sys.exit(failed)
259
260
261
262
263
264
# NOTE(review): everything below looks like leftover manual-debugging code;
# each section is disabled with `if 0:` and the __main__ block above calls
# sys.exit() first, so none of this ever runs.  Kept as usage examples
# (the *.xml files are test fixtures) -- consider deleting or moving to docs.
ids = ['11666868','11169640']
if 0:
  # demo: fetch summary records from a canned XML response
  summs = GetSummaries(ids,conn=open('summary.xml','r'))
  print('summs:',summs)
  for summary in summs:
    print(summary.Authors)
    print('\t',summary.Title)
    print('\t',summary.Source,end='')
    print(summary.Volume,end='')
    print(summary.Pages,end='')
    print(summary.PubDate)

if 0:
  # demo: fetch full article records from a canned XML response
  ids = ['11666868']
  res = GetRecords(ids,conn=open('records.xml','r'))
  for record in res:
    print(record.Authors)
    print('\t',record.Title)
    print('\t',record.Journal,end='')
    print(record.Volume,end='')
    print(record.Pages,end='')
    print(record.PubYear)
    print()

if 0:
  # demo: check which ids have neighbor links
  ids = ['11666868','11169640']
  res = CheckForLinks(ids,conn=open('haslinks.xml','r'))
  print(res)

if 0:
  # demo: print the ten highest-scoring neighbor links
  ids = ['11666868']
  res = GetLinks(ids,conn=open('links.xml','r'))

  for id,score in res[:10]:
    print(id,score)
300