Package rdkit :: Package Dbase :: Package Pubmed :: Module Searches
[hide private]
[frames] | [no frames]

Source Code for Module rdkit.Dbase.Pubmed.Searches

  1  # $Id$ 
  2  # 
  3  # Copyright (C) 2003-2006 Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved @@ 
  6  #  This file is part of the RDKit. 
  7  #  The contents are covered by the terms of the BSD license 
  8  #  which is included in the file license.txt, found at the root 
  9  #  of the RDKit source tree. 
 10  # 
 11  """ Tools for doing PubMed searches and processing the results 
 12   
 13  NOTE: much of the example code in the documentation here uses XML 
 14  files from the test_data directory in order to avoid having to call 
 15  out to PubMed itself.  Actual calls to the functions would not include 
 16  the _conn_ argument. 
 17   
 18  """ 
 19  from __future__ import print_function 
 20  from rdkit import RDConfig 
 21  import QueryParams,Records 
 22  import urllib,urllib2 
 23  from xml.etree import ElementTree 
 24   
def openURL(url, args):
    """ opens a connection to a URL and returns it

    Arguments:
      - url: the URL to open
      - args: the request data to send along; the callers in this module
        pass a string built with urllib.urlencode()

    Returns: the file-like response object produced by urllib2.urlopen()

    NOTE: the request is made with urllib2's default opener, so whatever
    proxy configuration that opener picks up is used. No custom opener is
    built or installed here.

    """
    return urllib2.urlopen(url, args)
30
def GetNumHits(query, url=QueryParams.searchBase):
    """ returns the number of hits (an int) for the query provided

    The query passed in is not modified, so it can be reused afterwards
    (e.g. with GetSearchIds()).

    To do a search, we need a query object:
    >>> query = QueryParams.details()

    set up the search parameters:
    >>> query['term'] = 'penzotti je AND grootenhuis pd'
    >>> query['field'] = 'auth'

    now get the search ids:
    >>> counts = GetNumHits(query)
    >>> counts
    2

    alternately, we can search using field specifiers:
    >>> query = QueryParams.details()
    >>> query['term'] = 'penzotti je[au] AND hydrogen bonding[mh]'
    >>> counts = GetNumHits(query)
    >>> counts
    3


    """
    # Work on a copy: setting rettype on the caller's query would leave it
    # in "count" mode for any later search made with the same object.
    # (assumes query is a mapping, as the item assignment above implies)
    params = dict(query)
    params['rettype'] = 'count'
    conn = openURL(url, urllib.urlencode(params))
    try:
        pubmed = ElementTree.parse(conn)
    finally:
        # the connection was opened here, so release it here
        conn.close()
    countText = pubmed.findtext('Count')
    # a missing or empty <Count> element is reported as zero hits
    return int(countText) if countText else 0
64 65
def GetSearchIds(query, url=QueryParams.searchBase):
    """ returns a tuple of pubmed ids (strings) for the query provided

    To do a search, we need a query object:
    >>> query = QueryParams.details()

    set up the search parameters:
    >>> query['term'] = 'penzotti je AND grootenhuis pd'
    >>> query['field'] = 'auth'

    now get the search ids:
    >>> ids = GetSearchIds(query)
    >>> len(ids)
    2
    >>> ids[0]
    '11960484'
    >>> ids[1]
    '10893315'


    """
    conn = openURL(url, urllib.urlencode(query))
    try:
        pubmed = ElementTree.parse(conn)
    finally:
        # the connection was opened here, so release it here
        conn.close()
    # one entry per <Id> element, in document order
    return tuple(node.text for node in pubmed.getiterator('Id'))
91
def GetSummaries(ids, query=None, url=QueryParams.summaryBase, conn=None):
    """ gets a set of document summary records for the ids provided

    Arguments:
      - ids: a single id or an iterable of ids; each one is converted to
        a string before use. The object passed in is not modified.
      - query: (optional) query parameters to use for the request; only
        consulted when no conn is provided. The object passed in is not
        modified.
      - url: the URL to send the request to when no conn is provided
      - conn: (optional) a file-like object to read the XML from instead
        of making a request. It is not closed here.

    Returns: a tuple of Records.SummaryRecord objects for those DocSum
    elements whose PubMedId was requested. Each entry in ids is matched
    by at most one record.

    >>> ids = ['11960484']
    >>> summs = GetSummaries(ids,conn=open(os.path.join(testDataDir,'summary.xml'),'r'))
    >>> len(summs)
    1
    >>> rec = summs[0]
    >>> isinstance(rec,Records.SummaryRecord)
    True
    >>> rec.PubMedId
    '11960484'
    >>> rec.Authors
    'Penzotti JE, Lamb ML, Evensen E, Grootenhuis PD'
    >>> rec.Title
    'A computational ensemble pharmacophore model for identifying substrates of P-glycoprotein.'
    >>> rec.Source
    'J Med Chem'
    >>> rec.Volume
    '45'
    >>> rec.Pages
    '1737-40'
    >>> rec.HasAbstract
    '1'

    """
    # Normalise ids up front, whether or not a conn was provided: the
    # filtering below needs a list of strings in both cases.
    try:
        iter(ids)
    except TypeError:
        ids = [ids, ]
    # private list; entries are removed as they are matched, so the
    # caller's sequence must not be used directly
    remaining = [str(x) for x in ids]

    openedHere = not conn
    if openedHere:
        if not query:
            query = QueryParams.details()
        else:
            # copy so that the caller's query does not pick up our 'id'
            query = dict(query)
        query['id'] = ','.join(remaining)
        conn = openURL(url, urllib.urlencode(query))
    try:
        pubmed = ElementTree.parse(conn)
    finally:
        # only close what we opened; a caller-supplied conn stays theirs
        if openedHere:
            conn.close()

    res = []
    for summary in pubmed.getiterator('DocSum'):
        rec = Records.SummaryRecord(summary)
        if rec.PubMedId in remaining:
            res.append(rec)
            remaining.remove(rec.PubMedId)

    return tuple(res)
137
def GetRecords(ids, query=None, url=QueryParams.fetchBase, conn=None):
    """ gets a set of journal article records for the ids provided

    Arguments:
      - ids: a single id or an iterable of ids; each one is converted to
        a string before use
      - query: (optional) query parameters to use for the request; only
        consulted when no conn is provided. The object passed in is not
        modified.
      - url: the URL to send the request to when no conn is provided
      - conn: (optional) a file-like object to read the XML from instead
        of making a request. It is not closed here.

    Returns: a tuple of Records.JournalArticleRecord objects for those
    PubmedArticle elements whose PubMedId was requested.

    >>> ids = ['11960484']
    >>> recs = GetRecords(ids,conn=open(os.path.join(testDataDir,'records.xml'),'r'))
    >>> len(recs)
    1
    >>> rec = recs[0]
    >>> rec.PubMedId
    '11960484'
    >>> rec.Authors
    u'Penzotti JE, Lamb ML, Evensen E, Grootenhuis PD'
    >>> rec.Title
    u'A computational ensemble pharmacophore model for identifying substrates of P-glycoprotein.'
    >>> rec.Source
    u'J Med Chem'
    >>> rec.Volume
    '45'
    >>> rec.Pages
    '1737-40'
    >>> rec.PubYear
    '2002'
    >>> rec.Abstract[:10]
    u'P-glycopro'

    We've also got access to keywords:
    >>> str(rec.keywords[0])
    'Combinatorial Chemistry Techniques'
    >>> str(rec.keywords[3])
    'Indinavir / chemistry'

    and chemicals:
    >>> rec.chemicals[0]
    'P-Glycoprotein'
    >>> rec.chemicals[2]
    'Nicardipine <55985-32-5>'


    """
    # Normalise ids up front, whether or not a conn was provided: the
    # membership test below compares against rec.PubMedId, so the ids
    # need to be a list of strings in both cases.
    try:
        iter(ids)
    except TypeError:
        ids = [ids, ]
    wanted = [str(x) for x in ids]

    openedHere = not conn
    if openedHere:
        if not query:
            query = QueryParams.details()
        else:
            # copy so that the caller's query does not pick up our 'id'
            query = dict(query)
        query['id'] = ','.join(wanted)
        conn = openURL(url, urllib.urlencode(query))
    try:
        pubmed = ElementTree.parse(conn)
    finally:
        # only close what we opened; a caller-supplied conn stays theirs
        if openedHere:
            conn.close()

    res = []
    for article in pubmed.getiterator('PubmedArticle'):
        rec = Records.JournalArticleRecord(article)
        if rec.PubMedId in wanted:
            res.append(rec)
    return tuple(res)
194 216 244 245 246 #------------------------------------ 247 # 248 # doctest boilerplate 249 #
def _test():
    """ runs the doctests of the module registered as "__main__"

    Returns: whatever doctest.testmod() returns for that module

    """
    import doctest
    import sys
    mainModule = sys.modules["__main__"]
    return doctest.testmod(mainModule)


if __name__ == '__main__':
    # `os` and `testDataDir` are bound at module level on purpose: the
    # doctests in GetSummaries() and GetRecords() refer to both names.
    import os.path
    import sys
    testDataDir = os.path.join(RDConfig.RDCodeDir, 'Dbase', 'Pubmed', 'test_data')
    failed, tried = _test()
    # the number of failed doctests becomes the exit status
    sys.exit(failed)