我的 pandas DataFrame 中有一列术语,我想用这些术语逐个搜索 PubMed,以了解它们在文献中出现的频率。我正在尝试调整 Biopython 的 Entrez 代码来完成这项任务,但不确定如何继续,因为我使用的代码似乎只针对单个搜索词,而不是整个 DataFrame 列。下面是我目前的代码,运行时它会报错:HTTPError: HTTP Error 500: Internal Server Error。提前感谢您的建议。
import time

import pandas as pd
from Bio import Entrez
from Bio import Medline
# Contact address sent along with E-utilities requests; it must be a string.
# Unquoted, `myemail@mail.com` parses as `myemail @ mail.com` and raises NameError.
Entrez.email = "myemail@mail.com"
# return a list of the PMIDs that match your search term
def getPubMedIDs(searchstring, maxrecords):
    """Return the list of PMIDs that match one PubMed search term.

    Args:
        searchstring: A single PubMed query string. Pass one term per call;
            the value is handed to ``Entrez.esearch`` as ``term``.
        maxrecords: Upper bound on the number of PMIDs requested (``retmax``).

    Returns:
        The ``"IdList"`` entry of the parsed esearch result, or an empty list
        when ``maxrecords`` exceeds 100,000 (no request is made in that case).
    """
    if maxrecords > 100000:  # maximum possible = 100,000 records
        return []

    handle = Entrez.esearch(db="pubmed", term=searchstring, retmax=maxrecords)
    try:
        result = Entrez.read(handle)
    finally:
        # release the connection even when parsing the response fails
        handle.close()
    return result["IdList"]
# get MEDLINE data records for each PMID and store in a dataframe
# searchPMIDlist was created by getPubMedIDs function
# this will only get the first 10000 records from your PMID list
def getPubMeddata(searchPMIDlist, dataframename):
    """Fetch MEDLINE records for a list of PMIDs and add them to a dataframe.

    Args:
        searchPMIDlist: PMIDs to fetch, e.g. the list returned by
            ``getPubMedIDs``. Duplicates are dropped, keeping first-seen order.
        dataframename: DataFrame to extend with one row per fetched record.

    Returns:
        A DataFrame holding the rows of ``dataframename`` followed by one row
        per fetched record, with the columns ``PMID``, ``PT``, ``Title``,
        ``Abstract``, ``MeSH`` and ``AbLength``. ``dataframename`` itself is
        returned when there is nothing to add.
    """
    # dict.fromkeys removes duplicates without scrambling the order
    searchPMIDlist = list(dict.fromkeys(searchPMIDlist))
    if not searchPMIDlist:
        # nothing to fetch; do not send a request with an empty id list
        return dataframename

    # Rows are collected in a plain list and concatenated once at the end:
    # DataFrame.append copied the whole frame per row and is gone in pandas 2.0.
    rows = []
    fetchhandle = Entrez.efetch(db="pubmed", id=searchPMIDlist, rettype="medline", retmode="text")
    try:
        for record in Medline.parse(fetchhandle):
            if "PMID" not in record:  # no PMID in this record (rare), skip it
                continue
            abstract = record.get("AB", "")
            # you might want to collect different data for your purposes
            rows.append({'PMID': record["PMID"],
                         'PT': record.get("PT", ""),
                         'Title': record.get("TI", ""),
                         'Abstract': abstract,
                         'MeSH': record.get("MH", ""),
                         'AbLength': len(abstract)})
            # display progress after every 100 stored records
            if not len(rows) % 100:
                print(len(rows), "records processed")
    finally:
        # release the connection even when parsing fails part-way
        fetchhandle.close()

    # wait time between repeated fetches
    # NOTE(review): the original indentation was lost, so the exact position of
    # this pause is an assumption; it is applied once per fetch here.
    time.sleep(5)

    if not rows:
        return dataframename
    return pd.concat([dataframename, pd.DataFrame(rows)], ignore_index=True)
# allresultsdf is the dataframe containing the terms to search; 'New Term' is
# the column. Entrez.esearch takes ONE query string per call, so the column is
# searched term by term instead of being passed as a whole Series.
search_terms = allresultsdf['New Term'].dropna().astype(str).unique()

myPMIDlist = []   # every PMID found, across all terms
pmid_counts = {}  # term -> number of PMIDs returned (capped at the 3000 requested)
for searchstring in search_terms:
    term_pmids = getPubMedIDs(searchstring, 3000)
    pmid_counts[searchstring] = len(term_pmids)
    myPMIDlist.extend(term_pmids)
    print("Search string:", searchstring)
    print("Total PMIDs found:", len(term_pmids))
目前没有回答
相关问题 更多 >
编程相关推荐