I am working on a PubMed project where I need to extract the IDs of free full text and free PMC articles. This is my code:
import requests
from bs4 import BeautifulSoup
from Bio import Entrez

Entrez.email = "[email protected]"  # Always tell NCBI who you are

handle = Entrez.esearch(db="pubmed", term="cough")
record = Entrez.read(handle)
count = record['Count']
handle = Entrez.esearch(db="pubmed", term="cough", retmax=count)
record = Entrez.read(handle)

free_article_ids = []
for id_ in record['IdList']:
    req = requests.get(f"https://www.ncbi.nlm.nih.gov/pubmed/{id_}")
    soup = BeautifulSoup(req.text, 'lxml')
    status = soup.find('span', {'class': 'status_icon'})
    if status is None:
        continue
    elif status.text in ["Free full text", "Free PMC Article"]:
        free_article_ids.append(id_)
print(free_article_ids)
The problem with my code is that it takes far too long to return the result, and I want to speed the process up. How do I do that?
Use multithreading to download the pages concurrently. I recommend a simple framework; a plain standard-library sketch is also included after the framework example for comparison.
from Bio import Entrez
from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain

class MySpider(Spider):
    name = 'ncbi.nlm.nih.gov'
    start_urls = []

    def __init__(self):
        Entrez.email = "[email protected]"  # Always tell NCBI who you are
        handle = Entrez.esearch(db="pubmed", term="cough")
        record = Entrez.read(handle)
        count = record['Count']
        handle = Entrez.esearch(db="pubmed", term="cough", retmax=count)
        record = Entrez.read(handle)
        for id_ in record['IdList']:
            self.start_urls.append(f"https://www.ncbi.nlm.nih.gov/pubmed/{id_}")
        Spider.__init__(self, self.name)  # necessary

    free_article_ids = []

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        status = doc.select('span.status_icon')
        if status and status.text in ["Free full text", "Free PMC Article"]:
            id = url.split('/')[-1]
            self.free_article_ids.append(id)
            return {"Urls": [], "Data": {"id": id}}
        return True

SimplifiedMain.startThread(MySpider())
There are more examples here: https://github.com/yiyedata/simplified-scrapy-demo
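If you would rather keep the requests/BeautifulSoup approach, the same multithreading idea can be sketched with only the standard library's concurrent.futures. This is a minimal sketch, assuming the status_icon markup from the question's code still applies; the max_workers value is an arbitrary choice, so keep it modest to avoid hammering NCBI's servers.

from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup
from Bio import Entrez

Entrez.email = "[email protected]"  # Always tell NCBI who you are

handle = Entrez.esearch(db="pubmed", term="cough")
record = Entrez.read(handle)
count = record['Count']
handle = Entrez.esearch(db="pubmed", term="cough", retmax=count)
record = Entrez.read(handle)

def check_free(id_):
    # Return the ID if the article page is marked free, otherwise None.
    req = requests.get(f"https://www.ncbi.nlm.nih.gov/pubmed/{id_}")
    soup = BeautifulSoup(req.text, 'lxml')
    status = soup.find('span', {'class': 'status_icon'})
    if status is not None and status.text in ["Free full text", "Free PMC Article"]:
        return id_
    return None

# Download and parse the pages concurrently instead of one at a time.
with ThreadPoolExecutor(max_workers=10) as executor:
    results = list(executor.map(check_free, record['IdList']))

free_article_ids = [id_ for id_ in results if id_ is not None]
print(free_article_ids)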