https://github.com/FedericoV/eLife_Editorial_Process
Tip revision: aa2207a244010095d33095161954b8a0895e58fd authored by Federico Vaggi on 14 October 2016, 13:26:10 UTC
Changes for 2nd version of paper
Changes for 2nd version of paper
Tip revision: aa2207a
eLife_parsing.py
from bs4 import BeautifulSoup
import requests
import os
import time
import pandas as pd
def get_citations(ms_no):
doi_url = 'http://dx.doi.org/10.7554/eLife.%05d' % ms_no
r = requests.get(doi_url, allow_redirects=True)
volume = r.url.split('/')[4] # volume number
elife_url = "http://elifesciences.org/content/%s/e%05d/article-metrics" % (volume, ms_no)
r = requests.get(elife_url)
soup = BeautifulSoup(r.text)
metric_values = []
for value in soup.find_all("span", {"class": "metric-value"}):
try:
metric_values.append(int(value.text))
except ValueError:
metric_values.append(0)
# Get the name of the metric
metric_names = []
for value in soup.find_all("span", {"class": "metric-name"}):
if value.a is not None:
# get url:
name = value.a['href'].split('/')[2]
else:
name = value.text
metric_names.append(name)
return zip(metric_names, metric_values)
if __name__ == "__main__":
data_dir = os.path.join(os.getcwd(), 'data')
decisions_df = pd.read_csv(os.path.join(data_dir, 'Decisions.csv'),
index_col=0)
citations_dict = {}
for ms_no, values in decisions_df.iterrows():
print (ms_no)
decision = values['Decision_Type']
if decision == 'Accept Full Submission':
citations_dict[ms_no] = get_citations(ms_no)
for ms_no, cit in citations_dict.items():
citations_dict[int(ms_no)] = dict(cit)
citations = pd.DataFrame(citations_dict).T
citations.rename(columns={'www.ncbi.nlm.nih.gov': 'Pubmed_Citations',
'www.scopus.com': 'Scopus'}, inplace=True)
citations_fp = 'Citations_{}.csv'.format(time.strftime("%d-%m-%Y"))
citations_fp = os.path.join(data_dir, citations_fp)
citations.to_csv(citations_fp)