class PubMedResources(object):
    """Download helpers built on the NCBI E-utilities ``efetch`` endpoint.

    Each public method builds an efetch query for a single record and saves
    the response body to a local file with ``urlretrieve``.
    """

    # NCBI serves E-utilities over HTTPS. The endpoint lives in one place so
    # that both retrieval methods always agree on it.
    EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

    @staticmethod
    def _normalize_id(record_id):
        """Return ``record_id`` as a stripped string.

        Raises:
            ValueError: if ``record_id`` is ``None`` or blank. Without this
                check the literal text "None" (or nothing) would be sent as
                the id and the error response saved to disk.
        """
        identifier = "" if record_id is None else str(record_id).strip()
        if not identifier:
            raise ValueError("a non-empty PubMed/PMC identifier is required")
        return identifier

    @staticmethod
    def _efetch_url(db, record_id, rettype=None):
        """Return the efetch URL for ``record_id`` in database ``db``.

        ``rettype`` is appended as a query parameter only when given.
        """
        identifier = PubMedResources._normalize_id(record_id)
        query = "{}?db={}&id={}".format(
            PubMedResources.EFETCH_URL, db, identifier
        )
        if rettype:
            query += "&rettype={}".format(rettype)
        return query

    @staticmethod
    def _download(db, record_id, outfile=None, rettype=None):
        """Fetch one record and save it; return the path that was written."""
        identifier = PubMedResources._normalize_id(record_id)
        query = PubMedResources._efetch_url(db, identifier, rettype)
        # Fall back to "<id>.nxml" in the current directory when no explicit
        # destination is supplied.
        nxml_file = outfile or "{}.nxml".format(identifier)
        urlretrieve(query, nxml_file)
        return nxml_file

    @staticmethod
    def retrieve_nxml_abstract(pmid, outfile=None):
        """Save the efetch ``rettype=abstract`` response for a PubMed id.

        Args:
            pmid: PubMed identifier to query in the ``pubmed`` database.
            outfile: destination path; defaults to ``"<pmid>.nxml"``.

        Returns:
            The path of the file that was written.

        Raises:
            ValueError: if ``pmid`` is ``None`` or blank.
        """
        return PubMedResources._download(
            "pubmed", pmid, outfile, rettype="abstract"
        )

    @staticmethod
    def retrieve_nxml_paper(pmcid, outfile=None):
        """Save the efetch response for a PMC id from the ``pmc`` database.

        Args:
            pmcid: PubMed Central identifier (e.g. ``"PMC1234"``).
            outfile: destination path; defaults to ``"<pmcid>.nxml"``.

        Returns:
            The path of the file that was written.

        Raises:
            ValueError: if ``pmcid`` is ``None`` or blank.
        """
        return PubMedResources._download("pmc", pmcid, outfile)