From 4252cb8c971beb47d68f0ccfd7ca2622ac0e8e8c Mon Sep 17 00:00:00 2001 From: scnerd Date: Tue, 14 Jul 2020 09:31:12 -0400 Subject: [PATCH] Fixes issue #1, a break in biorxiv id's that are given directly. Fixes some instabilities caused by stray whitespace or leading slashes. Reformatted code to be closer to PEP8. Updated tests to use Pytest. Fixed requirements.txt, which was missing the ``requests`` package. --- .gitignore | 3 + refid2bib/__init__.py | 36 ++++---- refid2bib/__main__.py | 12 ++- refid2bib/core.py | 195 ++++++++++++++++++++++-------------------- requirements.txt | 3 +- test_refid2bib.py | 72 +++++++++++----- 6 files changed, 185 insertions(+), 136 deletions(-) diff --git a/.gitignore b/.gitignore index 894a44c..9e5e6d4 100644 --- a/.gitignore +++ b/.gitignore @@ -102,3 +102,6 @@ venv.bak/ # mypy .mypy_cache/ + +# Pycharm +.idea \ No newline at end of file diff --git a/refid2bib/__init__.py b/refid2bib/__init__.py index 35ece80..d878607 100644 --- a/refid2bib/__init__.py +++ b/refid2bib/__init__.py @@ -1,33 +1,35 @@ -from refid2bib.core import get_biorxiv_bibtex, get_doi_bibtex, get_arxiv_bibtex, get_pmid_bibtex import re +from refid2bib.core import get_biorxiv_bibtex, get_doi_bibtex, get_arxiv_bibtex, get_pmid_bibtex + bibtex_functions = { - 'biorxiv':get_biorxiv_bibtex, - 'doi':get_doi_bibtex, - 'arxiv':get_arxiv_bibtex, - 'pmid':get_pmid_bibtex} + 'biorxiv': get_biorxiv_bibtex, + 'doi': get_doi_bibtex, + 'arxiv': get_arxiv_bibtex, + 'pmid': get_pmid_bibtex +} -def refid2bib(ref, short_name=None, lastname_first=True, ref_type=None): - tests = {'doi':'^doi: ?|^https?://doi.org/|^https?://dx.doi.org/', - 'biorxiv':'^biorxiv ?|^biorxiv:', - 'arxiv': '^arxiv:|^https?://arxiv.org/abs/', - 'pmid': '^pmid: ?|^(?=pmc\d*)', +def refid2bib(ref, short_name=None, lastname_first=True, ref_type=None): + tests = { + 'doi': r'^doi: ?|^https?://doi.org/|^https?://dx.doi.org/', + 'biorxiv': r'^biorxiv:|^biorxiv ?', + 'arxiv': r'^arxiv:|^https?://arxiv.org/abs/', + 
'pmid': r'^pmid: ?|^(?=pmc\d*)', } - + if ref_type in tests.keys(): - query = re.search( tests[ref_type], ref.lower() ) + query = re.search(tests[ref_type], ref.lower()) oid = ref[query.span()[1]:] else: for c, q in tests.items(): - query = re.search( q, ref.lower() ) + query = re.search(q, ref.lower()) if query is not None: oid = ref[query.span()[1]:] ref_type = c break - else: - raise ValueError( 'Cannot assign reference type for {}'.format(ref) ) + else: + raise ValueError('Cannot assign reference type for {}'.format(ref)) + oid = oid.lstrip('/').strip() return bibtex_functions[ref_type](oid, short_name=short_name, lastname_first=lastname_first) - - diff --git a/refid2bib/__main__.py b/refid2bib/__main__.py index b1947af..0f5b61d 100644 --- a/refid2bib/__main__.py +++ b/refid2bib/__main__.py @@ -1,6 +1,10 @@ -import sys from refid2bib import refid2bib -oid = sys.argv[1] -print( '\n') -print( refid2bib(oid) ) + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('document_id', help='DOI, ArXiv ID, etc. 
of the document you wish to look up') + args = parser.parse_args() + + print(refid2bib(args.document_id)) diff --git a/refid2bib/core.py b/refid2bib/core.py index 72691f1..78e6ee1 100644 --- a/refid2bib/core.py +++ b/refid2bib/core.py @@ -1,48 +1,52 @@ -import requests -import re import json +import re + import feedparser +import requests from nameparser import HumanName +pmid_url = 'https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids={};format=json' + -pmid_url='https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids={};format=json' -def pmid_formatter( pmid, pmid_base=pmid_url): +def pmid_formatter(pmid, pmid_base=pmid_url): return pmid_base.format(pmid) -def get_pmid_doi( pmid ): - data = requests.get( pmid_formatter(pmid) ) +def get_pmid_doi(pmid): + data = requests.get(pmid_formatter(pmid)) data.raise_for_status() data_dict = json.loads(data.text) return data_dict['records'][0]['doi'] -def get_pmid_bibtex( pmid, short_name=None, lastname_first=True ): +def get_pmid_bibtex(pmid, short_name=None, lastname_first=True): doi = get_pmid_doi(pmid) - return get_doi_bibtex( doi, short_name=short_name, lastname_first=lastname_first ) + return get_doi_bibtex(doi, short_name=short_name, lastname_first=lastname_first) + + +doi_url = 'http://dx.doi.org' -doi_url='http://dx.doi.org' -def doi_formatter( doi, doi_base=doi_url ): - return '/'.join((doi_base,doi)) +def doi_formatter(doi, doi_base=doi_url): + return '/'.join((doi_base, doi)) -def get_doi_bibtex( doi, short_name=None, lastname_first = True ): - #To do, better exception handling on the request - header={'Accept': 'application/x-bibtex'} - data=requests.get(doi_formatter(doi), headers=header) +def get_doi_bibtex(doi, short_name=None, lastname_first=True): + # To do, better exception handling on the request + header = {'Accept': 'application/x-bibtex'} + data = requests.get(doi_formatter(doi), headers=header) data.raise_for_status() bibtex = data.text + "\n" if lastname_first: - bibtex = invert_author_names( 
bibtex ) + bibtex = invert_author_names(bibtex) - is_biorxiv, biorxiv_id = doi_is_biorxiv( doi ) + is_biorxiv, biorxiv_id = doi_is_biorxiv(doi) if is_biorxiv: - bibtex = fix_biorxiv_info( bibtex, biorxiv_id ) - return replace_short_name( bibtex, short_name ) + bibtex = fix_biorxiv_info(bibtex, biorxiv_id) + return replace_short_name(bibtex, short_name) -def invert_author_names( bibtex ): +def invert_author_names(bibtex): author_test = re.search("author = {(?P<author_str>.*)}", bibtex) author_str_original = author_test.groupdict()['author_str'] author_str_inverted = parse_authors(author_str_original) @@ -50,43 +54,47 @@ def invert_author_names( bibtex ): arxiv_url = "http://export.arxiv.org/api/query" -def get_arxiv_bibtex( arxiv_number, short_name=None, lastname_first=True): - params={'id_list':arxiv_number} - atom_data=requests.get(arxiv_url, params=params) - data=feedparser.parse(atom_data.text) + + +def get_arxiv_bibtex(arxiv_number, short_name=None, lastname_first=True): + params = {'id_list': arxiv_number} + atom_data = requests.get(arxiv_url, params=params) + data = feedparser.parse(atom_data.text) if 'id' in data['entries'][0]: - bibtex = parse_arxiv_bibtex( data, lastname_first=lastname_first ) + bibtex = parse_arxiv_bibtex(data, lastname_first=lastname_first) else: raise Exception('Arxiv ID not found') return replace_short_name(bibtex, short_name) biorxiv_doi = "10.1101/{bid}" -def biorxiv_doi_formatter( biorxiv_id, biorxiv_base=biorxiv_doi ): + + +def biorxiv_doi_formatter(biorxiv_id, biorxiv_base=biorxiv_doi): return biorxiv_base.format(bid=biorxiv_id) -def get_biorxiv_bibtex( biorxiv_id, short_name=None, lastname_first=True): - bibtex = get_doi_bibtex( biorxiv_doi_formatter(biorxiv_id), - short_name=short_name, - lastname_first=lastname_first ) +def get_biorxiv_bibtex(biorxiv_id, short_name=None, lastname_first=True): + bibtex = get_doi_bibtex(biorxiv_doi_formatter(biorxiv_id), + short_name=short_name, + lastname_first=lastname_first) return bibtex -def 
fix_biorxiv_info( bibtex, biorxiv_id ): +def fix_biorxiv_info(bibtex, biorxiv_id): biorxiv_info = "\teprinttype={{bioRxiv}},\n" \ "\teprint={{{bid}}},\n" \ "\thowpublished={{{hp}}}\n}}\n".format(bid=biorxiv_id, - hp='bioRxiv doi:10.1101/{}'.format(biorxiv_id)) - bibtex = bibtex.replace('@article{','@online{',1) + hp='bioRxiv doi:10.1101/{}'.format(biorxiv_id)) + bibtex = bibtex.replace('@article{', '@online{', 1) return bibtex[:-3] + ',\n' + biorxiv_info -def doi_is_biorxiv( doi ): - biorxiv_regex = "(?P<doi_prefix>10\.\d{4,}\.?\d*)\/(?P<doi_suffix>.*)" - biorxiv_doi_test = re.search( biorxiv_regex, doi ) - if ( biorxiv_doi_test.groupdict()['doi_prefix'] == '10.1101' ) and \ - ( re.match('^\d*$', biorxiv_doi_test.groupdict()['doi_suffix']) is not None ): +def doi_is_biorxiv(doi): + biorxiv_regex = r"(?P<doi_prefix>10\.\d{4,}\.?\d*)\/(?P<doi_suffix>.*)" + biorxiv_doi_test = re.search(biorxiv_regex, doi) + if (biorxiv_doi_test.groupdict()['doi_prefix'] == '10.1101') and \ + (re.match(r'^\d*$', biorxiv_doi_test.groupdict()['doi_suffix']) is not None): return True, biorxiv_doi_test.groupdict()['doi_suffix'] else: return False, None @@ -96,13 +104,13 @@ def replace_short_name(bibtex, short_name): if short_name is not None: oldreg = re.search('^@.*{(?P<old_name>.*),', bibtex) old_name = oldreg.groupdict()['old_name'] - bibtex = bibtex.replace( old_name, short_name, 1) + bibtex = bibtex.replace(old_name, short_name, 1) return bibtex -def parse_authors( authors, lastname_first=True ): +def parse_authors(authors, lastname_first=True): if isinstance(authors, list): - author_list=authors + author_list = authors elif isinstance(authors, str): author_list = authors.split(' and ') else: @@ -113,34 +121,34 @@ def parse_authors( authors, lastname_first=True ): first_names = [] for name in name_list: if len(name.suffix) > 0: - last_names.append( '{l} {s}'.format(l=name.last, s=name.suffix) ) + last_names.append('{l} {s}'.format(l=name.last, s=name.suffix)) else: last_names.append('{l}'.format(l=name.last)) if len(name.middle) > 0: - 
first_names.append( '{f} {m}'.format(f=name.first, m=name.middle)) + first_names.append('{f} {m}'.format(f=name.first, m=name.middle)) else: - first_names.append( '{f}'.format(f=name.first)) + first_names.append('{f}'.format(f=name.first)) if lastname_first: name_str = ' and '.join(['{}, {}'.format(*name) for name in zip(last_names, first_names)]) else: - name_str = ' and '.join(['{} {}'.format(*name) for name in zip(first_names, last_names)]) + name_str = ' and '.join(['{} {}'.format(*name) for name in zip(first_names, last_names)]) return name_str month_map = { - 1:'Jan', - 2:'Feb', - 3:'Mar', - 4:'Apr', - 5:'May', - 6:'Jun', - 7:'Jul', - 8:'Aug', - 9:'Sep', - 10:'Oct', - 11:'Nov', - 12:'Dec' - } + 1: 'Jan', + 2: 'Feb', + 3: 'Mar', + 4: 'Apr', + 5: 'May', + 6: 'Jun', + 7: 'Jul', + 8: 'Aug', + 9: 'Sep', + 10: 'Oct', + 11: 'Nov', + 12: 'Dec' +} def format_bibtex_entry(short_name, @@ -153,35 +161,36 @@ def format_bibtex_entry(short_name, article_url): bibtex_header = '@online{' bibtex_base = "{header}{short_name},\n" \ - "\tauthor = {{{authors}}},\n" \ - "\ttitle = {{{title}}},\n" \ - "\tyear = {{{year}}},\n" \ - "\tmonth = {{{month}}},\n" \ - "\teprinttype = {{{eprinttype}}},\n" \ - "\teprint = {{{eprint}}},\n" \ - "\thowpublished = {{{how_published}}},\n" \ - "\turl = {{{url}}}\n}}\n" - bibtex = bibtex_base.format( header=bibtex_header, - short_name=short_name, - authors=authors, - title=title, - year=year, - month=month, - eprinttype=eprinttype, - eprint=eprint, - how_published=':'.join((eprinttype, eprint)), - url=article_url) + "\tauthor = {{{authors}}},\n" \ + "\ttitle = {{{title}}},\n" \ + "\tyear = {{{year}}},\n" \ + "\tmonth = {{{month}}},\n" \ + "\teprinttype = {{{eprinttype}}},\n" \ + "\teprint = {{{eprint}}},\n" \ + "\thowpublished = {{{how_published}}},\n" \ + "\turl = {{{url}}}\n}}\n" + bibtex = bibtex_base.format(header=bibtex_header, + short_name=short_name, + authors=authors, + title=title, + year=year, + month=month, + eprinttype=eprinttype, + 
eprint=eprint, + how_published=':'.join((eprinttype, eprint)), + url=article_url) return bibtex -def parse_arxiv_bibtex( data, lastname_first=True ): - #todo: exceptions versioning + +def parse_arxiv_bibtex(data, lastname_first=True): + # todo: exceptions versioning ind = 0 - + title = data['entries'][ind]['title'] - eprinttype='arXiv' + eprinttype = 'arXiv' id_parser = re.search('http://arxiv.org/abs/(?P<id_str>.*)', data['entries'][ind]['id']) - eprint=id_parser.groupdict()['id_str'] + eprint = id_parser.groupdict()['id_str'] author_list = [author['name'] for author in data['entries'][ind]['authors']] authors = parse_authors(author_list, lastname_first=lastname_first) @@ -193,19 +202,19 @@ def parse_arxiv_bibtex( data, lastname_first=True ): first_author = HumanName(author_list[0]) if len(first_author.suffix) > 0: - default_short_name = '_'.join( (first_author.last.replace(' ',"_"), - first_author.middle.replace(' ',"_"), - str(year)) ) + default_short_name = '_'.join((first_author.last.replace(' ', "_"), + first_author.middle.replace(' ', "_"), + str(year))) else: - default_short_name = '_'.join( (first_author.last.replace(' ',"_"), - str(year)) ) + default_short_name = '_'.join((first_author.last.replace(' ', "_"), + str(year))) bibtex = format_bibtex_entry(default_short_name, - authors, - title, - year, - month, - eprinttype, - eprint, - article_url) + authors, + title, + year, + month, + eprinttype, + eprint, + article_url) return bibtex diff --git a/requirements.txt b/requirements.txt index 83e0a64..a1ed863 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ feedparser -nameparser \ No newline at end of file +nameparser +requests diff --git a/test_refid2bib.py b/test_refid2bib.py index c0ba960..5b35af1 100644 --- a/test_refid2bib.py +++ b/test_refid2bib.py @@ -1,31 +1,61 @@ from refid2bib import refid2bib -doi_biorxiv = "https://doi.org/10.1101/406314" -arxiv_id = "arXiv:1801.04381" -doi_general = "DOI: 10.1126/science.1260088" -pmid = 'PMID: 
25056931' -pmc_id = 'PMC3711719' -print( 'DOI example:') -print( refid2bib(doi_general) ) +def test_doi(): + """DOI example:""" + assert refid2bib("DOI: 10.1126/science.1260088") is not None -print( 'Arxiv example:') -print( refid2bib(arxiv_id) ) -print( 'Biorxiv example:') -print( refid2bib(doi_biorxiv) ) +def test_arxiv(): + """Arxiv example:""" + assert refid2bib("arXiv:1801.04381") is not None -print( 'pmid example:') -print( refid2bib(pmid) ) -print( 'PMC example:') -print( refid2bib(pmc_id) ) +def test_arxiv2(): + """Arxiv example:""" + assert refid2bib('arxiv://1907.10138') is not None -print( 'Doi example, specified reference type:') -print( refid2bib(doi_general, ref_type='doi') ) -print( 'Arxiv example, custom short name:') -print( refid2bib(arxiv_id, short_name='my_custom_name' ) ) +def test_biorxiv(): + """Biorxiv example:""" + assert refid2bib("https://doi.org/10.1101/406314") is not None -print( 'General example, switched name order:') -print( refid2bib(doi_general, lastname_first=False ) ) + +def test_biorxiv2(): + """Biorxiv example: tests regression of bugfix for issue #1""" + """Hello! I've been using this library for a while, and I just started testing some functionality I hadn't needed + when I noticed that the biorxiv id doesn't currently work as described on the README. If I enter an id expression + with a colon in it like "biorxiv:570689", I get an error because the code ends up constructing a DOI with colon + in it. I think this is because the regular expression 'biorxiv':'^biorxiv ?|^biorxiv:' in the tests always + matches the first expression before the second one. It might just work to swap the two cases in the regex. 
""" + assert refid2bib("bioRxiv: 464909") is not None + + +def test_pmid(): + """pmid example:""" + assert refid2bib('PMID: 25056931') is not None + + +def test_pmc(): + """PMC example:""" + assert refid2bib('PMC3711719') is not None + + +def test_doi_explicit(): + """Doi example, specified reference type:""" + assert refid2bib("DOI: 10.1126/science.1260088", ref_type='doi') is not None + + +def test_arxiv_custom_short_name(): + """Arxiv example, custom short name:""" + assert refid2bib("arXiv:1801.04381", short_name='my_custom_name') is not None + + +def test_switching_name_order(): + """General example, switched name order:""" + assert refid2bib("DOI: 10.1126/science.1260088", lastname_first=False) is not None + + +if __name__ == '__main__': + import pytest + pytest.main()