diff --git a/.gitignore b/.gitignore index 894a44c..9e5e6d4 100644 --- a/.gitignore +++ b/.gitignore @@ -102,3 +102,6 @@ venv.bak/ # mypy .mypy_cache/ + +# Pycharm +.idea \ No newline at end of file diff --git a/refid2bib/__init__.py b/refid2bib/__init__.py index 35ece80..d878607 100644 --- a/refid2bib/__init__.py +++ b/refid2bib/__init__.py @@ -1,33 +1,35 @@ -from refid2bib.core import get_biorxiv_bibtex, get_doi_bibtex, get_arxiv_bibtex, get_pmid_bibtex import re +from refid2bib.core import get_biorxiv_bibtex, get_doi_bibtex, get_arxiv_bibtex, get_pmid_bibtex + bibtex_functions = { - 'biorxiv':get_biorxiv_bibtex, - 'doi':get_doi_bibtex, - 'arxiv':get_arxiv_bibtex, - 'pmid':get_pmid_bibtex} + 'biorxiv': get_biorxiv_bibtex, + 'doi': get_doi_bibtex, + 'arxiv': get_arxiv_bibtex, + 'pmid': get_pmid_bibtex +} -def refid2bib(ref, short_name=None, lastname_first=True, ref_type=None): - tests = {'doi':'^doi: ?|^https?://doi.org/|^https?://dx.doi.org/', - 'biorxiv':'^biorxiv ?|^biorxiv:', - 'arxiv': '^arxiv:|^https?://arxiv.org/abs/', - 'pmid': '^pmid: ?|^(?=pmc\d*)', +def refid2bib(ref, short_name=None, lastname_first=True, ref_type=None): + tests = { + 'doi': r'^doi: ?|^https?://doi.org/|^https?://dx.doi.org/', + 'biorxiv': r'^biorxiv:|^biorxiv ?', + 'arxiv': r'^arxiv:|^https?://arxiv.org/abs/', + 'pmid': r'^pmid: ?|^(?=pmc\d*)', } - + if ref_type in tests.keys(): - query = re.search( tests[ref_type], ref.lower() ) + query = re.search(tests[ref_type], ref.lower()) oid = ref[query.span()[1]:] else: for c, q in tests.items(): - query = re.search( q, ref.lower() ) + query = re.search(q, ref.lower()) if query is not None: oid = ref[query.span()[1]:] ref_type = c break - else: - raise ValueError( 'Cannot assign reference type for {}'.format(ref) ) + else: + raise ValueError('Cannot assign reference type for {}'.format(ref)) + oid = oid.lstrip('/').strip() return bibtex_functions[ref_type](oid, short_name=short_name, lastname_first=lastname_first) - - diff --git 
a/refid2bib/__main__.py b/refid2bib/__main__.py index b1947af..0f5b61d 100644 --- a/refid2bib/__main__.py +++ b/refid2bib/__main__.py @@ -1,6 +1,10 @@ -import sys from refid2bib import refid2bib -oid = sys.argv[1] -print( '\n') -print( refid2bib(oid) ) + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('document_id', help='DOI, ArXiv ID, etc. of the document you wish to look up') + args = parser.parse_args() + + print(refid2bib(args.document_id)) diff --git a/refid2bib/core.py b/refid2bib/core.py index 72691f1..78e6ee1 100644 --- a/refid2bib/core.py +++ b/refid2bib/core.py @@ -1,48 +1,52 @@ -import requests -import re import json +import re + import feedparser +import requests from nameparser import HumanName +pmid_url = 'https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids={};format=json' + -pmid_url='https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids={};format=json' -def pmid_formatter( pmid, pmid_base=pmid_url): +def pmid_formatter(pmid, pmid_base=pmid_url): return pmid_base.format(pmid) -def get_pmid_doi( pmid ): - data = requests.get( pmid_formatter(pmid) ) +def get_pmid_doi(pmid): + data = requests.get(pmid_formatter(pmid)) data.raise_for_status() data_dict = json.loads(data.text) return data_dict['records'][0]['doi'] -def get_pmid_bibtex( pmid, short_name=None, lastname_first=True ): +def get_pmid_bibtex(pmid, short_name=None, lastname_first=True): doi = get_pmid_doi(pmid) - return get_doi_bibtex( doi, short_name=short_name, lastname_first=lastname_first ) + return get_doi_bibtex(doi, short_name=short_name, lastname_first=lastname_first) + + +doi_url = 'http://dx.doi.org' -doi_url='http://dx.doi.org' -def doi_formatter( doi, doi_base=doi_url ): - return '/'.join((doi_base,doi)) +def doi_formatter(doi, doi_base=doi_url): + return '/'.join((doi_base, doi)) -def get_doi_bibtex( doi, short_name=None, lastname_first = True ): - #To do, better exception handling on the request - header={'Accept': 
'application/x-bibtex'} - data=requests.get(doi_formatter(doi), headers=header) +def get_doi_bibtex(doi, short_name=None, lastname_first=True): + # To do, better exception handling on the request + header = {'Accept': 'application/x-bibtex'} + data = requests.get(doi_formatter(doi), headers=header) data.raise_for_status() bibtex = data.text + "\n" if lastname_first: - bibtex = invert_author_names( bibtex ) + bibtex = invert_author_names(bibtex) - is_biorxiv, biorxiv_id = doi_is_biorxiv( doi ) + is_biorxiv, biorxiv_id = doi_is_biorxiv(doi) if is_biorxiv: - bibtex = fix_biorxiv_info( bibtex, biorxiv_id ) - return replace_short_name( bibtex, short_name ) + bibtex = fix_biorxiv_info(bibtex, biorxiv_id) + return replace_short_name(bibtex, short_name) -def invert_author_names( bibtex ): +def invert_author_names(bibtex): author_test = re.search("author = {(?P<author_str>.*)}", bibtex) author_str_original = author_test.groupdict()['author_str'] author_str_inverted = parse_authors(author_str_original) @@ -50,43 +54,47 @@ def invert_author_names( bibtex ): arxiv_url = "http://export.arxiv.org/api/query" -def get_arxiv_bibtex( arxiv_number, short_name=None, lastname_first=True): - params={'id_list':arxiv_number} - atom_data=requests.get(arxiv_url, params=params) - data=feedparser.parse(atom_data.text) + + +def get_arxiv_bibtex(arxiv_number, short_name=None, lastname_first=True): + params = {'id_list': arxiv_number} + atom_data = requests.get(arxiv_url, params=params) + data = feedparser.parse(atom_data.text) if 'id' in data['entries'][0]: - bibtex = parse_arxiv_bibtex( data, lastname_first=lastname_first ) + bibtex = parse_arxiv_bibtex(data, lastname_first=lastname_first) else: raise Exception('Arxiv ID not found') return replace_short_name(bibtex, short_name) biorxiv_doi = "10.1101/{bid}" -def biorxiv_doi_formatter( biorxiv_id, biorxiv_base=biorxiv_doi ): + + +def biorxiv_doi_formatter(biorxiv_id, biorxiv_base=biorxiv_doi): return biorxiv_base.format(bid=biorxiv_id) -def
get_biorxiv_bibtex( biorxiv_id, short_name=None, lastname_first=True): - bibtex = get_doi_bibtex( biorxiv_doi_formatter(biorxiv_id), - short_name=short_name, - lastname_first=lastname_first ) +def get_biorxiv_bibtex(biorxiv_id, short_name=None, lastname_first=True): + bibtex = get_doi_bibtex(biorxiv_doi_formatter(biorxiv_id), + short_name=short_name, + lastname_first=lastname_first) return bibtex -def fix_biorxiv_info( bibtex, biorxiv_id ): +def fix_biorxiv_info(bibtex, biorxiv_id): biorxiv_info = "\teprinttype={{bioRxiv}},\n" \ "\teprint={{{bid}}},\n" \ "\thowpublished={{{hp}}}\n}}\n".format(bid=biorxiv_id, - hp='bioRxiv doi:10.1101/{}'.format(biorxiv_id)) - bibtex = bibtex.replace('@article{','@online{',1) + hp='bioRxiv doi:10.1101/{}'.format(biorxiv_id)) + bibtex = bibtex.replace('@article{', '@online{', 1) return bibtex[:-3] + ',\n' + biorxiv_info -def doi_is_biorxiv( doi ): - biorxiv_regex = "(?P<doi_prefix>10\.\d{4,}\.?\d*)\/(?P<doi_suffix>.*)" - biorxiv_doi_test = re.search( biorxiv_regex, doi ) - if ( biorxiv_doi_test.groupdict()['doi_prefix'] == '10.1101' ) and \ - ( re.match('^\d*$', biorxiv_doi_test.groupdict()['doi_suffix']) is not None ): +def doi_is_biorxiv(doi): + biorxiv_regex = r"(?P<doi_prefix>10\.\d{4,}\.?\d*)\/(?P<doi_suffix>.*)" + biorxiv_doi_test = re.search(biorxiv_regex, doi) + if (biorxiv_doi_test.groupdict()['doi_prefix'] == '10.1101') and \ + (re.match(r'^\d*$', biorxiv_doi_test.groupdict()['doi_suffix']) is not None): return True, biorxiv_doi_test.groupdict()['doi_suffix'] else: return False, None @@ -96,13 +104,13 @@ def replace_short_name(bibtex, short_name): if short_name is not None: oldreg = re.search('^@.*{(?P<old_name>.*),', bibtex) old_name = oldreg.groupdict()['old_name'] - bibtex = bibtex.replace( old_name, short_name, 1) + bibtex = bibtex.replace(old_name, short_name, 1) return bibtex -def parse_authors( authors, lastname_first=True ): +def parse_authors(authors, lastname_first=True): if isinstance(authors, list): - author_list=authors + author_list = authors elif isinstance(authors,
str): author_list = authors.split(' and ') else: @@ -113,34 +121,34 @@ def parse_authors( authors, lastname_first=True ): first_names = [] for name in name_list: if len(name.suffix) > 0: - last_names.append( '{l} {s}'.format(l=name.last, s=name.suffix) ) + last_names.append('{l} {s}'.format(l=name.last, s=name.suffix)) else: last_names.append('{l}'.format(l=name.last)) if len(name.middle) > 0: - first_names.append( '{f} {m}'.format(f=name.first, m=name.middle)) + first_names.append('{f} {m}'.format(f=name.first, m=name.middle)) else: - first_names.append( '{f}'.format(f=name.first)) + first_names.append('{f}'.format(f=name.first)) if lastname_first: name_str = ' and '.join(['{}, {}'.format(*name) for name in zip(last_names, first_names)]) else: - name_str = ' and '.join(['{} {}'.format(*name) for name in zip(first_names, last_names)]) + name_str = ' and '.join(['{} {}'.format(*name) for name in zip(first_names, last_names)]) return name_str month_map = { - 1:'Jan', - 2:'Feb', - 3:'Mar', - 4:'Apr', - 5:'May', - 6:'Jun', - 7:'Jul', - 8:'Aug', - 9:'Sep', - 10:'Oct', - 11:'Nov', - 12:'Dec' - } + 1: 'Jan', + 2: 'Feb', + 3: 'Mar', + 4: 'Apr', + 5: 'May', + 6: 'Jun', + 7: 'Jul', + 8: 'Aug', + 9: 'Sep', + 10: 'Oct', + 11: 'Nov', + 12: 'Dec' +} def format_bibtex_entry(short_name, @@ -153,35 +161,36 @@ def format_bibtex_entry(short_name, article_url): bibtex_header = '@online{' bibtex_base = "{header}{short_name},\n" \ - "\tauthor = {{{authors}}},\n" \ - "\ttitle = {{{title}}},\n" \ - "\tyear = {{{year}}},\n" \ - "\tmonth = {{{month}}},\n" \ - "\teprinttype = {{{eprinttype}}},\n" \ - "\teprint = {{{eprint}}},\n" \ - "\thowpublished = {{{how_published}}},\n" \ - "\turl = {{{url}}}\n}}\n" - bibtex = bibtex_base.format( header=bibtex_header, - short_name=short_name, - authors=authors, - title=title, - year=year, - month=month, - eprinttype=eprinttype, - eprint=eprint, - how_published=':'.join((eprinttype, eprint)), - url=article_url) + "\tauthor = {{{authors}}},\n" \ + "\ttitle 
= {{{title}}},\n" \ + "\tyear = {{{year}}},\n" \ + "\tmonth = {{{month}}},\n" \ + "\teprinttype = {{{eprinttype}}},\n" \ + "\teprint = {{{eprint}}},\n" \ + "\thowpublished = {{{how_published}}},\n" \ + "\turl = {{{url}}}\n}}\n" + bibtex = bibtex_base.format(header=bibtex_header, + short_name=short_name, + authors=authors, + title=title, + year=year, + month=month, + eprinttype=eprinttype, + eprint=eprint, + how_published=':'.join((eprinttype, eprint)), + url=article_url) return bibtex -def parse_arxiv_bibtex( data, lastname_first=True ): - #todo: exceptions versioning + +def parse_arxiv_bibtex(data, lastname_first=True): + # todo: exceptions versioning ind = 0 - + title = data['entries'][ind]['title'] - eprinttype='arXiv' + eprinttype = 'arXiv' id_parser = re.search('http://arxiv.org/abs/(?P<id_str>.*)', data['entries'][ind]['id']) - eprint=id_parser.groupdict()['id_str'] + eprint = id_parser.groupdict()['id_str'] author_list = [author['name'] for author in data['entries'][ind]['authors']] authors = parse_authors(author_list, lastname_first=lastname_first) @@ -193,19 +202,19 @@ def parse_arxiv_bibtex( data, lastname_first=True ): first_author = HumanName(author_list[0]) if len(first_author.suffix) > 0: - default_short_name = '_'.join( (first_author.last.replace(' ',"_"), - first_author.middle.replace(' ',"_"), - str(year)) ) + default_short_name = '_'.join((first_author.last.replace(' ', "_"), + first_author.middle.replace(' ', "_"), + str(year))) else: - default_short_name = '_'.join( (first_author.last.replace(' ',"_"), - str(year)) ) + default_short_name = '_'.join((first_author.last.replace(' ', "_"), + str(year))) bibtex = format_bibtex_entry(default_short_name, - authors, - title, - year, - month, - eprinttype, - eprint, - article_url) + authors, + title, + year, + month, + eprinttype, + eprint, + article_url) return bibtex diff --git a/requirements.txt b/requirements.txt index 83e0a64..a1ed863 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@
feedparser -nameparser \ No newline at end of file +nameparser +requests diff --git a/test_refid2bib.py b/test_refid2bib.py index c0ba960..5b35af1 100644 --- a/test_refid2bib.py +++ b/test_refid2bib.py @@ -1,31 +1,61 @@ from refid2bib import refid2bib -doi_biorxiv = "https://doi.org/10.1101/406314" -arxiv_id = "arXiv:1801.04381" -doi_general = "DOI: 10.1126/science.1260088" -pmid = 'PMID: 25056931' -pmc_id = 'PMC3711719' -print( 'DOI example:') -print( refid2bib(doi_general) ) +def test_doi(): + """DOI example:""" + assert refid2bib("DOI: 10.1126/science.1260088") is not None -print( 'Arxiv example:') -print( refid2bib(arxiv_id) ) -print( 'Biorxiv example:') -print( refid2bib(doi_biorxiv) ) +def test_arxiv(): + """Arxiv example:""" + assert refid2bib("arXiv:1801.04381") is not None -print( 'pmid example:') -print( refid2bib(pmid) ) -print( 'PMC example:') -print( refid2bib(pmc_id) ) +def test_arxiv2(): + """Arxiv example:""" + assert refid2bib('arxiv://1907.10138') is not None -print( 'Doi example, specified reference type:') -print( refid2bib(doi_general, ref_type='doi') ) -print( 'Arxiv example, custom short name:') -print( refid2bib(arxiv_id, short_name='my_custom_name' ) ) +def test_biorxiv(): + """Biorxiv example:""" + assert refid2bib("https://doi.org/10.1101/406314") is not None -print( 'General example, switched name order:') -print( refid2bib(doi_general, lastname_first=False ) ) + +def test_biorxiv2(): + """Biorxiv example: tests regression of bugfix for issue #1""" + """Hello! I've been using this library for a while, and I just started testing some functionality I hadn't needed + when I noticed that the biorxiv id doesn't currently work as described on the README. If I enter an id expression + with a colon in it like "biorxiv:570689", I get an error because the code ends up constructing a DOI with colon + in it. 
I think this is because the regular expression 'biorxiv':'^biorxiv ?|^biorxiv:' in the tests always + matches the first expression before the second one. It might just work to swap the two cases in the regex. """ + assert refid2bib("bioRxiv: 464909") is not None + + +def test_pmid(): + """pmid example:""" + assert refid2bib('PMID: 25056931') is not None + + +def test_pmc(): + """PMC example:""" + assert refid2bib('PMC3711719') is not None + + +def test_doi_explicit(): + """Doi example, specified reference type:""" + assert refid2bib("DOI: 10.1126/science.1260088", ref_type='doi') is not None + + +def test_arxiv_custom_short_name(): + """Arxiv example, custom short name:""" + assert refid2bib("arXiv:1801.04381", short_name='my_custom_name') is not None + + +def test_switching_name_order(): + """General example, switched name order:""" + assert refid2bib("DOI: 10.1126/science.1260088", lastname_first=False) is not None + + +if __name__ == '__main__': + import pytest + pytest.main()