From 4252cb8c971beb47d68f0ccfd7ca2622ac0e8e8c Mon Sep 17 00:00:00 2001
From: scnerd <david.maxson@jhuapl.edu>
Date: Tue, 14 Jul 2020 09:31:12 -0400
Subject: [PATCH] Fixes issue #1, a failure when bioRxiv IDs are given
 directly. Fixes some instabilities caused by stray whitespace or leading
 slashes. Reformats code to be closer to PEP 8. Updates tests to use pytest.
 Fixes requirements.txt, which was missing the ``requests`` package.

---
 .gitignore            |   3 +
 refid2bib/__init__.py |  36 ++++----
 refid2bib/__main__.py |  12 ++-
 refid2bib/core.py     | 195 ++++++++++++++++++++++--------------------
 requirements.txt      |   3 +-
 test_refid2bib.py     |  72 +++++++++++-----
 6 files changed, 185 insertions(+), 136 deletions(-)

diff --git a/.gitignore b/.gitignore
index 894a44c..9e5e6d4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,3 +102,6 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+
+# PyCharm
+.idea
\ No newline at end of file
diff --git a/refid2bib/__init__.py b/refid2bib/__init__.py
index 35ece80..d878607 100644
--- a/refid2bib/__init__.py
+++ b/refid2bib/__init__.py
@@ -1,33 +1,35 @@
-from refid2bib.core import get_biorxiv_bibtex, get_doi_bibtex, get_arxiv_bibtex, get_pmid_bibtex
 import re
 
+from refid2bib.core import get_biorxiv_bibtex, get_doi_bibtex, get_arxiv_bibtex, get_pmid_bibtex
+
 bibtex_functions = {
-    'biorxiv':get_biorxiv_bibtex,
-    'doi':get_doi_bibtex,
-    'arxiv':get_arxiv_bibtex,
-    'pmid':get_pmid_bibtex}
+    'biorxiv': get_biorxiv_bibtex,
+    'doi': get_doi_bibtex,
+    'arxiv': get_arxiv_bibtex,
+    'pmid': get_pmid_bibtex
+}
 
-def refid2bib(ref, short_name=None, lastname_first=True, ref_type=None):
 
-    tests = {'doi':'^doi: ?|^https?://doi.org/|^https?://dx.doi.org/',
-             'biorxiv':'^biorxiv ?|^biorxiv:',
-             'arxiv': '^arxiv:|^https?://arxiv.org/abs/',
-             'pmid': '^pmid: ?|^(?=pmc\d*)',
+def refid2bib(ref, short_name=None, lastname_first=True, ref_type=None):
+    tests = {
+        'doi': r'^doi: ?|^https?://doi.org/|^https?://dx.doi.org/',
+        'biorxiv': r'^biorxiv:|^biorxiv ?',
+        'arxiv': r'^arxiv:|^https?://arxiv.org/abs/',
+        'pmid': r'^pmid: ?|^(?=pmc\d*)',
     }
-    
+
     if ref_type in tests.keys():
-        query = re.search( tests[ref_type], ref.lower() )
+        query = re.search(tests[ref_type], ref.lower())
         oid = ref[query.span()[1]:]
     else:
         for c, q in tests.items():
-            query = re.search( q, ref.lower() )
+            query = re.search(q, ref.lower())
             if query is not None:
                 oid = ref[query.span()[1]:]
                 ref_type = c
                 break
-        else:            
-            raise ValueError( 'Cannot assign reference type for {}'.format(ref) )
+        else:
+            raise ValueError('Cannot assign reference type for {}'.format(ref))
 
+    oid = oid.lstrip('/').strip()
     return bibtex_functions[ref_type](oid, short_name=short_name, lastname_first=lastname_first)
-
-
diff --git a/refid2bib/__main__.py b/refid2bib/__main__.py
index b1947af..0f5b61d 100644
--- a/refid2bib/__main__.py
+++ b/refid2bib/__main__.py
@@ -1,6 +1,10 @@
-import sys
 from refid2bib import refid2bib
 
-oid = sys.argv[1]
-print( '\n')
-print( refid2bib(oid) )
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('document_id', help='DOI, ArXiv ID, etc. of the document you wish to look up')
+    args = parser.parse_args()
+
+    print(refid2bib(args.document_id))
diff --git a/refid2bib/core.py b/refid2bib/core.py
index 72691f1..78e6ee1 100644
--- a/refid2bib/core.py
+++ b/refid2bib/core.py
@@ -1,48 +1,52 @@
-import requests
-import re
 import json
+import re
+
 import feedparser
+import requests
 from nameparser import HumanName
 
+pmid_url = 'https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids={};format=json'
+
 
-pmid_url='https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids={};format=json'
-def pmid_formatter( pmid, pmid_base=pmid_url):
+def pmid_formatter(pmid, pmid_base=pmid_url):
     return pmid_base.format(pmid)
 
 
-def get_pmid_doi( pmid ):
-    data = requests.get( pmid_formatter(pmid) )
+def get_pmid_doi(pmid):
+    data = requests.get(pmid_formatter(pmid))
     data.raise_for_status()
     data_dict = json.loads(data.text)
     return data_dict['records'][0]['doi']
 
 
-def get_pmid_bibtex( pmid, short_name=None, lastname_first=True ):
+def get_pmid_bibtex(pmid, short_name=None, lastname_first=True):
     doi = get_pmid_doi(pmid)
-    return get_doi_bibtex( doi, short_name=short_name, lastname_first=lastname_first )
+    return get_doi_bibtex(doi, short_name=short_name, lastname_first=lastname_first)
+
+
+doi_url = 'http://dx.doi.org'
 
 
-doi_url='http://dx.doi.org'
-def doi_formatter( doi, doi_base=doi_url ):
-    return '/'.join((doi_base,doi))
+def doi_formatter(doi, doi_base=doi_url):
+    return '/'.join((doi_base, doi))
 
 
-def get_doi_bibtex( doi, short_name=None, lastname_first = True ):
-    #To do, better exception handling on the request
-    header={'Accept': 'application/x-bibtex'}
-    data=requests.get(doi_formatter(doi), headers=header)
+def get_doi_bibtex(doi, short_name=None, lastname_first=True):
+    # To do, better exception handling on the request
+    header = {'Accept': 'application/x-bibtex'}
+    data = requests.get(doi_formatter(doi), headers=header)
     data.raise_for_status()
     bibtex = data.text + "\n"
     if lastname_first:
-        bibtex = invert_author_names( bibtex )
+        bibtex = invert_author_names(bibtex)
 
-    is_biorxiv, biorxiv_id = doi_is_biorxiv( doi )
+    is_biorxiv, biorxiv_id = doi_is_biorxiv(doi)
     if is_biorxiv:
-        bibtex = fix_biorxiv_info( bibtex, biorxiv_id )  
-    return replace_short_name( bibtex, short_name )
+        bibtex = fix_biorxiv_info(bibtex, biorxiv_id)
+    return replace_short_name(bibtex, short_name)
 
 
-def invert_author_names( bibtex ):
+def invert_author_names(bibtex):
     author_test = re.search("author = {(?P<author_str>.*)}", bibtex)
     author_str_original = author_test.groupdict()['author_str']
     author_str_inverted = parse_authors(author_str_original)
@@ -50,43 +54,47 @@ def invert_author_names( bibtex ):
 
 
 arxiv_url = "http://export.arxiv.org/api/query"
-def get_arxiv_bibtex( arxiv_number, short_name=None, lastname_first=True):
-    params={'id_list':arxiv_number}
-    atom_data=requests.get(arxiv_url, params=params)
-    data=feedparser.parse(atom_data.text)
+
+
+def get_arxiv_bibtex(arxiv_number, short_name=None, lastname_first=True):
+    params = {'id_list': arxiv_number}
+    atom_data = requests.get(arxiv_url, params=params)
+    data = feedparser.parse(atom_data.text)
     if 'id' in data['entries'][0]:
-        bibtex = parse_arxiv_bibtex( data, lastname_first=lastname_first )
+        bibtex = parse_arxiv_bibtex(data, lastname_first=lastname_first)
     else:
         raise Exception('Arxiv ID not found')
     return replace_short_name(bibtex, short_name)
 
 
 biorxiv_doi = "10.1101/{bid}"
-def biorxiv_doi_formatter( biorxiv_id, biorxiv_base=biorxiv_doi ):
+
+
+def biorxiv_doi_formatter(biorxiv_id, biorxiv_base=biorxiv_doi):
     return biorxiv_base.format(bid=biorxiv_id)
 
 
-def get_biorxiv_bibtex( biorxiv_id, short_name=None, lastname_first=True):
-    bibtex = get_doi_bibtex( biorxiv_doi_formatter(biorxiv_id),
-                             short_name=short_name,
-                             lastname_first=lastname_first )
+def get_biorxiv_bibtex(biorxiv_id, short_name=None, lastname_first=True):
+    bibtex = get_doi_bibtex(biorxiv_doi_formatter(biorxiv_id),
+                            short_name=short_name,
+                            lastname_first=lastname_first)
     return bibtex
 
 
-def fix_biorxiv_info( bibtex, biorxiv_id ):
+def fix_biorxiv_info(bibtex, biorxiv_id):
     biorxiv_info = "\teprinttype={{bioRxiv}},\n" \
                    "\teprint={{{bid}}},\n" \
                    "\thowpublished={{{hp}}}\n}}\n".format(bid=biorxiv_id,
-                                                           hp='bioRxiv doi:10.1101/{}'.format(biorxiv_id))
-    bibtex = bibtex.replace('@article{','@online{',1)
+                                                          hp='bioRxiv doi:10.1101/{}'.format(biorxiv_id))
+    bibtex = bibtex.replace('@article{', '@online{', 1)
     return bibtex[:-3] + ',\n' + biorxiv_info
 
 
-def doi_is_biorxiv( doi ):
-    biorxiv_regex = "(?P<doi_prefix>10\.\d{4,}\.?\d*)\/(?P<doi_suffix>.*)"
-    biorxiv_doi_test = re.search( biorxiv_regex, doi )
-    if ( biorxiv_doi_test.groupdict()['doi_prefix'] == '10.1101' ) and \
-        ( re.match('^\d*$', biorxiv_doi_test.groupdict()['doi_suffix']) is not None ):
+def doi_is_biorxiv(doi):
+    biorxiv_regex = r"(?P<doi_prefix>10\.\d{4,}\.?\d*)\/(?P<doi_suffix>.*)"
+    biorxiv_doi_test = re.search(biorxiv_regex, doi)
+    if (biorxiv_doi_test.groupdict()['doi_prefix'] == '10.1101') and \
+        (re.match(r'^\d*$', biorxiv_doi_test.groupdict()['doi_suffix']) is not None):
         return True, biorxiv_doi_test.groupdict()['doi_suffix']
     else:
         return False, None
@@ -96,13 +104,13 @@ def replace_short_name(bibtex, short_name):
     if short_name is not None:
         oldreg = re.search('^@.*{(?P<old_name>.*),', bibtex)
         old_name = oldreg.groupdict()['old_name']
-        bibtex = bibtex.replace( old_name, short_name, 1)      
+        bibtex = bibtex.replace(old_name, short_name, 1)
     return bibtex
 
 
-def parse_authors( authors, lastname_first=True ):
+def parse_authors(authors, lastname_first=True):
     if isinstance(authors, list):
-        author_list=authors
+        author_list = authors
     elif isinstance(authors, str):
         author_list = authors.split(' and ')
     else:
@@ -113,34 +121,34 @@ def parse_authors( authors, lastname_first=True ):
     first_names = []
     for name in name_list:
         if len(name.suffix) > 0:
-            last_names.append( '{l} {s}'.format(l=name.last, s=name.suffix) )
+            last_names.append('{l} {s}'.format(l=name.last, s=name.suffix))
         else:
             last_names.append('{l}'.format(l=name.last))
         if len(name.middle) > 0:
-            first_names.append( '{f} {m}'.format(f=name.first, m=name.middle))
+            first_names.append('{f} {m}'.format(f=name.first, m=name.middle))
         else:
-            first_names.append( '{f}'.format(f=name.first))
+            first_names.append('{f}'.format(f=name.first))
     if lastname_first:
         name_str = ' and '.join(['{}, {}'.format(*name) for name in zip(last_names, first_names)])
     else:
-        name_str = ' and '.join(['{} {}'.format(*name) for name in zip(first_names, last_names)]) 
+        name_str = ' and '.join(['{} {}'.format(*name) for name in zip(first_names, last_names)])
     return name_str
 
 
 month_map = {
-    1:'Jan',
-    2:'Feb',
-    3:'Mar',
-    4:'Apr',
-    5:'May',
-    6:'Jun',
-    7:'Jul',
-    8:'Aug',
-    9:'Sep',
-    10:'Oct',
-    11:'Nov',
-    12:'Dec'
-    }
+    1: 'Jan',
+    2: 'Feb',
+    3: 'Mar',
+    4: 'Apr',
+    5: 'May',
+    6: 'Jun',
+    7: 'Jul',
+    8: 'Aug',
+    9: 'Sep',
+    10: 'Oct',
+    11: 'Nov',
+    12: 'Dec'
+}
 
 
 def format_bibtex_entry(short_name,
@@ -153,35 +161,36 @@ def format_bibtex_entry(short_name,
                         article_url):
     bibtex_header = '@online{'
     bibtex_base = "{header}{short_name},\n" \
-        "\tauthor = {{{authors}}},\n" \
-        "\ttitle = {{{title}}},\n" \
-        "\tyear = {{{year}}},\n" \
-        "\tmonth = {{{month}}},\n" \
-        "\teprinttype = {{{eprinttype}}},\n" \
-        "\teprint = {{{eprint}}},\n" \
-        "\thowpublished = {{{how_published}}},\n" \
-        "\turl = {{{url}}}\n}}\n"
-    bibtex = bibtex_base.format( header=bibtex_header,
-                    short_name=short_name,
-                    authors=authors,
-                    title=title,
-                    year=year,
-                    month=month,
-                    eprinttype=eprinttype,
-                    eprint=eprint,
-                    how_published=':'.join((eprinttype, eprint)),
-                    url=article_url)
+                  "\tauthor = {{{authors}}},\n" \
+                  "\ttitle = {{{title}}},\n" \
+                  "\tyear = {{{year}}},\n" \
+                  "\tmonth = {{{month}}},\n" \
+                  "\teprinttype = {{{eprinttype}}},\n" \
+                  "\teprint = {{{eprint}}},\n" \
+                  "\thowpublished = {{{how_published}}},\n" \
+                  "\turl = {{{url}}}\n}}\n"
+    bibtex = bibtex_base.format(header=bibtex_header,
+                                short_name=short_name,
+                                authors=authors,
+                                title=title,
+                                year=year,
+                                month=month,
+                                eprinttype=eprinttype,
+                                eprint=eprint,
+                                how_published=':'.join((eprinttype, eprint)),
+                                url=article_url)
     return bibtex
 
-def parse_arxiv_bibtex( data, lastname_first=True ):
-    #todo: exceptions versioning
+
+def parse_arxiv_bibtex(data, lastname_first=True):
+    # todo: exceptions versioning
     ind = 0
-    
+
     title = data['entries'][ind]['title']
 
-    eprinttype='arXiv'
+    eprinttype = 'arXiv'
     id_parser = re.search('http://arxiv.org/abs/(?P<id_str>.*)', data['entries'][ind]['id'])
-    eprint=id_parser.groupdict()['id_str']
+    eprint = id_parser.groupdict()['id_str']
 
     author_list = [author['name'] for author in data['entries'][ind]['authors']]
     authors = parse_authors(author_list, lastname_first=lastname_first)
@@ -193,19 +202,19 @@ def parse_arxiv_bibtex( data, lastname_first=True ):
 
     first_author = HumanName(author_list[0])
     if len(first_author.suffix) > 0:
-        default_short_name = '_'.join( (first_author.last.replace(' ',"_"),
-                                        first_author.middle.replace(' ',"_"),
-                                        str(year)) )
+        default_short_name = '_'.join((first_author.last.replace(' ', "_"),
+                                       first_author.middle.replace(' ', "_"),
+                                       str(year)))
     else:
-        default_short_name = '_'.join( (first_author.last.replace(' ',"_"),
-                                        str(year)) )
+        default_short_name = '_'.join((first_author.last.replace(' ', "_"),
+                                       str(year)))
 
     bibtex = format_bibtex_entry(default_short_name,
-                                authors,
-                                title,
-                                year,
-                                month,
-                                eprinttype,
-                                eprint,
-                                article_url)
+                                 authors,
+                                 title,
+                                 year,
+                                 month,
+                                 eprinttype,
+                                 eprint,
+                                 article_url)
     return bibtex
diff --git a/requirements.txt b/requirements.txt
index 83e0a64..a1ed863 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 feedparser
-nameparser
\ No newline at end of file
+nameparser
+requests
diff --git a/test_refid2bib.py b/test_refid2bib.py
index c0ba960..5b35af1 100644
--- a/test_refid2bib.py
+++ b/test_refid2bib.py
@@ -1,31 +1,61 @@
 from refid2bib import refid2bib
 
-doi_biorxiv = "https://doi.org/10.1101/406314"
-arxiv_id = "arXiv:1801.04381"
-doi_general = "DOI: 10.1126/science.1260088"
-pmid = 'PMID: 25056931'
-pmc_id = 'PMC3711719'
 
-print( 'DOI example:')
-print( refid2bib(doi_general) )
+def test_doi():
+    """DOI example:"""
+    assert refid2bib("DOI: 10.1126/science.1260088") is not None
 
-print( 'Arxiv example:')
-print( refid2bib(arxiv_id) )
 
-print( 'Biorxiv example:')
-print( refid2bib(doi_biorxiv) )
+def test_arxiv():
+    """Arxiv example:"""
+    assert refid2bib("arXiv:1801.04381") is not None
 
-print( 'pmid example:')
-print( refid2bib(pmid) )
 
-print( 'PMC example:')
-print( refid2bib(pmc_id) )
+def test_arxiv2():
+    """Arxiv example:"""
+    assert refid2bib('arxiv://1907.10138') is not None
 
-print( 'Doi example, specified reference type:')
-print( refid2bib(doi_general, ref_type='doi') )
 
-print( 'Arxiv example, custom short name:')
-print( refid2bib(arxiv_id, short_name='my_custom_name' ) )
+def test_biorxiv():
+    """Biorxiv example:"""
+    assert refid2bib("https://doi.org/10.1101/406314") is not None
 
-print( 'General example, switched name order:')
-print( refid2bib(doi_general, lastname_first=False ) )
+
+def test_biorxiv2():
+    """Biorxiv example: tests regression of bugfix for issue #1"""
+    """Hello! I've been using this library for a while, and I just started testing some functionality I hadn't needed 
+    when I noticed that the biorxiv id doesn't currently work as described on the README. If I enter an id expression 
+    with a colon in it like "biorxiv:570689", I get an error because the code ends up constructing a DOI with colon 
+    in it. I think this is because the regular expression 'biorxiv':'^biorxiv ?|^biorxiv:' in the tests always 
+    matches the first expression before the second one. It might just work to swap the two cases in the regex. """
+    assert refid2bib("bioRxiv: 464909") is not None
+
+
+def test_pmid():
+    """pmid example:"""
+    assert refid2bib('PMID: 25056931') is not None
+
+
+def test_pmc():
+    """PMC example:"""
+    assert refid2bib('PMC3711719') is not None
+
+
+def test_doi_explicit():
+    """Doi example, specified reference type:"""
+    assert refid2bib("DOI: 10.1126/science.1260088", ref_type='doi') is not None
+
+
+def test_arxiv_custom_short_name():
+    """Arxiv example, custom short name:"""
+    assert refid2bib("arXiv:1801.04381", short_name='my_custom_name') is not None
+
+
+def test_switching_name_order():
+    """General example, switched name order:"""
+    assert refid2bib("DOI: 10.1126/science.1260088", lastname_first=False) is not None
+
+
+if __name__ == '__main__':
+    import pytest
+    pytest.main()