Fixes issue #1, a break in bioRxiv IDs that are given directly. Fixes some instabilities caused by stray whitespace or leading slashes. Reformatted code to be closer to PEP 8. Updated tests to use pytest. Fixed requirements.txt, which was missing the ``requests`` package.
scnerd committed Jul 14, 2020
1 parent 545a63f commit 4252cb8
Showing 6 changed files with 185 additions and 136 deletions.
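The updated test suite itself is not shown in the diffs below. As a rough, hypothetical sketch of the behavior this commit targets (the IDs are examples and each call performs a live HTTP lookup), a pytest-style check might look like:

```python
# Hypothetical pytest sketch of the fixed behavior; not the repository's actual
# tests. The IDs are examples and each call performs a live HTTP lookup.
import pytest
from refid2bib import refid2bib

@pytest.mark.parametrize("ref", [
    "biorxiv: 121814",               # bioRxiv ID given directly, with a stray space
    "doi: /10.1101/121814",          # leading slash that previously caused trouble
    "https://doi.org/10.1038/nphys1170",
])
def test_refid2bib_tolerates_messy_ids(ref):
    bibtex = refid2bib(ref)
    assert bibtex.lstrip().startswith("@")  # some BibTeX entry came back
```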
3 changes: 3 additions & 0 deletions .gitignore
@@ -102,3 +102,6 @@ venv.bak/

# mypy
.mypy_cache/

# Pycharm
.idea
36 changes: 19 additions & 17 deletions refid2bib/__init__.py
@@ -1,33 +1,35 @@
from refid2bib.core import get_biorxiv_bibtex, get_doi_bibtex, get_arxiv_bibtex, get_pmid_bibtex
import re

from refid2bib.core import get_biorxiv_bibtex, get_doi_bibtex, get_arxiv_bibtex, get_pmid_bibtex

bibtex_functions = {
'biorxiv':get_biorxiv_bibtex,
'doi':get_doi_bibtex,
'arxiv':get_arxiv_bibtex,
'pmid':get_pmid_bibtex}
'biorxiv': get_biorxiv_bibtex,
'doi': get_doi_bibtex,
'arxiv': get_arxiv_bibtex,
'pmid': get_pmid_bibtex
}

def refid2bib(ref, short_name=None, lastname_first=True, ref_type=None):

tests = {'doi':'^doi: ?|^https?://doi.org/|^https?://dx.doi.org/',
'biorxiv':'^biorxiv ?|^biorxiv:',
'arxiv': '^arxiv:|^https?://arxiv.org/abs/',
'pmid': '^pmid: ?|^(?=pmc\d*)',
def refid2bib(ref, short_name=None, lastname_first=True, ref_type=None):
tests = {
'doi': r'^doi: ?|^https?://doi.org/|^https?://dx.doi.org/',
'biorxiv': r'^biorxiv:|^biorxiv ?',
'arxiv': r'^arxiv:|^https?://arxiv.org/abs/',
'pmid': r'^pmid: ?|^(?=pmc\d*)',
}

if ref_type in tests.keys():
query = re.search( tests[ref_type], ref.lower() )
query = re.search(tests[ref_type], ref.lower())
oid = ref[query.span()[1]:]
else:
for c, q in tests.items():
query = re.search( q, ref.lower() )
query = re.search(q, ref.lower())
if query is not None:
oid = ref[query.span()[1]:]
ref_type = c
break
else:
raise ValueError( 'Cannot assign reference type for {}'.format(ref) )
else:
raise ValueError('Cannot assign reference type for {}'.format(ref))

oid = oid.lstrip('/').strip()
return bibtex_functions[ref_type](oid, short_name=short_name, lastname_first=lastname_first)


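For context, the rewritten dispatcher in `__init__.py` can be exercised roughly as follows; the identifiers are illustrative, and every call performs a live lookup against the corresponding service.

```python
# Minimal usage sketch of refid2bib(); the identifiers are illustrative and
# each call needs network access to the corresponding lookup service.
from refid2bib import refid2bib

print(refid2bib("arXiv:1709.01233"))                      # arXiv ID with prefix
print(refid2bib("doi: 10.1038/nphys1170"))                # DOI with "doi:" prefix
print(refid2bib("biorxiv: 121814", short_name="my_key"))  # bioRxiv ID given directly
```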
12 changes: 8 additions & 4 deletions refid2bib/__main__.py
@@ -1,6 +1,10 @@
import sys
from refid2bib import refid2bib

oid = sys.argv[1]
print( '\n')
print( refid2bib(oid) )

if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('document_id', help='DOI, ArXiv ID, etc. of the document you wish to look up')
args = parser.parse_args()

print(refid2bib(args.document_id))
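One quick way to exercise the new argparse-based entry point, assuming the package is installed and the arXiv API is reachable (the ID below is just an example):

```python
# Hypothetical smoke test of the command-line entry point added above; the
# arXiv ID is an example and the lookup requires network access.
import subprocess

result = subprocess.run(
    ["python", "-m", "refid2bib", "arXiv:1709.01233"],
    capture_output=True, text=True, check=True,
)
print(result.stdout)  # expected: an "@online{...}" BibTeX entry
```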
195 changes: 102 additions & 93 deletions refid2bib/core.py
@@ -1,92 +1,100 @@
import requests
import re
import json
import re

import feedparser
import requests
from nameparser import HumanName

pmid_url = 'https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids={};format=json'


pmid_url='https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids={};format=json'
def pmid_formatter( pmid, pmid_base=pmid_url):
def pmid_formatter(pmid, pmid_base=pmid_url):
return pmid_base.format(pmid)


def get_pmid_doi( pmid ):
data = requests.get( pmid_formatter(pmid) )
def get_pmid_doi(pmid):
data = requests.get(pmid_formatter(pmid))
data.raise_for_status()
data_dict = json.loads(data.text)
return data_dict['records'][0]['doi']


def get_pmid_bibtex( pmid, short_name=None, lastname_first=True ):
def get_pmid_bibtex(pmid, short_name=None, lastname_first=True):
doi = get_pmid_doi(pmid)
return get_doi_bibtex( doi, short_name=short_name, lastname_first=lastname_first )
return get_doi_bibtex(doi, short_name=short_name, lastname_first=lastname_first)


doi_url = 'http://dx.doi.org'


doi_url='http://dx.doi.org'
def doi_formatter( doi, doi_base=doi_url ):
return '/'.join((doi_base,doi))
def doi_formatter(doi, doi_base=doi_url):
return '/'.join((doi_base, doi))


def get_doi_bibtex( doi, short_name=None, lastname_first = True ):
#To do, better exception handling on the request
header={'Accept': 'application/x-bibtex'}
data=requests.get(doi_formatter(doi), headers=header)
def get_doi_bibtex(doi, short_name=None, lastname_first=True):
# To do, better exception handling on the request
header = {'Accept': 'application/x-bibtex'}
data = requests.get(doi_formatter(doi), headers=header)
data.raise_for_status()
bibtex = data.text + "\n"
if lastname_first:
bibtex = invert_author_names( bibtex )
bibtex = invert_author_names(bibtex)

is_biorxiv, biorxiv_id = doi_is_biorxiv( doi )
is_biorxiv, biorxiv_id = doi_is_biorxiv(doi)
if is_biorxiv:
bibtex = fix_biorxiv_info( bibtex, biorxiv_id )
return replace_short_name( bibtex, short_name )
bibtex = fix_biorxiv_info(bibtex, biorxiv_id)
return replace_short_name(bibtex, short_name)


def invert_author_names( bibtex ):
def invert_author_names(bibtex):
author_test = re.search("author = {(?P<author_str>.*)}", bibtex)
author_str_original = author_test.groupdict()['author_str']
author_str_inverted = parse_authors(author_str_original)
return bibtex.replace(author_str_original, author_str_inverted)


arxiv_url = "http://export.arxiv.org/api/query"
def get_arxiv_bibtex( arxiv_number, short_name=None, lastname_first=True):
params={'id_list':arxiv_number}
atom_data=requests.get(arxiv_url, params=params)
data=feedparser.parse(atom_data.text)


def get_arxiv_bibtex(arxiv_number, short_name=None, lastname_first=True):
params = {'id_list': arxiv_number}
atom_data = requests.get(arxiv_url, params=params)
data = feedparser.parse(atom_data.text)
if 'id' in data['entries'][0]:
bibtex = parse_arxiv_bibtex( data, lastname_first=lastname_first )
bibtex = parse_arxiv_bibtex(data, lastname_first=lastname_first)
else:
raise Exception('Arxiv ID not found')
return replace_short_name(bibtex, short_name)


biorxiv_doi = "10.1101/{bid}"
def biorxiv_doi_formatter( biorxiv_id, biorxiv_base=biorxiv_doi ):


def biorxiv_doi_formatter(biorxiv_id, biorxiv_base=biorxiv_doi):
return biorxiv_base.format(bid=biorxiv_id)


def get_biorxiv_bibtex( biorxiv_id, short_name=None, lastname_first=True):
bibtex = get_doi_bibtex( biorxiv_doi_formatter(biorxiv_id),
short_name=short_name,
lastname_first=lastname_first )
def get_biorxiv_bibtex(biorxiv_id, short_name=None, lastname_first=True):
bibtex = get_doi_bibtex(biorxiv_doi_formatter(biorxiv_id),
short_name=short_name,
lastname_first=lastname_first)
return bibtex


def fix_biorxiv_info( bibtex, biorxiv_id ):
def fix_biorxiv_info(bibtex, biorxiv_id):
biorxiv_info = "\teprinttype={{bioRxiv}},\n" \
"\teprint={{{bid}}},\n" \
"\thowpublished={{{hp}}}\n}}\n".format(bid=biorxiv_id,
hp='bioRxiv doi:10.1101/{}'.format(biorxiv_id))
bibtex = bibtex.replace('@article{','@online{',1)
hp='bioRxiv doi:10.1101/{}'.format(biorxiv_id))
bibtex = bibtex.replace('@article{', '@online{', 1)
return bibtex[:-3] + ',\n' + biorxiv_info


def doi_is_biorxiv( doi ):
biorxiv_regex = "(?P<doi_prefix>10\.\d{4,}\.?\d*)\/(?P<doi_suffix>.*)"
biorxiv_doi_test = re.search( biorxiv_regex, doi )
if ( biorxiv_doi_test.groupdict()['doi_prefix'] == '10.1101' ) and \
( re.match('^\d*$', biorxiv_doi_test.groupdict()['doi_suffix']) is not None ):
def doi_is_biorxiv(doi):
biorxiv_regex = r"(?P<doi_prefix>10\.\d{4,}\.?\d*)\/(?P<doi_suffix>.*)"
biorxiv_doi_test = re.search(biorxiv_regex, doi)
if (biorxiv_doi_test.groupdict()['doi_prefix'] == '10.1101') and \
(re.match(r'^\d*$', biorxiv_doi_test.groupdict()['doi_suffix']) is not None):
return True, biorxiv_doi_test.groupdict()['doi_suffix']
else:
return False, None
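With the raw-string regexes above, `doi_is_biorxiv` treats a DOI as a bioRxiv preprint only when the prefix is 10.1101 and the suffix is purely numeric. A quick offline check (the DOIs are examples):

```python
# Illustrative behavior of doi_is_biorxiv as defined above; the DOIs are examples.
from refid2bib.core import doi_is_biorxiv

print(doi_is_biorxiv("10.1101/121814"))     # expected: (True, '121814')
print(doi_is_biorxiv("10.1038/nphys1170"))  # expected: (False, None)
```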
@@ -96,13 +104,13 @@ def replace_short_name(bibtex, short_name):
if short_name is not None:
oldreg = re.search('^@.*{(?P<old_name>.*),', bibtex)
old_name = oldreg.groupdict()['old_name']
bibtex = bibtex.replace( old_name, short_name, 1)
bibtex = bibtex.replace(old_name, short_name, 1)
return bibtex


def parse_authors( authors, lastname_first=True ):
def parse_authors(authors, lastname_first=True):
if isinstance(authors, list):
author_list=authors
author_list = authors
elif isinstance(authors, str):
author_list = authors.split(' and ')
else:
@@ -113,34 +121,34 @@ def parse_authors( authors, lastname_first=True ):
first_names = []
for name in name_list:
if len(name.suffix) > 0:
last_names.append( '{l} {s}'.format(l=name.last, s=name.suffix) )
last_names.append('{l} {s}'.format(l=name.last, s=name.suffix))
else:
last_names.append('{l}'.format(l=name.last))
if len(name.middle) > 0:
first_names.append( '{f} {m}'.format(f=name.first, m=name.middle))
first_names.append('{f} {m}'.format(f=name.first, m=name.middle))
else:
first_names.append( '{f}'.format(f=name.first))
first_names.append('{f}'.format(f=name.first))
if lastname_first:
name_str = ' and '.join(['{}, {}'.format(*name) for name in zip(last_names, first_names)])
else:
name_str = ' and '.join(['{} {}'.format(*name) for name in zip(first_names, last_names)])
name_str = ' and '.join(['{} {}'.format(*name) for name in zip(first_names, last_names)])
return name_str


month_map = {
1:'Jan',
2:'Feb',
3:'Mar',
4:'Apr',
5:'May',
6:'Jun',
7:'Jul',
8:'Aug',
9:'Sep',
10:'Oct',
11:'Nov',
12:'Dec'
}
1: 'Jan',
2: 'Feb',
3: 'Mar',
4: 'Apr',
5: 'May',
6: 'Jun',
7: 'Jul',
8: 'Aug',
9: 'Sep',
10: 'Oct',
11: 'Nov',
12: 'Dec'
}


def format_bibtex_entry(short_name,
@@ -153,35 +161,36 @@
article_url):
bibtex_header = '@online{'
bibtex_base = "{header}{short_name},\n" \
"\tauthor = {{{authors}}},\n" \
"\ttitle = {{{title}}},\n" \
"\tyear = {{{year}}},\n" \
"\tmonth = {{{month}}},\n" \
"\teprinttype = {{{eprinttype}}},\n" \
"\teprint = {{{eprint}}},\n" \
"\thowpublished = {{{how_published}}},\n" \
"\turl = {{{url}}}\n}}\n"
bibtex = bibtex_base.format( header=bibtex_header,
short_name=short_name,
authors=authors,
title=title,
year=year,
month=month,
eprinttype=eprinttype,
eprint=eprint,
how_published=':'.join((eprinttype, eprint)),
url=article_url)
"\tauthor = {{{authors}}},\n" \
"\ttitle = {{{title}}},\n" \
"\tyear = {{{year}}},\n" \
"\tmonth = {{{month}}},\n" \
"\teprinttype = {{{eprinttype}}},\n" \
"\teprint = {{{eprint}}},\n" \
"\thowpublished = {{{how_published}}},\n" \
"\turl = {{{url}}}\n}}\n"
bibtex = bibtex_base.format(header=bibtex_header,
short_name=short_name,
authors=authors,
title=title,
year=year,
month=month,
eprinttype=eprinttype,
eprint=eprint,
how_published=':'.join((eprinttype, eprint)),
url=article_url)
return bibtex

def parse_arxiv_bibtex( data, lastname_first=True ):
#todo: exceptions versioning

def parse_arxiv_bibtex(data, lastname_first=True):
# todo: exceptions versioning
ind = 0

title = data['entries'][ind]['title']

eprinttype='arXiv'
eprinttype = 'arXiv'
id_parser = re.search('http://arxiv.org/abs/(?P<id_str>.*)', data['entries'][ind]['id'])
eprint=id_parser.groupdict()['id_str']
eprint = id_parser.groupdict()['id_str']

author_list = [author['name'] for author in data['entries'][ind]['authors']]
authors = parse_authors(author_list, lastname_first=lastname_first)
@@ -193,19 +202,19 @@ def parse_arxiv_bibtex( data, lastname_first=True ):

first_author = HumanName(author_list[0])
if len(first_author.suffix) > 0:
default_short_name = '_'.join( (first_author.last.replace(' ',"_"),
first_author.middle.replace(' ',"_"),
str(year)) )
default_short_name = '_'.join((first_author.last.replace(' ', "_"),
first_author.middle.replace(' ', "_"),
str(year)))
else:
default_short_name = '_'.join( (first_author.last.replace(' ',"_"),
str(year)) )
default_short_name = '_'.join((first_author.last.replace(' ', "_"),
str(year)))

bibtex = format_bibtex_entry(default_short_name,
authors,
title,
year,
month,
eprinttype,
eprint,
article_url)
authors,
title,
year,
month,
eprinttype,
eprint,
article_url)
return bibtex
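Among the helpers in `core.py`, `parse_authors` needs no network access, so it is easy to sanity-check directly; a short sketch with invented names:

```python
# Quick offline check of parse_authors as defined above; the names are invented.
from refid2bib.core import parse_authors

print(parse_authors("Jane Q. Doe and Richard Roe"))
# expected: Doe, Jane Q. and Roe, Richard
print(parse_authors("Jane Q. Doe and Richard Roe", lastname_first=False))
# expected: Jane Q. Doe and Richard Roe
```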
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,2 +1,3 @@
feedparser
nameparser
nameparser
requests