Fixes issue #1, a break in bioRxiv IDs that are given directly. Fixes some instabilities caused by stray whitespace or leading slashes. Reformatted code to be closer to PEP 8. Updated tests to use pytest. Fixed requirements.txt, which was missing the ``requests`` package.
scnerd committed Jul 14, 2020
1 parent 545a63f commit 4252cb8
Showing 6 changed files with 185 additions and 136 deletions.
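The updated test suite itself is not shown in the diffs below. As a rough, hypothetical sketch of the behavior this commit targets (the IDs are examples and each call performs a live HTTP lookup), a pytest-style check might look like:

```python
# Hypothetical pytest sketch of the fixed behavior; not the repository's actual
# tests. The IDs are examples and each call performs a live HTTP lookup.
import pytest
from refid2bib import refid2bib

@pytest.mark.parametrize("ref", [
    "biorxiv: 121814",               # bioRxiv ID given directly, with a stray space
    "doi: /10.1101/121814",          # leading slash that previously caused trouble
    "https://doi.org/10.1038/nphys1170",
])
def test_refid2bib_tolerates_messy_ids(ref):
    bibtex = refid2bib(ref)
    assert bibtex.lstrip().startswith("@")  # some BibTeX entry came back
```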
3 changes: 3 additions & 0 deletions .gitignore
@@ -102,3 +102,6 @@ venv.bak/

# mypy
.mypy_cache/

# Pycharm
.idea
36 changes: 19 additions & 17 deletions refid2bib/__init__.py
@@ -1,33 +1,35 @@
from refid2bib.core import get_biorxiv_bibtex, get_doi_bibtex, get_arxiv_bibtex, get_pmid_bibtex
import re

from refid2bib.core import get_biorxiv_bibtex, get_doi_bibtex, get_arxiv_bibtex, get_pmid_bibtex

bibtex_functions = {
'biorxiv':get_biorxiv_bibtex,
'doi':get_doi_bibtex,
'arxiv':get_arxiv_bibtex,
'pmid':get_pmid_bibtex}
'biorxiv': get_biorxiv_bibtex,
'doi': get_doi_bibtex,
'arxiv': get_arxiv_bibtex,
'pmid': get_pmid_bibtex
}

def refid2bib(ref, short_name=None, lastname_first=True, ref_type=None):

tests = {'doi':'^doi: ?|^https?://doi.org/|^https?://dx.doi.org/',
'biorxiv':'^biorxiv ?|^biorxiv:',
'arxiv': '^arxiv:|^https?://arxiv.org/abs/',
'pmid': '^pmid: ?|^(?=pmc\d*)',
def refid2bib(ref, short_name=None, lastname_first=True, ref_type=None):
tests = {
'doi': r'^doi: ?|^https?://doi.org/|^https?://dx.doi.org/',
'biorxiv': r'^biorxiv:|^biorxiv ?',
'arxiv': r'^arxiv:|^https?://arxiv.org/abs/',
'pmid': r'^pmid: ?|^(?=pmc\d*)',
}

if ref_type in tests.keys():
query = re.search( tests[ref_type], ref.lower() )
query = re.search(tests[ref_type], ref.lower())
oid = ref[query.span()[1]:]
else:
for c, q in tests.items():
query = re.search( q, ref.lower() )
query = re.search(q, ref.lower())
if query is not None:
oid = ref[query.span()[1]:]
ref_type = c
break
else:
raise ValueError( 'Cannot assign reference type for {}'.format(ref) )
else:
raise ValueError('Cannot assign reference type for {}'.format(ref))

oid = oid.lstrip('/').strip()
return bibtex_functions[ref_type](oid, short_name=short_name, lastname_first=lastname_first)


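For context, the rewritten dispatcher in `__init__.py` can be exercised roughly as follows; the identifiers are illustrative, and every call performs a live lookup against the corresponding service.

```python
# Minimal usage sketch of refid2bib(); the identifiers are illustrative and
# each call needs network access to the corresponding lookup service.
from refid2bib import refid2bib

print(refid2bib("arXiv:1709.01233"))                      # arXiv ID with prefix
print(refid2bib("doi: 10.1038/nphys1170"))                # DOI with "doi:" prefix
print(refid2bib("biorxiv: 121814", short_name="my_key"))  # bioRxiv ID given directly
```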
12 changes: 8 additions & 4 deletions refid2bib/__main__.py
@@ -1,6 +1,10 @@
import sys
from refid2bib import refid2bib

oid = sys.argv[1]
print( '\n')
print( refid2bib(oid) )

if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('document_id', help='DOI, ArXiv ID, etc. of the document you wish to look up')
args = parser.parse_args()

print(refid2bib(args.document_id))
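One quick way to exercise the new argparse-based entry point, assuming the package is installed and the arXiv API is reachable (the ID below is just an example):

```python
# Hypothetical smoke test of the command-line entry point added above; the
# arXiv ID is an example and the lookup requires network access.
import subprocess

result = subprocess.run(
    ["python", "-m", "refid2bib", "arXiv:1709.01233"],
    capture_output=True, text=True, check=True,
)
print(result.stdout)  # expected: an "@online{...}" BibTeX entry
```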
195 changes: 102 additions & 93 deletions refid2bib/core.py
@@ -1,92 +1,100 @@
import requests
import re
import json
import re

import feedparser
import requests
from nameparser import HumanName

pmid_url = 'https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids={};format=json'


pmid_url='https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids={};format=json'
def pmid_formatter( pmid, pmid_base=pmid_url):
def pmid_formatter(pmid, pmid_base=pmid_url):
return pmid_base.format(pmid)


def get_pmid_doi( pmid ):
data = requests.get( pmid_formatter(pmid) )
def get_pmid_doi(pmid):
data = requests.get(pmid_formatter(pmid))
data.raise_for_status()
data_dict = json.loads(data.text)
return data_dict['records'][0]['doi']


def get_pmid_bibtex( pmid, short_name=None, lastname_first=True ):
def get_pmid_bibtex(pmid, short_name=None, lastname_first=True):
doi = get_pmid_doi(pmid)
return get_doi_bibtex( doi, short_name=short_name, lastname_first=lastname_first )
return get_doi_bibtex(doi, short_name=short_name, lastname_first=lastname_first)


doi_url = 'http://dx.doi.org'


doi_url='http://dx.doi.org'
def doi_formatter( doi, doi_base=doi_url ):
return '/'.join((doi_base,doi))
def doi_formatter(doi, doi_base=doi_url):
return '/'.join((doi_base, doi))


def get_doi_bibtex( doi, short_name=None, lastname_first = True ):
#To do, better exception handling on the request
header={'Accept': 'application/x-bibtex'}
data=requests.get(doi_formatter(doi), headers=header)
def get_doi_bibtex(doi, short_name=None, lastname_first=True):
# To do, better exception handling on the request
header = {'Accept': 'application/x-bibtex'}
data = requests.get(doi_formatter(doi), headers=header)
data.raise_for_status()
bibtex = data.text + "\n"
if lastname_first:
bibtex = invert_author_names( bibtex )
bibtex = invert_author_names(bibtex)

is_biorxiv, biorxiv_id = doi_is_biorxiv( doi )
is_biorxiv, biorxiv_id = doi_is_biorxiv(doi)
if is_biorxiv:
bibtex = fix_biorxiv_info( bibtex, biorxiv_id )
return replace_short_name( bibtex, short_name )
bibtex = fix_biorxiv_info(bibtex, biorxiv_id)
return replace_short_name(bibtex, short_name)


def invert_author_names( bibtex ):
def invert_author_names(bibtex):
author_test = re.search("author = {(?P<author_str>.*)}", bibtex)
author_str_original = author_test.groupdict()['author_str']
author_str_inverted = parse_authors(author_str_original)
return bibtex.replace(author_str_original, author_str_inverted)


arxiv_url = "http://export.arxiv.org/api/query"
def get_arxiv_bibtex( arxiv_number, short_name=None, lastname_first=True):
params={'id_list':arxiv_number}
atom_data=requests.get(arxiv_url, params=params)
data=feedparser.parse(atom_data.text)


def get_arxiv_bibtex(arxiv_number, short_name=None, lastname_first=True):
params = {'id_list': arxiv_number}
atom_data = requests.get(arxiv_url, params=params)
data = feedparser.parse(atom_data.text)
if 'id' in data['entries'][0]:
bibtex = parse_arxiv_bibtex( data, lastname_first=lastname_first )
bibtex = parse_arxiv_bibtex(data, lastname_first=lastname_first)
else:
raise Exception('Arxiv ID not found')
return replace_short_name(bibtex, short_name)


biorxiv_doi = "10.1101/{bid}"
def biorxiv_doi_formatter( biorxiv_id, biorxiv_base=biorxiv_doi ):


def biorxiv_doi_formatter(biorxiv_id, biorxiv_base=biorxiv_doi):
return biorxiv_base.format(bid=biorxiv_id)


def get_biorxiv_bibtex( biorxiv_id, short_name=None, lastname_first=True):
bibtex = get_doi_bibtex( biorxiv_doi_formatter(biorxiv_id),
short_name=short_name,
lastname_first=lastname_first )
def get_biorxiv_bibtex(biorxiv_id, short_name=None, lastname_first=True):
bibtex = get_doi_bibtex(biorxiv_doi_formatter(biorxiv_id),
short_name=short_name,
lastname_first=lastname_first)
return bibtex


def fix_biorxiv_info( bibtex, biorxiv_id ):
def fix_biorxiv_info(bibtex, biorxiv_id):
biorxiv_info = "\teprinttype={{bioRxiv}},\n" \
"\teprint={{{bid}}},\n" \
"\thowpublished={{{hp}}}\n}}\n".format(bid=biorxiv_id,
hp='bioRxiv doi:10.1101/{}'.format(biorxiv_id))
bibtex = bibtex.replace('@article{','@online{',1)
hp='bioRxiv doi:10.1101/{}'.format(biorxiv_id))
bibtex = bibtex.replace('@article{', '@online{', 1)
return bibtex[:-3] + ',\n' + biorxiv_info


def doi_is_biorxiv( doi ):
biorxiv_regex = "(?P<doi_prefix>10\.\d{4,}\.?\d*)\/(?P<doi_suffix>.*)"
biorxiv_doi_test = re.search( biorxiv_regex, doi )
if ( biorxiv_doi_test.groupdict()['doi_prefix'] == '10.1101' ) and \
( re.match('^\d*$', biorxiv_doi_test.groupdict()['doi_suffix']) is not None ):
def doi_is_biorxiv(doi):
biorxiv_regex = r"(?P<doi_prefix>10\.\d{4,}\.?\d*)\/(?P<doi_suffix>.*)"
biorxiv_doi_test = re.search(biorxiv_regex, doi)
if (biorxiv_doi_test.groupdict()['doi_prefix'] == '10.1101') and \
(re.match(r'^\d*$', biorxiv_doi_test.groupdict()['doi_suffix']) is not None):
return True, biorxiv_doi_test.groupdict()['doi_suffix']
else:
return False, None
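With the raw-string regexes above, `doi_is_biorxiv` treats a DOI as a bioRxiv preprint only when the prefix is 10.1101 and the suffix is purely numeric. A quick offline check (the DOIs are examples):

```python
# Illustrative behavior of doi_is_biorxiv as defined above; the DOIs are examples.
from refid2bib.core import doi_is_biorxiv

print(doi_is_biorxiv("10.1101/121814"))     # expected: (True, '121814')
print(doi_is_biorxiv("10.1038/nphys1170"))  # expected: (False, None)
```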
@@ -96,13 +104,13 @@ def replace_short_name(bibtex, short_name):
if short_name is not None:
oldreg = re.search('^@.*{(?P<old_name>.*),', bibtex)
old_name = oldreg.groupdict()['old_name']
bibtex = bibtex.replace( old_name, short_name, 1)
bibtex = bibtex.replace(old_name, short_name, 1)
return bibtex


def parse_authors( authors, lastname_first=True ):
def parse_authors(authors, lastname_first=True):
if isinstance(authors, list):
author_list=authors
author_list = authors
elif isinstance(authors, str):
author_list = authors.split(' and ')
else:
@@ -113,34 +121,34 @@ def parse_authors( authors, lastname_first=True ):
first_names = []
for name in name_list:
if len(name.suffix) > 0:
last_names.append( '{l} {s}'.format(l=name.last, s=name.suffix) )
last_names.append('{l} {s}'.format(l=name.last, s=name.suffix))
else:
last_names.append('{l}'.format(l=name.last))
if len(name.middle) > 0:
first_names.append( '{f} {m}'.format(f=name.first, m=name.middle))
first_names.append('{f} {m}'.format(f=name.first, m=name.middle))
else:
first_names.append( '{f}'.format(f=name.first))
first_names.append('{f}'.format(f=name.first))
if lastname_first:
name_str = ' and '.join(['{}, {}'.format(*name) for name in zip(last_names, first_names)])
else:
name_str = ' and '.join(['{} {}'.format(*name) for name in zip(first_names, last_names)])
name_str = ' and '.join(['{} {}'.format(*name) for name in zip(first_names, last_names)])
return name_str


month_map = {
1:'Jan',
2:'Feb',
3:'Mar',
4:'Apr',
5:'May',
6:'Jun',
7:'Jul',
8:'Aug',
9:'Sep',
10:'Oct',
11:'Nov',
12:'Dec'
}
1: 'Jan',
2: 'Feb',
3: 'Mar',
4: 'Apr',
5: 'May',
6: 'Jun',
7: 'Jul',
8: 'Aug',
9: 'Sep',
10: 'Oct',
11: 'Nov',
12: 'Dec'
}


def format_bibtex_entry(short_name,
@@ -153,35 +161,36 @@
article_url):
bibtex_header = '@online{'
bibtex_base = "{header}{short_name},\n" \
"\tauthor = {{{authors}}},\n" \
"\ttitle = {{{title}}},\n" \
"\tyear = {{{year}}},\n" \
"\tmonth = {{{month}}},\n" \
"\teprinttype = {{{eprinttype}}},\n" \
"\teprint = {{{eprint}}},\n" \
"\thowpublished = {{{how_published}}},\n" \
"\turl = {{{url}}}\n}}\n"
bibtex = bibtex_base.format( header=bibtex_header,
short_name=short_name,
authors=authors,
title=title,
year=year,
month=month,
eprinttype=eprinttype,
eprint=eprint,
how_published=':'.join((eprinttype, eprint)),
url=article_url)
"\tauthor = {{{authors}}},\n" \
"\ttitle = {{{title}}},\n" \
"\tyear = {{{year}}},\n" \
"\tmonth = {{{month}}},\n" \
"\teprinttype = {{{eprinttype}}},\n" \
"\teprint = {{{eprint}}},\n" \
"\thowpublished = {{{how_published}}},\n" \
"\turl = {{{url}}}\n}}\n"
bibtex = bibtex_base.format(header=bibtex_header,
short_name=short_name,
authors=authors,
title=title,
year=year,
month=month,
eprinttype=eprinttype,
eprint=eprint,
how_published=':'.join((eprinttype, eprint)),
url=article_url)
return bibtex

def parse_arxiv_bibtex( data, lastname_first=True ):
#todo: exceptions versioning

def parse_arxiv_bibtex(data, lastname_first=True):
# todo: exceptions versioning
ind = 0

title = data['entries'][ind]['title']

eprinttype='arXiv'
eprinttype = 'arXiv'
id_parser = re.search('http://arxiv.org/abs/(?P<id_str>.*)', data['entries'][ind]['id'])
eprint=id_parser.groupdict()['id_str']
eprint = id_parser.groupdict()['id_str']

author_list = [author['name'] for author in data['entries'][ind]['authors']]
authors = parse_authors(author_list, lastname_first=lastname_first)
@@ -193,19 +202,19 @@ def parse_arxiv_bibtex( data, lastname_first=True ):

first_author = HumanName(author_list[0])
if len(first_author.suffix) > 0:
default_short_name = '_'.join( (first_author.last.replace(' ',"_"),
first_author.middle.replace(' ',"_"),
str(year)) )
default_short_name = '_'.join((first_author.last.replace(' ', "_"),
first_author.middle.replace(' ', "_"),
str(year)))
else:
default_short_name = '_'.join( (first_author.last.replace(' ',"_"),
str(year)) )
default_short_name = '_'.join((first_author.last.replace(' ', "_"),
str(year)))

bibtex = format_bibtex_entry(default_short_name,
authors,
title,
year,
month,
eprinttype,
eprint,
article_url)
authors,
title,
year,
month,
eprinttype,
eprint,
article_url)
return bibtex
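Among the helpers in `core.py`, `parse_authors` needs no network access, so it is easy to sanity-check directly; a short sketch with invented names:

```python
# Quick offline check of parse_authors as defined above; the names are invented.
from refid2bib.core import parse_authors

print(parse_authors("Jane Q. Doe and Richard Roe"))
# expected: Doe, Jane Q. and Roe, Richard
print(parse_authors("Jane Q. Doe and Richard Roe", lastname_first=False))
# expected: Jane Q. Doe and Richard Roe
```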
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,2 +1,3 @@
feedparser
nameparser
nameparser
requests