Commit
Merge pull request #2 from scnerd/master
Bug fixes and cleanup
ceesem authored Nov 17, 2022
2 parents 545a63f + 4252cb8 commit 8301aa7
Showing 6 changed files with 185 additions and 136 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -102,3 +102,6 @@ venv.bak/

# mypy
.mypy_cache/

# Pycharm
.idea
36 changes: 19 additions & 17 deletions refid2bib/__init__.py
@@ -1,33 +1,35 @@
from refid2bib.core import get_biorxiv_bibtex, get_doi_bibtex, get_arxiv_bibtex, get_pmid_bibtex
import re

from refid2bib.core import get_biorxiv_bibtex, get_doi_bibtex, get_arxiv_bibtex, get_pmid_bibtex

bibtex_functions = {
'biorxiv':get_biorxiv_bibtex,
'doi':get_doi_bibtex,
'arxiv':get_arxiv_bibtex,
'pmid':get_pmid_bibtex}
'biorxiv': get_biorxiv_bibtex,
'doi': get_doi_bibtex,
'arxiv': get_arxiv_bibtex,
'pmid': get_pmid_bibtex
}

def refid2bib(ref, short_name=None, lastname_first=True, ref_type=None):

tests = {'doi':'^doi: ?|^https?://doi.org/|^https?://dx.doi.org/',
'biorxiv':'^biorxiv ?|^biorxiv:',
'arxiv': '^arxiv:|^https?://arxiv.org/abs/',
'pmid': '^pmid: ?|^(?=pmc\d*)',
def refid2bib(ref, short_name=None, lastname_first=True, ref_type=None):
tests = {
'doi': r'^doi: ?|^https?://doi.org/|^https?://dx.doi.org/',
'biorxiv': r'^biorxiv:|^biorxiv ?',
'arxiv': r'^arxiv:|^https?://arxiv.org/abs/',
'pmid': r'^pmid: ?|^(?=pmc\d*)',
}

if ref_type in tests.keys():
query = re.search( tests[ref_type], ref.lower() )
query = re.search(tests[ref_type], ref.lower())
oid = ref[query.span()[1]:]
else:
for c, q in tests.items():
query = re.search( q, ref.lower() )
query = re.search(q, ref.lower())
if query is not None:
oid = ref[query.span()[1]:]
ref_type = c
break
else:
raise ValueError( 'Cannot assign reference type for {}'.format(ref) )
else:
raise ValueError('Cannot assign reference type for {}'.format(ref))

oid = oid.lstrip('/').strip()
return bibtex_functions[ref_type](oid, short_name=short_name, lastname_first=lastname_first)
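
For reference, a minimal usage sketch of the updated dispatcher as it stands after this change (the DOI string below is a placeholder, not taken from this commit):

from refid2bib import refid2bib

# The reference type is inferred from the prefix (doi:, biorxiv:, arxiv:, pmid:),
# or it can be forced explicitly via ref_type.
bibtex = refid2bib('doi: 10.1234/example.5678', short_name='Example2022')
print(bibtex)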


12 changes: 8 additions & 4 deletions refid2bib/__main__.py
@@ -1,6 +1,10 @@
import sys
from refid2bib import refid2bib

oid = sys.argv[1]
print( '\n')
print( refid2bib(oid) )

if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('document_id', help='DOI, ArXiv ID, etc. of the document you wish to look up')
args = parser.parse_args()

print(refid2bib(args.document_id))
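
The rewritten __main__.py makes the module invokable from the command line with a single positional document identifier; a typical invocation (the identifier here is illustrative only) would be: python -m refid2bib arXiv:1706.03762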
195 changes: 102 additions & 93 deletions refid2bib/core.py
@@ -1,92 +1,100 @@
import requests
import re
import json
import re

import feedparser
import requests
from nameparser import HumanName

pmid_url = 'https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids={};format=json'


pmid_url='https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids={};format=json'
def pmid_formatter( pmid, pmid_base=pmid_url):
def pmid_formatter(pmid, pmid_base=pmid_url):
return pmid_base.format(pmid)


def get_pmid_doi( pmid ):
data = requests.get( pmid_formatter(pmid) )
def get_pmid_doi(pmid):
data = requests.get(pmid_formatter(pmid))
data.raise_for_status()
data_dict = json.loads(data.text)
return data_dict['records'][0]['doi']


def get_pmid_bibtex( pmid, short_name=None, lastname_first=True ):
def get_pmid_bibtex(pmid, short_name=None, lastname_first=True):
doi = get_pmid_doi(pmid)
return get_doi_bibtex( doi, short_name=short_name, lastname_first=lastname_first )
return get_doi_bibtex(doi, short_name=short_name, lastname_first=lastname_first)


doi_url = 'http://dx.doi.org'


doi_url='http://dx.doi.org'
def doi_formatter( doi, doi_base=doi_url ):
return '/'.join((doi_base,doi))
def doi_formatter(doi, doi_base=doi_url):
return '/'.join((doi_base, doi))


def get_doi_bibtex( doi, short_name=None, lastname_first = True ):
#To do, better exception handling on the request
header={'Accept': 'application/x-bibtex'}
data=requests.get(doi_formatter(doi), headers=header)
def get_doi_bibtex(doi, short_name=None, lastname_first=True):
# To do, better exception handling on the request
header = {'Accept': 'application/x-bibtex'}
data = requests.get(doi_formatter(doi), headers=header)
data.raise_for_status()
bibtex = data.text + "\n"
if lastname_first:
bibtex = invert_author_names( bibtex )
bibtex = invert_author_names(bibtex)

is_biorxiv, biorxiv_id = doi_is_biorxiv( doi )
is_biorxiv, biorxiv_id = doi_is_biorxiv(doi)
if is_biorxiv:
bibtex = fix_biorxiv_info( bibtex, biorxiv_id )
return replace_short_name( bibtex, short_name )
bibtex = fix_biorxiv_info(bibtex, biorxiv_id)
return replace_short_name(bibtex, short_name)


def invert_author_names( bibtex ):
def invert_author_names(bibtex):
author_test = re.search("author = {(?P<author_str>.*)}", bibtex)
author_str_original = author_test.groupdict()['author_str']
author_str_inverted = parse_authors(author_str_original)
return bibtex.replace(author_str_original, author_str_inverted)


arxiv_url = "http://export.arxiv.org/api/query"
def get_arxiv_bibtex( arxiv_number, short_name=None, lastname_first=True):
params={'id_list':arxiv_number}
atom_data=requests.get(arxiv_url, params=params)
data=feedparser.parse(atom_data.text)


def get_arxiv_bibtex(arxiv_number, short_name=None, lastname_first=True):
params = {'id_list': arxiv_number}
atom_data = requests.get(arxiv_url, params=params)
data = feedparser.parse(atom_data.text)
if 'id' in data['entries'][0]:
bibtex = parse_arxiv_bibtex( data, lastname_first=lastname_first )
bibtex = parse_arxiv_bibtex(data, lastname_first=lastname_first)
else:
raise Exception('Arxiv ID not found')
return replace_short_name(bibtex, short_name)


biorxiv_doi = "10.1101/{bid}"
def biorxiv_doi_formatter( biorxiv_id, biorxiv_base=biorxiv_doi ):


def biorxiv_doi_formatter(biorxiv_id, biorxiv_base=biorxiv_doi):
return biorxiv_base.format(bid=biorxiv_id)


def get_biorxiv_bibtex( biorxiv_id, short_name=None, lastname_first=True):
bibtex = get_doi_bibtex( biorxiv_doi_formatter(biorxiv_id),
short_name=short_name,
lastname_first=lastname_first )
def get_biorxiv_bibtex(biorxiv_id, short_name=None, lastname_first=True):
bibtex = get_doi_bibtex(biorxiv_doi_formatter(biorxiv_id),
short_name=short_name,
lastname_first=lastname_first)
return bibtex


def fix_biorxiv_info( bibtex, biorxiv_id ):
def fix_biorxiv_info(bibtex, biorxiv_id):
biorxiv_info = "\teprinttype={{bioRxiv}},\n" \
"\teprint={{{bid}}},\n" \
"\thowpublished={{{hp}}}\n}}\n".format(bid=biorxiv_id,
hp='bioRxiv doi:10.1101/{}'.format(biorxiv_id))
bibtex = bibtex.replace('@article{','@online{',1)
hp='bioRxiv doi:10.1101/{}'.format(biorxiv_id))
bibtex = bibtex.replace('@article{', '@online{', 1)
return bibtex[:-3] + ',\n' + biorxiv_info


def doi_is_biorxiv( doi ):
biorxiv_regex = "(?P<doi_prefix>10\.\d{4,}\.?\d*)\/(?P<doi_suffix>.*)"
biorxiv_doi_test = re.search( biorxiv_regex, doi )
if ( biorxiv_doi_test.groupdict()['doi_prefix'] == '10.1101' ) and \
( re.match('^\d*$', biorxiv_doi_test.groupdict()['doi_suffix']) is not None ):
def doi_is_biorxiv(doi):
biorxiv_regex = r"(?P<doi_prefix>10\.\d{4,}\.?\d*)\/(?P<doi_suffix>.*)"
biorxiv_doi_test = re.search(biorxiv_regex, doi)
if (biorxiv_doi_test.groupdict()['doi_prefix'] == '10.1101') and \
(re.match(r'^\d*$', biorxiv_doi_test.groupdict()['doi_suffix']) is not None):
return True, biorxiv_doi_test.groupdict()['doi_suffix']
else:
return False, None
@@ -96,13 +104,13 @@ def replace_short_name(bibtex, short_name):
if short_name is not None:
oldreg = re.search('^@.*{(?P<old_name>.*),', bibtex)
old_name = oldreg.groupdict()['old_name']
bibtex = bibtex.replace( old_name, short_name, 1)
bibtex = bibtex.replace(old_name, short_name, 1)
return bibtex


def parse_authors( authors, lastname_first=True ):
def parse_authors(authors, lastname_first=True):
if isinstance(authors, list):
author_list=authors
author_list = authors
elif isinstance(authors, str):
author_list = authors.split(' and ')
else:
@@ -113,34 +121,34 @@ def parse_authors( authors, lastname_first=True ):
first_names = []
for name in name_list:
if len(name.suffix) > 0:
last_names.append( '{l} {s}'.format(l=name.last, s=name.suffix) )
last_names.append('{l} {s}'.format(l=name.last, s=name.suffix))
else:
last_names.append('{l}'.format(l=name.last))
if len(name.middle) > 0:
first_names.append( '{f} {m}'.format(f=name.first, m=name.middle))
first_names.append('{f} {m}'.format(f=name.first, m=name.middle))
else:
first_names.append( '{f}'.format(f=name.first))
first_names.append('{f}'.format(f=name.first))
if lastname_first:
name_str = ' and '.join(['{}, {}'.format(*name) for name in zip(last_names, first_names)])
else:
name_str = ' and '.join(['{} {}'.format(*name) for name in zip(first_names, last_names)])
name_str = ' and '.join(['{} {}'.format(*name) for name in zip(first_names, last_names)])
return name_str


month_map = {
1:'Jan',
2:'Feb',
3:'Mar',
4:'Apr',
5:'May',
6:'Jun',
7:'Jul',
8:'Aug',
9:'Sep',
10:'Oct',
11:'Nov',
12:'Dec'
}
1: 'Jan',
2: 'Feb',
3: 'Mar',
4: 'Apr',
5: 'May',
6: 'Jun',
7: 'Jul',
8: 'Aug',
9: 'Sep',
10: 'Oct',
11: 'Nov',
12: 'Dec'
}


def format_bibtex_entry(short_name,
@@ -153,35 +161,36 @@ def format_bibtex_entry(short_name,
article_url):
bibtex_header = '@online{'
bibtex_base = "{header}{short_name},\n" \
"\tauthor = {{{authors}}},\n" \
"\ttitle = {{{title}}},\n" \
"\tyear = {{{year}}},\n" \
"\tmonth = {{{month}}},\n" \
"\teprinttype = {{{eprinttype}}},\n" \
"\teprint = {{{eprint}}},\n" \
"\thowpublished = {{{how_published}}},\n" \
"\turl = {{{url}}}\n}}\n"
bibtex = bibtex_base.format( header=bibtex_header,
short_name=short_name,
authors=authors,
title=title,
year=year,
month=month,
eprinttype=eprinttype,
eprint=eprint,
how_published=':'.join((eprinttype, eprint)),
url=article_url)
"\tauthor = {{{authors}}},\n" \
"\ttitle = {{{title}}},\n" \
"\tyear = {{{year}}},\n" \
"\tmonth = {{{month}}},\n" \
"\teprinttype = {{{eprinttype}}},\n" \
"\teprint = {{{eprint}}},\n" \
"\thowpublished = {{{how_published}}},\n" \
"\turl = {{{url}}}\n}}\n"
bibtex = bibtex_base.format(header=bibtex_header,
short_name=short_name,
authors=authors,
title=title,
year=year,
month=month,
eprinttype=eprinttype,
eprint=eprint,
how_published=':'.join((eprinttype, eprint)),
url=article_url)
return bibtex

def parse_arxiv_bibtex( data, lastname_first=True ):
#todo: exceptions versioning

def parse_arxiv_bibtex(data, lastname_first=True):
# todo: exceptions versioning
ind = 0

title = data['entries'][ind]['title']

eprinttype='arXiv'
eprinttype = 'arXiv'
id_parser = re.search('http://arxiv.org/abs/(?P<id_str>.*)', data['entries'][ind]['id'])
eprint=id_parser.groupdict()['id_str']
eprint = id_parser.groupdict()['id_str']

author_list = [author['name'] for author in data['entries'][ind]['authors']]
authors = parse_authors(author_list, lastname_first=lastname_first)
@@ -193,19 +202,19 @@ def parse_arxiv_bibtex( data, lastname_first=True ):

first_author = HumanName(author_list[0])
if len(first_author.suffix) > 0:
default_short_name = '_'.join( (first_author.last.replace(' ',"_"),
first_author.middle.replace(' ',"_"),
str(year)) )
default_short_name = '_'.join((first_author.last.replace(' ', "_"),
first_author.middle.replace(' ', "_"),
str(year)))
else:
default_short_name = '_'.join( (first_author.last.replace(' ',"_"),
str(year)) )
default_short_name = '_'.join((first_author.last.replace(' ', "_"),
str(year)))

bibtex = format_bibtex_entry(default_short_name,
authors,
title,
year,
month,
eprinttype,
eprint,
article_url)
authors,
title,
year,
month,
eprinttype,
eprint,
article_url)
return bibtex
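
To make the reworked name handling concrete, a small sketch of what parse_authors is expected to return (example names are made up; the output is inferred from the function body above, not taken from test output):

from refid2bib.core import parse_authors

# With lastname_first=True (the default), names come back BibTeX-style as 'Last, First',
# joined by ' and '; suffixes stay attached to the last name.
print(parse_authors('Jane Q. Public and John von Neumann Jr.'))
# expected, roughly: Public, Jane Q. and von Neumann Jr., John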
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,2 +1,3 @@
feedparser
nameparser
nameparser
requests
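
With requests now listed explicitly alongside feedparser and nameparser, the dependencies install in the usual way (standard pip usage, nothing repository-specific): pip install -r requirements.txt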