Skip to content

Commit

Permalink
Merge pull request #1 from mfrasca/master
Browse files Browse the repository at this point in the history
please merge
  • Loading branch information
jhonmp15015 committed Dec 9, 2015
2 parents 0530722 + e9ff63b commit 4eaf3f4
Show file tree
Hide file tree
Showing 5 changed files with 448 additions and 82 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,6 @@ docs/_build/

# PyBuilder
target/
*~
*/*~
*/*/*~
20 changes: 20 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Packaging script: use setuptools when it is installed, otherwise fall
# back to distutils.
try:
    from setuptools import setup
except ImportError:
    from distutils.core import setup

# Every keyword argument handed to setup(), gathered in one place.
config = dict(
    name='taxonlist2json',
    version='0.1',
    description=(
        'convert a tropicos export to a Bauble importable json object'),
    author='Mario Frasca',
    author_email='[email protected]',
    url='URL to get it at.',
    download_url='Where to download it.',
    packages=['taxonlist2json'],
    scripts=[],
    install_requires=['nose'],
)

setup(**config)
134 changes: 126 additions & 8 deletions taxonlist2json/__init__.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,21 @@
def binomial_to_dict(input):
    '''compute dictionary equivalent to input

    input is one line of text of the form "Genus epithet Author",
    possibly surrounded by whitespace.  html entities in the author are
    decoded.

    return an empty dictionary if the line is blank or just a '?'.
    otherwise return a species-rank taxon dictionary having the genus as
    higher taxon.  the 'author' key is left out when the line names no
    author.  raise IndexError if the line holds a single word.
    '''

    # html.unescape exists on Python 3 only; on Python 2 the same job is
    # done by a method of HTMLParser.
    try:
        from html import unescape
    except ImportError:
        import HTMLParser
        unescape = HTMLParser.HTMLParser().unescape

    input = input.strip()
    if input in ['?', '']:
        return {}
    # at most three fields: genus, species epithet, all the rest.
    # splitting on any whitespace run tolerates repeated blanks and tabs.
    values = input.split(None, 2)
    result = {'object': 'taxon',
              'ht-rank': 'genus',
              'ht-epithet': values[0],
              'hybrid': False,
              'rank': 'species',
              'epithet': values[1]}
    if len(values) > 2:
        result['author'] = unescape(values[2])
    return result

Expand All @@ -17,19 +24,25 @@ def synonym_line_to_objects_pair(input):
'''compute pair where first element is synonym and second accepted taxon
'''
first, second = input.split('=')
synonym = line_to_binomial(first)
accepted = line_to_binomial(second)
synonym = binomial_to_dict(first)
accepted = binomial_to_dict(second)
return (synonym, accepted)


def whole_block_to_taxon_object(input):
    '''build a single taxon dictionary out of one $-delimited block

    input is the text found between two consecutive $ separators, with
    no $ symbol in it.  the first relevant line describes the taxon
    itself.  a second one, when present, is handled as a synonymy line
    and its accepted taxon, if any, is stored under 'accepted'.
    '''

    relevant = element_to_lines(input)
    taxon = binomial_to_dict(relevant[0])
    if len(relevant) <= 1:
        return taxon
    # only the accepted half of the (synonym, accepted) pair is used
    _, accepted = synonym_line_to_objects_pair(relevant[1])
    if accepted:
        taxon['accepted'] = accepted
    return taxon


def element_to_lines(input):
Expand All @@ -49,3 +62,108 @@ def element_to_lines(input):
if lines[1].find('=') != -1:
result.append(lines[1])
return result


def convert(input):
    '''return the list of taxon objects found in the $-separated input

    blocks that are empty or hold nothing but whitespace are skipped.
    '''

    result = []
    for block in input.split('$'):
        if not block.strip():
            continue
        result.append(whole_block_to_taxon_object(block))
    return result


def import_ars_grin_family(input):
    '''return the family described by the input

    input is a html code snippet.
    its first line starts with <i>, then the epithet, then </i>, then
    junk.  the second line is optional: when it holds a <h2> element,
    its italic part is stored as the accepted family.
    return None, that is no result at all, if the name is marked as
    'nom. nud.' or 'nom. illeg.'.
    '''

    for marker in ("nom. nud.", "nom. illeg."):
        if marker in input:
            return None
    lines = input.strip().split('\n')
    family = ars_grin_line_to_object(lines[0])
    if len(lines) > 1 and "<h2>" in lines[1]:
        family['accepted'] = ars_grin_line_to_object(lines[1])
    return family


def ars_grin_line_to_object(input):
    '''return the family taxon named in one line of html

    input is a html code snippet.
    we only want the part in italic, that is the text between the first
    <i> and the </i> following it.  the rest of the line is ignored.

    return None if the line is blank or if it holds no <i> element.
    '''

    pieces = input.strip().split("<i>")
    # a single piece means there was no <i> to split on.  this covers
    # the blank line too.
    if len(pieces) < 2:
        return None

    epithet = pieces[1].split("</i>")[0]
    return {'object': 'taxon',
            'rank': 'family',
            'epithet': epithet,
            }


def convert_ars_grin(input):
    """return the list of family objects found in the input

    input is the content of the file as we have saved it.  it is cut at
    each <h1> heading opening a line, and whatever comes before the
    first such heading is discarded.  items that produce no object are
    left out of the result.
    """

    chunks = input.split('\n <h1>')
    families = []
    for chunk in chunks[1:]:
        family = import_ars_grin_family(chunk)
        if family:
            families.append(family)
    return families


def dict_from_epithet_author(fields, rank):
    '''return a taxon dictionary for the given fields at the given rank

    fields is a sequence: first the epithet, then optionally the author.
    rank is either 'family' or 'genus'.  for a family the rank and
    epithet keys get the 'ht-' prefix, for a genus they get no prefix.
    the 'author' key is never prefixed and is only set when fields has
    more than one element.

    raise ValueError for any other rank.
    '''

    prefixes = {'family': 'ht-', 'genus': ''}
    if rank not in prefixes:
        # fail with a clear message instead of an unbound local further on
        raise ValueError("unsupported rank: %r" % (rank,))
    prefix = prefixes[rank]
    result = {'object': 'taxon',
              prefix + 'rank': rank,
              prefix + 'epithet': fields[0],
              }
    if len(fields) > 1:
        result['author'] = fields[1]
    return result


def ars_grin_genus_to_dict(input):
    '''convert the content of page to bauble dict

    the input is the html content of one taxonomy page.  it is examined
    line by line, after stripping each line, and only the first line
    matching each pattern is used.

    return None if any line marks the name as rejected or illegitimate,
    or if no line holds the genus heading.  otherwise return the genus
    dictionary, including its family as higher taxon and, when the page
    declares the genus a synonym, the accepted genus under 'accepted'.

    raise IndexError if the page holds a genus but no family.
    '''

    import re
    taxon_pattern = re.compile(
        r'^<i>([A-Z][a-z]+)</i>[ ]*(.*)</h1>$')
    synonym_pattern = re.compile(
        r'^<h2>Synonym of <a [^>]*>([A-Z][^ ]+) (.*)</a></h2>$')
    family_pattern = re.compile(
        r'<td><i><a href=".*">([A-Z].*)</a></i></td>')
    reject_pattern = re.compile(
        r'<td>(a rejected|an illegitimate).*')
    lines = [i.strip() for i in input.split('\n')]

    def first_match(pattern):
        # plain loop, so that the result does not depend on filter and
        # map returning lists (Python 2) or lazy iterators (Python 3)
        for line in lines:
            match = pattern.match(line)
            if match:
                return match
        return None

    if first_match(reject_pattern):
        return None
    taxon_match = first_match(taxon_pattern)
    if taxon_match is None:
        return None
    family_match = first_match(family_pattern)
    if family_match is None:
        # same exception type as the failed list lookup this replaces
        raise IndexError("no family found in taxonomy page")
    synonym_match = first_match(synonym_pattern)

    result = dict_from_epithet_author(family_match.groups(), 'family')
    result.update(dict_from_epithet_author(taxon_match.groups(), 'genus'))

    if synonym_match:
        result['accepted'] = dict_from_epithet_author(synonym_match.groups(),
                                                      'genus')
    return result
77 changes: 3 additions & 74 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,75 +1,4 @@
# -*- coding: utf-8 -*-

import unittest
from unittest import SkipTest

import taxonlist2json


class ConverterTest(unittest.TestCase):
    '''unit tests for the conversion of taxon list lines to dictionaries
    '''

    def test_file_into_elements(self):
        raise SkipTest("not tested yet")

    def test_element_into_lines(self):
        'returns only the relevant lines'
        raise SkipTest("not tested yet")

    def test_line_to_binomial_with_simple_author(self):
        # leading whitespace must be ignored
        s = ' Abuta velutina Gleason'
        result = taxonlist2json.binomial_to_dict(s)
        expect = {'object': 'taxon',
                  'rank': 'species',
                  'epithet': 'velutina',
                  'ht-rank': 'genus',
                  'ht-epithet': 'Abuta',
                  'hybrid': False,
                  'author': 'Gleason',
                  }
        self.assertEqual(result, expect)

    def test_line_to_binomial_with_composite_author(self):
        # everything after the second word belongs to the author
        result = taxonlist2json.binomial_to_dict(
            'Abutilon mollissimum (Cav.) Sweet')
        expect = {'object': 'taxon',
                  'rank': 'species',
                  'epithet': 'mollissimum',
                  'ht-rank': 'genus',
                  'ht-epithet': 'Abutilon',
                  'hybrid': False,
                  'author': '(Cav.) Sweet',
                  }
        self.assertEqual(result, expect)

    def test_line_to_binomial_author_with_utf8_char(self):
        s = "Abutilon nudiflorum (L'H&eacute;r.) Sweet"
        result = taxonlist2json.binomial_to_dict(s)
        # the decoded author is text, not bytes: a unicode literal makes
        # the comparison hold on Python 2 as well as on Python 3
        expect = {'ht-epithet': 'Abutilon',
                  'rank': 'species',
                  'author': u"(L'H\xe9r.) Sweet",
                  'hybrid': False,
                  'object': 'taxon',
                  'epithet': 'nudiflorum',
                  'ht-rank': 'genus'}
        self.assertEqual(result, expect)

    def test_synonym_line_to_objects_pair(self):
        s = "Abutilon pulverulentum Ulbrich = "\
            "Sidasodes jamesonii (Baker f. ) Fryxell & Fuertes"

        result = taxonlist2json.synonym_line_to_objects_pair(s)

        # first the synonym, then the accepted taxon
        expect = ({'ht-epithet': 'Abutilon', 'rank': 'species',
                   'author': 'Ulbrich', 'hybrid': False,
                   'object': 'taxon', 'epithet': 'pulverulentum',
                   'ht-rank': 'genus'},
                  {'ht-epithet': 'Sidasodes',
                   'rank': 'species',
                   'author': '(Baker f. ) Fryxell & Fuertes',
                   'hybrid': False,
                   'object': 'taxon',
                   'epithet': 'jamesonii',
                   'ht-rank': 'genus'})
        self.assertEqual(result, expect)


#
# this file is only here so that Python treats this folder as a
# package.
Loading

0 comments on commit 4eaf3f4

Please sign in to comment.