#34 improved documentation
e-dorigatti committed Jun 1, 2016
1 parent 327f337 commit 2667b58
Showing 9 changed files with 170 additions and 32 deletions.
52 changes: 51 additions & 1 deletion strephit/commons/cache.py
@@ -22,6 +22,23 @@ def _path_for(hashed_key):


def get(key, default=None):
""" Retrieves an item from the cache
:param key: Key of the item
:param default: Default value to return if the
key is not in the cache
:return: The item associated with the given key or
the default value
Sample usage:
>>> from strephit.commons import cache
>>> cache.get('kk', 13)
13
>>> cache.get('kk', 0)
0
>>> cache.set('kk', 15)
>>> cache.get('kk', 0)
15
"""
if not ENABLED:
return default

@@ -39,6 +56,24 @@ def get(key, default=None):


def set(key, value, overwrite=True):
""" Stores an item in the cache under the given key
:param key: Unique key used to identify the item.
:param value: Value to store in the cache. Must be
JSON-dumpable
:param overwrite: Whether to overwrite the previous
value associated with the key (if any)
:return: Nothing
Sample usage:
>>> from strephit.commons import cache
>>> cache.get('kk', 13)
13
>>> cache.get('kk', 0)
0
>>> cache.set('kk', 15)
>>> cache.get('kk', 0)
15
"""
if not ENABLED:
return

@@ -63,7 +98,22 @@ def set(key, value, overwrite=True):


def cached(function):
""" Decorator to cache function results based on its arguments """
""" Decorator to cache function results based on its arguments
Sample usage:
>>> from strephit.commons import cache
>>> @cache.cached
... def f(x):
... print 'inside f'
... return 2 * x
...
>>> f(10)
inside f
20
>>> f(10)
20
"""
def wrapper(*args, **kwargs):
key = str([function.__module__]) + function.__name__ + str(args) + str(kwargs)
res = get(key)
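For reference, a minimal sketch of the cache key the wrapper above would build for the doctest call f(10), assuming f was defined interactively (so its module is '__main__'); judging from _path_for(hashed_key) above, this string is then hashed and mapped to a file on disk:

>>> function_module, function_name, args, kwargs = '__main__', 'f', (10,), {}
>>> str([function_module]) + function_name + str(args) + str(kwargs)
"['__main__']f(10,){}"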
16 changes: 15 additions & 1 deletion strephit/commons/date_normalizer.py
@@ -77,7 +77,7 @@ def _meta_init(self, specs):
self.globals = self.meta_funcs
self.globals.update(self.meta_vars)

def normalize_one(self, expression, conflict='first'):
def normalize_one(self, expression, conflict='longest'):
""" Find the matching part in the given expression
:param str expression: The expression in which to search for the match
@@ -91,6 +91,11 @@ def normalize_one(self, expression, conflict='first'):
Allowed values are `first`, `longest` and `shortest`
:return: Tuple with (start, end), category, result
:rtype: tuple
Sample usage:
>>> from strephit.commons.date_normalizer import DateNormalizer
>>> DateNormalizer('en').normalize_one('Today is the 1st of June, 2016')
((13, 30), 'Time', {'month': 6, 'day': 1, 'year': 2016})
"""

best_match = None
@@ -118,6 +123,15 @@ def normalize_many(self, expression):
:param str expression: The expression in which to look for matches
:return: Generator of tuples (start, end), category, result
Sample usage:
>>> from pprint import pprint
>>> from strephit.commons.date_normalizer import DateNormalizer
>>> pprint(list(DateNormalizer('en').normalize_many('I was born on April 18th, '
... 'and today is April 18th, 2016!')))
[((14, 24), 'Time', {'day': 18, 'month': 4}),
((39, 55), 'Time', {'day': 18, 'month': 4, 'year': 2016})]
"""

# start matching from here, and move forward as new matches
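Changing the default conflict policy from first to longest is what lets the doctest above return the whole date span; a minimal sketch of requesting a different policy explicitly (the exact match returned depends on the rule set, so no output is shown):

from strephit.commons.date_normalizer import DateNormalizer

normalizer = DateNormalizer('en')
# prefer the first matching rule instead of the longest match
span, category, result = normalizer.normalize_one('Today is the 1st of June, 2016',
                                                  conflict='first')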
17 changes: 16 additions & 1 deletion strephit/commons/parallel.py
@@ -100,6 +100,12 @@ def map(function, iterable, processes=0, flatten=False, raise_exc=True, batch_si
:param batch_size: If larger than 0, the input iterable will be grouped in groups
of this size and the resulting list passed as argument to the worker.
:returns: iterable with the results. Order is not guaranteed to be preserved
Sample usage:
>>> from strephit.commons import parallel
>>> list(parallel.map(lambda x: 2*x, range(10)))
[0, 8, 10, 12, 14, 16, 18, 2, 4, 6]
"""
if processes == 1:
for task in make_batches(iterable, batch_size):
@@ -131,8 +137,17 @@ def execute(processes=0, *specs):
:param processes: Number of functions to execute at the same time
:param specs: a sequence of functions, each followed by its arguments (arguments as a tuple or list)
:return: the results that the functions returned
:return: the results that the functions returned, in the same order as they were specified
:rtype: list
Sample usage:
>>> from strephit.commons import parallel
>>> list(parallel.execute(4,
... lambda x, y: x + y, (5, -5),
... lambda *x: sum(x), range(5)
... ))
[0, 10]
"""
functions, arguments = specs[::2], specs[1::2]
res = list(map(lambda (i, args): (i, functions[i](*args)),
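A minimal sketch of the batch_size option documented above, assuming that with batch_size > 0 each worker call receives a whole batch (a list of items) and that flatten=True concatenates the per-batch results; as in the doctest, output order is not guaranteed:

from strephit.commons import parallel

def double_batch(batch):
    # with batch_size > 0 the worker gets a list of items, not a single item
    return [2 * x for x in batch]

results = list(parallel.map(double_batch, range(10), batch_size=4, flatten=True))
print(sorted(results))  # [0, 2, 4, 6, 8, 10, 12, 14, 16, 18] once sorted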
42 changes: 39 additions & 3 deletions strephit/commons/pos_tag.py
@@ -66,7 +66,7 @@ def _postprocess_tags(self, tags, skip_unknown=True):
""" Clean tagged data from non-tags and unknown lemmas (optionally) """
clean_tags = []
for tag in tags:
if skip_unknown and tag.lemma == u'<unknown>':
if skip_unknown and (isinstance(tag, NotTag) or tag.lemma == u'<unknown>'):
logger.debug("Unknown lemma found: %s. Skipping ..." % repr(tag))
continue
clean_tags.append(tag)
@@ -78,16 +78,52 @@ def tokenize(self, text):
return self.tokenizer.tokenize(text)

def tag_one(self, text, skip_unknown=True, **kwargs):
""" POS-Tags the given text, optionally skipping unknown lemmas """
""" POS-Tags the given text, optionally skipping unknown lemmas
:param unicode text: Text to be tagged
:param bool skip_unknown: Automatically remove unrecognized tags from the result
Sample usage:
>>> from strephit.commons.pos_tag import TTPosTagger
>>> from pprint import pprint
>>> pprint(TTPosTagger('en').tag_one(u'sample sentence to be tagged fycgvkuhbj'))
[Tag(word=u'sample', pos=u'NN', lemma=u'sample'),
Tag(word=u'sentence', pos=u'NN', lemma=u'sentence'),
Tag(word=u'to', pos=u'TO', lemma=u'to'),
Tag(word=u'be', pos=u'VB', lemma=u'be'),
Tag(word=u'tagged', pos=u'VVN', lemma=u'tag')]
"""
return self._postprocess_tags(make_tags(self.tagger.tag_text(text, **kwargs)),
skip_unknown)

def tag_many(self, items, document_key, pos_tag_key, batch_size=10000, **kwargs):
""" POS-Tags many text documents of the given items. Use this for massive text tagging
:param items: Iterable of items to tag. Generator preferred
:param document_key: Where to find the text to tag inside each item
:param document_key: Where to find the text to tag inside each item. Text must be unicode
:param pos_tag_key: Where to put pos tagged text
Sample usage:
>>> from strephit.commons.pos_tag import TTPosTagger
>>> from pprint import pprint
>>> pprint(list(TTPosTagger('en').tag_many(
... [{'text': u'Item one is in first position'}, {'text': u'In the second position is item two'}],
... 'text', 'tagged'
... )))
[{'tagged': [Tag(word=u'Item', pos=u'NN', lemma=u'item'),
Tag(word=u'one', pos=u'CD', lemma=u'one'),
Tag(word=u'is', pos=u'VBZ', lemma=u'be'),
Tag(word=u'in', pos=u'IN', lemma=u'in'),
Tag(word=u'first', pos=u'JJ', lemma=u'first'),
Tag(word=u'position', pos=u'NN', lemma=u'position')],
'text': u'Item one is in first position'},
{'tagged': [Tag(word=u'In', pos=u'IN', lemma=u'in'),
Tag(word=u'the', pos=u'DT', lemma=u'the'),
Tag(word=u'second', pos=u'JJ', lemma=u'second'),
Tag(word=u'position', pos=u'NN', lemma=u'position'),
Tag(word=u'is', pos=u'VBZ', lemma=u'be'),
Tag(word=u'item', pos=u'RB', lemma=u'item'),
Tag(word=u'two', pos=u'CD', lemma=u'two')],
'text': u'In the second position is item two'}]
"""
tt_pool = TaggerProcessPoll(
TAGLANG=self.language,
4 changes: 2 additions & 2 deletions strephit/commons/resources/normalization_rules_en.yml
@@ -42,10 +42,10 @@ Time:
- (in|by) {early_late_mid}? {match_month}? {match_year}: >
make_date(**match.groupdict())
- (?P<day>\d{{1,2}}){cardinal_suffix}? (of)? {match_month} {match_year}: >
- (?P<day>\d{{1,2}}){cardinal_suffix}? (of)? {match_month},? {match_year}?: >
make_date(**match.groupdict())
- (?P<month>{month}) (?P<day>\d{{1,2}}){cardinal_suffix}?, {match_year}: >
- (?P<month>{month}) (?P<day>\d{{1,2}}){cardinal_suffix}?(,? {match_year})?: >
make_date(**match.groupdict())
- (c\.|on|about|the|of|circa|around|year) {match_year}: >
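These relaxed patterns make the year optional, which is what allows year-less dates such as 'April 18th' (see the normalize_many doctest in date_normalizer.py above) to be recognized; a sketch, with illustrative span offsets:

>>> from strephit.commons.date_normalizer import DateNormalizer
>>> DateNormalizer('en').normalize_one('born on April 18th')
((8, 18), 'Time', {'day': 18, 'month': 4})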
39 changes: 29 additions & 10 deletions strephit/corpus_analysis/rank_verbs.py
@@ -26,6 +26,8 @@ def get_similarity_scores(verb_token, vectorizer, tf_idf_matrix):
""" Compute the cosine similarity score of a given verb token against the input corpus TF/IDF matrix.
:param str verb_token: Surface form of a verb, e.g., *born*
:param sklearn.feature_extraction.text.TfidfVectorizer vectorizer: Vectorizer
used to transform verbs into vectors
:return: cosine similarity score
:rtype: ndarray
"""
@@ -41,8 +43,8 @@ def get_similarity_scores(verb_token, vectorizer, tf_idf_matrix):
def produce_lemma_tokens(pos_tagged_path, pos_tag_key, language):
""" Extracts a map from lemma to all its tokens
:param pos_tagged_path: path of the pos-tagged corpus
:param pos_tag_key: where the pos tag data is in each item
:param str pos_tagged_path: path of the pos-tagged corpus
:param str pos_tag_key: where the pos tag data is in each item
:param language: language of the corpus
:return: mapping from lemma to tokens
:rtype: dict
@@ -61,8 +63,8 @@ def produce_lemma_tokens(pos_tagged_path, pos_tag_key, language):
def compute_tf_idf_matrix(corpus_path, document_key):
""" Computes the TF-IDF matrix of the corpus
:param corpus_path: path of the corpus
:param document_key: where the textual content is in the corpus
:param str corpus_path: path of the corpus
:param str document_key: where the textual content is in the corpus
:return: a vectorizer and the computed matrix
:rtype: tuple
"""
@@ -72,13 +74,23 @@ def compute_tf_idf_matrix(corpus_path, document_key):


class TFIDFRanking:
""" Computes TF-IDF based rankings.
The first ranking is based on the average TF-IDF score of each lemma over the whole corpus.
The second ranking is based on the average standard deviation of the TF-IDF scores
of each lemma over the whole corpus
"""

def __init__(self, vectorizer, verbs, tfidf_matrix):
self.vectorizer = vectorizer
self.verbs = verbs
self.tfidf_matrix = tfidf_matrix

def score_lemma(self, lemma):
""" Computess TF-IDF based score of a single lemma
:param str lemma: The lemma to score
:return: tuple with lemma, average tf-idf, average of tf-idf standard deviations
:rtype: tuple of (str, float, float)
"""
tf_idfs, st_devs = [], []
for token in self.verbs[lemma]:
scores = get_similarity_scores(token, self.vectorizer, self.tfidf_matrix)
@@ -88,6 +100,11 @@ def score_lemma(self, lemma):
return lemma, average(tf_idfs), average(st_devs)

def find_ranking(self, processes=0):
""" Ranks the verbs
:param int processes: How many processes to use for parallel ranking
:return: tuple with average tf-idf and average standard deviation ordered rankings
:rtype: tuple of (OrderedDict, OrderedDict)
"""
tfidf_ranking = {}
stdev_ranking = {}
for lemma, tfidf, stdev in parallel.map(self.score_lemma, self.verbs, processes):
@@ -98,6 +115,9 @@ def find_ranking(self, processes=0):


class PopularityRanking:
""" Ranking based on the popularity of each verb. Simply counts the
frequency of each lemma over all corpus
"""

def __init__(self, corpus_path, pos_tag_key):
self.tags = self._flatten(item.get(pos_tag_key) for item in load_scraped_items(corpus_path))
@@ -181,8 +201,11 @@ def get(k):
@click.option('--dump-popularity', type=click.File('w'), default='dev/popularity_ranking.json')
@click.option('--dump-final', type=click.File('w'), default='dev/verb_ranking.json')
@click.option('--processes', '-p', default=0)
def main(pos_tagged, document_key, pos_tag_key, language, dump_verbs, dump_tf_idf, dump_stdev, dump_popularity,
dump_final, processes):
def main(pos_tagged, document_key, pos_tag_key, language, dump_verbs, dump_tf_idf,
dump_stdev, dump_popularity, dump_final, processes):
""" Computes the three verb rankings: average TF-IDF, average of TF-IDF
standard deviation and popularity.
"""

logger.info('Computing lemma to token map and TF-IDF matrix')
lemma_tokens, (vectorizer, tf_idf_matrix) = parallel.execute(
@@ -206,7 +229,3 @@ def main(pos_tagged, document_key, pos_tag_key, language, dump_verbs, dump_tf_id
json.dump(pop_ranking, dump_popularity, indent=2)
json.dump(lemma_tokens, dump_verbs, default=lambda x: list(x), indent=2)
json.dump(final_ranking, dump_final, indent=2)


if __name__ == '__main__':
exit(main())
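A minimal sketch of how the pieces documented above fit together, following the flow of main(); the corpus path and key names are placeholders, and PopularityRanking is assumed to expose the same find_ranking API as TFIDFRanking:

from strephit.corpus_analysis.rank_verbs import (produce_lemma_tokens,
                                                 compute_tf_idf_matrix,
                                                 TFIDFRanking, PopularityRanking)

corpus = 'dev/pos_tagged.jsonlines'  # placeholder path
lemma_tokens = produce_lemma_tokens(corpus, 'tagged', 'en')
vectorizer, tf_idf_matrix = compute_tf_idf_matrix(corpus, 'text')

# average TF-IDF and average standard deviation rankings (OrderedDicts)
tfidf_ranking, stdev_ranking = TFIDFRanking(vectorizer, lemma_tokens,
                                            tf_idf_matrix).find_ranking(processes=2)

# frequency-based ranking; method name assumed to mirror TFIDFRanking
popularity_ranking = PopularityRanking(corpus, 'tagged').find_ranking(processes=2)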
17 changes: 9 additions & 8 deletions strephit/extraction/balanced_extract.py
@@ -12,13 +12,14 @@

def lu_count(sentences, processes=0, input_encoded=False):
""" Count how many sentences per LU there are for each source
:param sentences: Corpus with the POS-tagged sentences
:param processes: how many processes to use for parallel execution
:param input_encoded: whether the corpus is an iterable of dictionaries
:param iterable sentences: Corpus with the POS-tagged sentences
:param int processes: how many processes to use for parallel execution
:param bool input_encoded: whether the corpus is an iterable of dictionaries
or an iterable of JSON-encoded documents. JSON-encoded
documents are preferable over large size dictionaries for performance reasons
:return: A dictionary source -> frequencies, where frequencies is
another dictionary lemma -> count
:rtype: dict
"""

def worker(batch):
@@ -45,15 +46,15 @@ def worker(batch):

def extract_sentences(sentences, probabilities, processes=0, input_encoded=False, output_encoded=False):
""" Extracts some sentences from the corpus following the given probabilities
:param sentences: Extracted sentences
:param probabilities: Conditional probabilities of extracting a sentence containing
:param iterable sentences: Extracted sentences
:param dict probabilities: Conditional probabilities of extracting a sentence containing
a specific LU given the source of the sentence. It is therefore a mapping
source -> probabilities, where probabilities is itself a mapping LU -> probability
:param processes: how many processes to use for parallel execution
:param input_encoded: whether the corpus is an iterable of dictionaries or an
:param int processes: how many processes to use for parallel execution
:param bool input_encoded: whether the corpus is an iterable of dictionaries or an
iterable of JSON-encoded documents. JSON-encoded documents are preferable
over large size dictionaries for performance reasons
:param output_encoded: whether to return a generator of dictionaries or a generator
:param bool output_encoded: whether to return a generator of dictionaries or a generator
of JSON-encoded documents. Prefer encoded output for performance reasons
:return: Generator of sentences
"""
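A hypothetical example of the probabilities mapping expected by extract_sentences: source -> (LU lemma -> probability of keeping a sentence containing that LU); all source names, lemmas and values below are made up:

from strephit.extraction.balanced_extract import extract_sentences

probabilities = {
    'en.wikisource.org': {'bear': 0.9, 'die': 0.35},
    'www.genealogics.org': {'marry': 0.6, 'work': 0.1},
}

sentences = []  # placeholder: iterable of POS-tagged sentence dicts
sampled = list(extract_sentences(sentences, probabilities, processes=2))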
14 changes: 9 additions & 5 deletions strephit/extraction/extract_sentences.py
@@ -27,11 +27,11 @@ class SentenceExtractor:
def __init__(self, corpus, document_key, sentences_key, language, lemma_to_token, match_base_form):
""" Initializes the extractor.
:param corpus: The corpus, iterable of `dict`s. Generator preferred
:param document_key: The key from which to retrieve the textual document
:param sentences_key: The key to which the extracted sentences should be stored
:param language: The language the text is in
:param lemma_to_token: Mapping from lemma to list of tokens
:param iterable corpus: The corpus, iterable of `dict`s
:param str document_key: The key from which to retrieve the textual document
:param str sentences_key: The key to which the extracted sentences should be stored
:param str language: The language the text is in
:param dict lemma_to_token: Mapping from lemma to list of tokens
"""
self.corpus = corpus
self.sentences_key = sentences_key
@@ -65,6 +65,10 @@ def teardown_extractor(self):
def extract(self, processes=0):
""" Processes the corpus extracting sentences from each item
and storing them in the item itself.
:param int processes: how many processes to use for parallel tagging
:return: the extracted sentences
:rtype: generator of dicts
"""
self.setup_extractor()

1 change: 0 additions & 1 deletion strephit/extraction/process_semistructured.py
@@ -92,7 +92,6 @@ def serialize_item((i, item, language, sourced_only)):

def resolve_genealogics_family(input_file, url_to_id):
""" Performs a second pass on genealogics to resolve additional family members
"""
family_properties = {
'Family': 'P1038',
