#34 improved documentation
e-dorigatti committed Jun 1, 2016
1 parent 327f337 commit 2667b58
Showing 9 changed files with 170 additions and 32 deletions.
52 changes: 51 additions & 1 deletion strephit/commons/cache.py
@@ -22,6 +22,23 @@ def _path_for(hashed_key):


def get(key, default=None):
""" Retrieves an item from the cache
:param key: Key of the item
:param default: Default value to return if the
key is not in the cache
:return: The item associated with the given key or
the default value
Sample usage:
>>> from strephit.commons import cache
>>> cache.get('kk', 13)
13
>>> cache.get('kk', 0)
0
>>> cache.set('kk', 15)
>>> cache.get('kk', 0)
15
"""
if not ENABLED:
return default

@@ -39,6 +56,24 @@ def get(key, default=None):


def set(key, value, overwrite=True):
""" Stores an item in the cache under the given key
:param key: Unique key used to identify the item.
:param value: Value to store in the cache. Must be
JSON-dumpable
:param overwrite: Whether to overwrite the previous
value associated with the key (if any)
:return: Nothing
Sample usage:
>>> from strephit.commons import cache
>>> cache.get('kk', 13)
13
>>> cache.get('kk', 0)
0
>>> cache.set('kk', 15)
>>> cache.get('kk', 0)
15
"""
if not ENABLED:
return

@@ -63,7 +98,22 @@ def set(key, value, overwrite=True):


def cached(function):
""" Decorator to cache function results based on its arguments """
""" Decorator to cache function results based on its arguments
Sample usage:
>>> from strephit.commons import cache
>>> @cache.cached
... def f(x):
... print 'inside f'
... return 2 * x
...
>>> f(10)
inside f
20
>>> f(10)
20
"""
def wrapper(*args, **kwargs):
key = str([function.__module__]) + function.__name__ + str(args) + str(kwargs)
res = get(key)
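For reference, a minimal sketch of the cache key the wrapper above would build for the doctest call f(10), assuming f was defined interactively (so its module is '__main__'); judging from _path_for(hashed_key) above, this string is then hashed and mapped to a file on disk:

>>> function_module, function_name, args, kwargs = '__main__', 'f', (10,), {}
>>> str([function_module]) + function_name + str(args) + str(kwargs)
"['__main__']f(10,){}"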
16 changes: 15 additions & 1 deletion strephit/commons/date_normalizer.py
@@ -77,7 +77,7 @@ def _meta_init(self, specs):
self.globals = self.meta_funcs
self.globals.update(self.meta_vars)

def normalize_one(self, expression, conflict='first'):
def normalize_one(self, expression, conflict='longest'):
""" Find the matching part in the given expression
:param str expression: The expression in which to search for the match
@@ -91,6 +91,11 @@ def normalize_one(self, expression, conflict='first'):
Allowed values are `first`, `longest` and `shortest`
:return: Tuple with (start, end), category, result
:rtype: tuple
Sample usage:
>>> from strephit.commons.date_normalizer import DateNormalizer
>>> DateNormalizer('en').normalize_one('Today is the 1st of June, 2016')
((13, 30), 'Time', {'month': 6, 'day': 1, 'year': 2016})
"""

best_match = None
@@ -118,6 +123,15 @@ def normalize_many(self, expression):
:param str expression: The expression in which to look for matches
:return: Generator of tuples (start, end), category, result
Sample usage:
>>> from pprint import pprint
>>> from strephit.commons.date_normalizer import DateNormalizer
>>> pprint(list(DateNormalizer('en').normalize_many('I was born on April 18th, '
... 'and today is April 18th, 2016!')))
[((14, 24), 'Time', {'day': 18, 'month': 4}),
((39, 55), 'Time', {'day': 18, 'month': 4, 'year': 2016})]
"""

# start matching from here, and move forward as new matches
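Changing the default conflict policy from first to longest is what lets the doctest above return the whole date span; a minimal sketch of requesting a different policy explicitly (the exact match returned depends on the rule set, so no output is shown):

from strephit.commons.date_normalizer import DateNormalizer

normalizer = DateNormalizer('en')
# prefer the first matching rule instead of the longest match
span, category, result = normalizer.normalize_one('Today is the 1st of June, 2016',
                                                  conflict='first')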
17 changes: 16 additions & 1 deletion strephit/commons/parallel.py
@@ -100,6 +100,12 @@ def map(function, iterable, processes=0, flatten=False, raise_exc=True, batch_si
:param batch_size: If larger than 0, the input iterable will be grouped in groups
of this size and the resulting list passed as argument to the worker.
:returns: iterable with the results. Order is not guaranteed to be preserved
Sample usage:
>>> from strephit.commons import parallel
>>> list(parallel.map(lambda x: 2*x, range(10)))
[0, 8, 10, 12, 14, 16, 18, 2, 4, 6]
"""
if processes == 1:
for task in make_batches(iterable, batch_size):
@@ -131,8 +137,17 @@ def execute(processes=0, *specs):
:param processes: Number of functions to execute at the same time
:param specs: a sequence of functions, each followed by its arguments (arguments as a tuple or list)
:return: the results that the functions returned
:return: the results that the functions returned, in the same order as they were specified
:rtype: list
Sample usage:
>>> from strephit.commons import parallel
>>> list(parallel.execute(4,
... lambda x, y: x + y, (5, -5),
... lambda *x: sum(x), range(5)
... ))
[0, 10]
"""
functions, arguments = specs[::2], specs[1::2]
res = list(map(lambda (i, args): (i, functions[i](*args)),
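A minimal sketch of the batch_size option documented above, assuming that with batch_size > 0 each worker call receives a whole batch (a list of items) and that flatten=True concatenates the per-batch results; as in the doctest, output order is not guaranteed:

from strephit.commons import parallel

def double_batch(batch):
    # with batch_size > 0 the worker gets a list of items, not a single item
    return [2 * x for x in batch]

results = list(parallel.map(double_batch, range(10), batch_size=4, flatten=True))
print(sorted(results))  # [0, 2, 4, 6, 8, 10, 12, 14, 16, 18] once sorted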
42 changes: 39 additions & 3 deletions strephit/commons/pos_tag.py
@@ -66,7 +66,7 @@ def _postprocess_tags(self, tags, skip_unknown=True):
""" Clean tagged data from non-tags and unknown lemmas (optionally) """
clean_tags = []
for tag in tags:
if skip_unknown and tag.lemma == u'<unknown>':
if skip_unknown and (isinstance(tag, NotTag) or tag.lemma == u'<unknown>'):
logger.debug("Unknown lemma found: %s. Skipping ..." % repr(tag))
continue
clean_tags.append(tag)
@@ -78,16 +78,52 @@ def tokenize(self, text):
return self.tokenizer.tokenize(text)

def tag_one(self, text, skip_unknown=True, **kwargs):
""" POS-Tags the given text, optionally skipping unknown lemmas """
""" POS-Tags the given text, optionally skipping unknown lemmas
:param unicode text: Text to be tagged
:param bool skip_unknown: Automatically remove unrecognized tags from the result
Sample usage:
>>> from strephit.commons.pos_tag import TTPosTagger
>>> from pprint import pprint
>>> pprint(TTPosTagger('en').tag_one(u'sample sentence to be tagged fycgvkuhbj'))
[Tag(word=u'sample', pos=u'NN', lemma=u'sample'),
Tag(word=u'sentence', pos=u'NN', lemma=u'sentence'),
Tag(word=u'to', pos=u'TO', lemma=u'to'),
Tag(word=u'be', pos=u'VB', lemma=u'be'),
Tag(word=u'tagged', pos=u'VVN', lemma=u'tag')]
"""
return self._postprocess_tags(make_tags(self.tagger.tag_text(text, **kwargs)),
skip_unknown)

def tag_many(self, items, document_key, pos_tag_key, batch_size=10000, **kwargs):
""" POS-Tags many text documents of the given items. Use this for massive text tagging
:param items: Iterable of items to tag. Generator preferred
:param document_key: Where to find the text to tag inside each item
:param document_key: Where to find the text to tag inside each item. Text must be unicode
:param pos_tag_key: Where to put pos tagged text
Sample usage:
>>> from strephit.commons.pos_tag import TTPosTagger
>>> from pprint import pprint
>>> pprint(list(TTPosTagger('en').tag_many(
... [{'text': u'Item one is in first position'}, {'text': u'In the second position is item two'}],
... 'text', 'tagged'
... )))
[{'tagged': [Tag(word=u'Item', pos=u'NN', lemma=u'item'),
Tag(word=u'one', pos=u'CD', lemma=u'one'),
Tag(word=u'is', pos=u'VBZ', lemma=u'be'),
Tag(word=u'in', pos=u'IN', lemma=u'in'),
Tag(word=u'first', pos=u'JJ', lemma=u'first'),
Tag(word=u'position', pos=u'NN', lemma=u'position')],
'text': u'Item one is in first position'},
{'tagged': [Tag(word=u'In', pos=u'IN', lemma=u'in'),
Tag(word=u'the', pos=u'DT', lemma=u'the'),
Tag(word=u'second', pos=u'JJ', lemma=u'second'),
Tag(word=u'position', pos=u'NN', lemma=u'position'),
Tag(word=u'is', pos=u'VBZ', lemma=u'be'),
Tag(word=u'item', pos=u'RB', lemma=u'item'),
Tag(word=u'two', pos=u'CD', lemma=u'two')],
'text': u'In the second position is item two'}]
"""
tt_pool = TaggerProcessPoll(
TAGLANG=self.language,
4 changes: 2 additions & 2 deletions strephit/commons/resources/normalization_rules_en.yml
@@ -42,10 +42,10 @@ Time:
- (in|by) {early_late_mid}? {match_month}? {match_year}: >
make_date(**match.groupdict())
- (?P<day>\d{{1,2}}){cardinal_suffix}? (of)? {match_month} {match_year}: >
- (?P<day>\d{{1,2}}){cardinal_suffix}? (of)? {match_month},? {match_year}?: >
make_date(**match.groupdict())
- (?P<month>{month}) (?P<day>\d{{1,2}}){cardinal_suffix}?, {match_year}: >
- (?P<month>{month}) (?P<day>\d{{1,2}}){cardinal_suffix}?(,? {match_year})?: >
make_date(**match.groupdict())
- (c\.|on|about|the|of|circa|around|year) {match_year}: >
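These relaxed patterns make the year optional, which is what allows year-less dates such as 'April 18th' (see the normalize_many doctest in date_normalizer.py above) to be recognized; a sketch, with illustrative span offsets:

>>> from strephit.commons.date_normalizer import DateNormalizer
>>> DateNormalizer('en').normalize_one('born on April 18th')
((8, 18), 'Time', {'day': 18, 'month': 4})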
39 changes: 29 additions & 10 deletions strephit/corpus_analysis/rank_verbs.py
@@ -26,6 +26,8 @@ def get_similarity_scores(verb_token, vectorizer, tf_idf_matrix):
""" Compute the cosine similarity score of a given verb token against the input corpus TF/IDF matrix.
:param str verb_token: Surface form of a verb, e.g., *born*
:param sklearn.feature_extraction.text.TfidfVectorizer vectorizer: Vectorizer
used to transform verbs into vectors
:return: cosine similarity score
:rtype: ndarray
"""
@@ -41,8 +43,8 @@ def get_similarity_scores(verb_token, vectorizer, tf_idf_matrix):
def produce_lemma_tokens(pos_tagged_path, pos_tag_key, language):
""" Extracts a map from lemma to all its tokens
:param pos_tagged_path: path of the pos-tagged corpus
:param pos_tag_key: where the pos tag data is in each item
:param str pos_tagged_path: path of the pos-tagged corpus
:param str pos_tag_key: where the pos tag data is in each item
:param language: language of the corpus
:return: mapping from lemma to tokens
:rtype: dict
@@ -61,8 +63,8 @@ def produce_lemma_tokens(pos_tagged_path, pos_tag_key, language):
def compute_tf_idf_matrix(corpus_path, document_key):
""" Computes the TF-IDF matrix of the corpus
:param corpus_path: path of the corpus
:param document_key: where the textual content is in the corpus
:param str corpus_path: path of the corpus
:param str document_key: where the textual content is in the corpus
:return: a vectorizer and the computed matrix
:rtype: tuple
"""
@@ -72,13 +74,23 @@ def compute_tf_idf_matrix(corpus_path, document_key):


class TFIDFRanking:
""" Computes TF-IDF based rankings.
The first ranking is based on the average TF-IDF score of each lemma over the whole corpus.
The second ranking is based on the average standard deviation of the TF-IDF scores
of each lemma over the whole corpus
"""

def __init__(self, vectorizer, verbs, tfidf_matrix):
self.vectorizer = vectorizer
self.verbs = verbs
self.tfidf_matrix = tfidf_matrix

def score_lemma(self, lemma):
""" Computess TF-IDF based score of a single lemma
:param str lemma: The lemma to score
:return: tuple with lemma, average tf-idf, average of tf-idf standard deviations
:rtype: tuple of (str, float, float)
"""
tf_idfs, st_devs = [], []
for token in self.verbs[lemma]:
scores = get_similarity_scores(token, self.vectorizer, self.tfidf_matrix)
@@ -88,6 +100,11 @@ def score_lemma(self, lemma):
return lemma, average(tf_idfs), average(st_devs)

def find_ranking(self, processes=0):
""" Ranks the verbs
:param int processes: How many processes to use for parallel ranking
:return: tuple with average tf-idf and average standard deviation ordered rankings
:rtype: tuple of (OrderedDict, OrderedDict)
"""
tfidf_ranking = {}
stdev_ranking = {}
for lemma, tfidf, stdev in parallel.map(self.score_lemma, self.verbs, processes):
@@ -98,6 +115,9 @@ def find_ranking(self, processes=0):


class PopularityRanking:
""" Ranking based on the popularity of each verb. Simply counts the
frequency of each lemma over all corpus
"""

def __init__(self, corpus_path, pos_tag_key):
self.tags = self._flatten(item.get(pos_tag_key) for item in load_scraped_items(corpus_path))
@@ -181,8 +201,11 @@ def get(k):
@click.option('--dump-popularity', type=click.File('w'), default='dev/popularity_ranking.json')
@click.option('--dump-final', type=click.File('w'), default='dev/verb_ranking.json')
@click.option('--processes', '-p', default=0)
def main(pos_tagged, document_key, pos_tag_key, language, dump_verbs, dump_tf_idf, dump_stdev, dump_popularity,
dump_final, processes):
def main(pos_tagged, document_key, pos_tag_key, language, dump_verbs, dump_tf_idf,
dump_stdev, dump_popularity, dump_final, processes):
""" Computes the three verb rankings: average TF-IDF, average of TF-IDF
standard deviation and popularity.
"""

logger.info('Computing lemma to token map and TF-IDF matrix')
lemma_tokens, (vectorizer, tf_idf_matrix) = parallel.execute(
@@ -206,7 +229,3 @@ def main(pos_tagged, document_key, pos_tag_key, language, dump_verbs, dump_tf_id
json.dump(pop_ranking, dump_popularity, indent=2)
json.dump(lemma_tokens, dump_verbs, default=lambda x: list(x), indent=2)
json.dump(final_ranking, dump_final, indent=2)


if __name__ == '__main__':
exit(main())
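A minimal sketch of how the pieces documented above fit together, following the flow of main(); the corpus path and key names are placeholders, and PopularityRanking is assumed to expose the same find_ranking API as TFIDFRanking:

from strephit.corpus_analysis.rank_verbs import (produce_lemma_tokens,
                                                 compute_tf_idf_matrix,
                                                 TFIDFRanking, PopularityRanking)

corpus = 'dev/pos_tagged.jsonlines'  # placeholder path
lemma_tokens = produce_lemma_tokens(corpus, 'tagged', 'en')
vectorizer, tf_idf_matrix = compute_tf_idf_matrix(corpus, 'text')

# average TF-IDF and average standard deviation rankings (OrderedDicts)
tfidf_ranking, stdev_ranking = TFIDFRanking(vectorizer, lemma_tokens,
                                            tf_idf_matrix).find_ranking(processes=2)

# frequency-based ranking; method name assumed to mirror TFIDFRanking
popularity_ranking = PopularityRanking(corpus, 'tagged').find_ranking(processes=2)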
17 changes: 9 additions & 8 deletions strephit/extraction/balanced_extract.py
@@ -12,13 +12,14 @@

def lu_count(sentences, processes=0, input_encoded=False):
""" Count how many sentences per LU there are for each source
:param sentences: Corpus with the POS-tagged sentences
:param processes: how many processes to use for parallel execution
:param input_encoded: whether the corpus is an iterable of dictionaries
:param iterable sentences: Corpus with the POS-tagged sentences
:param int processes: how many processes to use for parallel execution
:param bool input_encoded: whether the corpus is an iterable of dictionaries
or an iterable of JSON-encoded documents. JSON-encoded
documents are preferable over large size dictionaries for performance reasons
:return: A dictionary source -> frequencies, where frequencies is
another dictionary lemma -> count
:rtype: dict
"""

def worker(batch):
@@ -45,15 +46,15 @@ def worker(batch):

def extract_sentences(sentences, probabilities, processes=0, input_encoded=False, output_encoded=False):
""" Extracts some sentences from the corpus following the given probabilities
:param sentences: Extracted sentences
:param probabilities: Conditional probabilities of extracting a sentence containing
:param iterable sentences: Extracted sentences
:param dict probabilities: Conditional probabilities of extracting a sentence containing
a specific LU given the source of the sentence. It is therefore a mapping
source -> probabilities, where probabilities is itself a mapping LU -> probability
:param processes: how many processes to use for parallel execution
:param input_encoded: whether the corpus is an iterable of dictionaries or an
:param int processes: how many processes to use for parallel execution
:param bool input_encoded: whether the corpus is an iterable of dictionaries or an
iterable of JSON-encoded documents. JSON-encoded documents are preferable
over large size dictionaries for performance reasons
:param output_encoded: whether to return a generator of dictionaries or a generator
:param bool output_encoded: whether to return a generator of dictionaries or a generator
of JSON-encoded documents. Prefer encoded output for performance reasons
:return: Generator of sentences
"""
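A hypothetical example of the probabilities mapping expected by extract_sentences: source -> (LU lemma -> probability of keeping a sentence containing that LU); all source names, lemmas and values below are made up:

from strephit.extraction.balanced_extract import extract_sentences

probabilities = {
    'en.wikisource.org': {'bear': 0.9, 'die': 0.35},
    'www.genealogics.org': {'marry': 0.6, 'work': 0.1},
}

sentences = []  # placeholder: iterable of POS-tagged sentence dicts
sampled = list(extract_sentences(sentences, probabilities, processes=2))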
14 changes: 9 additions & 5 deletions strephit/extraction/extract_sentences.py
@@ -27,11 +27,11 @@ class SentenceExtractor:
def __init__(self, corpus, document_key, sentences_key, language, lemma_to_token, match_base_form):
""" Initializes the extractor.
:param corpus: The corpus, iterable of `dict`s. Generator preferred
:param document_key: The key from which to retrieve the textual document
:param sentences_key: The key to which the extracted sentences should be stored
:param language: The language the text is in
:param lemma_to_token: Mapping from lemma to list of tokens
:param iterable corpus: The corpus, iterable of `dict`s
:param str document_key: The key from which to retrieve the textual document
:param str sentences_key: The key to which the extracted sentences should be stored
:param str language: The language the text is in
:param dict lemma_to_token: Mapping from lemma to list of tokens
"""
self.corpus = corpus
self.sentences_key = sentences_key
@@ -65,6 +65,10 @@ def teardown_extractor(self):
def extract(self, processes=0):
""" Processes the corpus extracting sentences from each item
and storing them in the item itself.
:param int processes: how many processes to use for parallel tagging
:return: the extracted sentences
:rtype: generator of dicts
"""
self.setup_extractor()

1 change: 0 additions & 1 deletion strephit/extraction/process_semistructured.py
@@ -92,7 +92,6 @@ def serialize_item((i, item, language, sourced_only)):

def resolve_genealogics_family(input_file, url_to_id):
""" Performs a second pass on genealogics to resolve additional family members
"""
family_properties = {
'Family': 'P1038',
