
Commit

Merge pull request #13 from MohammadForouhesh/integration-deployment-pipeline

Integration deployment pipeline
MohammadForouhesh authored Mar 12, 2022
2 parents 1103e2b + 8a09285 commit ca84685
Showing 13 changed files with 264 additions and 62 deletions.
Binary file modified .coverage
Binary file not shown.
34 changes: 34 additions & 0 deletions .github/workflows/deploy.yml
@@ -0,0 +1,34 @@
name: Publish to PyPI

on:
release:
types: [published]
jobs:
build-and-publish-to-pypi:
name: Build and publish to PyPI
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.7
uses: actions/setup-python@v1
with:
python-version: 3.7
- name: Install pypa/build
run: >-
python -m
pip install
build
--user
- name: Build a binary wheel and a source tarball
run: >-
python -m
build
--sdist
--wheel
--outdir dist/
.
- name: Publish to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
user: __token__
password: ${{ secrets.TEST_API_TOKEN }}
37 changes: 37 additions & 0 deletions .github/workflows/deploy_test.yml
@@ -0,0 +1,37 @@
name: PyPI Unit test

on:
push:
branches:
- main
schedule:
- cron: '0 0 * * *' # Once per day
pull_request:
branches:
- main
paths-ignore:
- '**.md'
jobs:
build:

runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.7]

steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v1
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install deepcut
pip install crf_pos[full]
pip install pytest coverage
- name: Test
run: |
pytest -v
coverage run main.py
coverage report -m
38 changes: 38 additions & 0 deletions .github/workflows/lint.yml
@@ -0,0 +1,38 @@
name: Lint

on:
push:
branches:
- main
paths-ignore:
- '**.md'
pull_request:
branches:
- main
paths-ignore:
- '**.md'

jobs:
build:
runs-on: ubuntu-20.04
strategy:
matrix:
python-version: [3.7]

steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v1
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
pip install flake8 flake8-commas flake8-comprehensions flake8-tidy-imports
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
1 change: 1 addition & 0 deletions .gitignore
@@ -6,3 +6,4 @@
*.pyc
model/UPC_full_model_wapiti
*.txt
resources/UPC_full_model_wapiti
33 changes: 21 additions & 12 deletions README.md
@@ -15,7 +15,9 @@ This repository contains Persian Part of Speech tagger based on Conditional Rand
# Table of Contents
1. [TO-DO](#todo)
2. [Installation](#install)
1. [on CoLab](#colab)
1. [Using Pip](#pip)
2. [From Source](#source)
3. [On CoLab](#colab)
3. [Usage](#usage)
4. [Evaluation](#eval)

@@ -42,16 +44,22 @@ This repository contains Persian Part of Speech tagger based on Conditional Rand
- [x] Scrutinize Coverage [issue#8](https://github.com/MohammadForouhesh/crf-pos-persian/issues/8#issue-1162353982)
- [x] Documentation [pull#9](https://github.com/MohammadForouhesh/crf-pos-persian/pull/9#issuecomment-1061754671)
- [x] Improve Coverage [pull#9](https://github.com/MohammadForouhesh/crf-pos-persian/pull/9#issuecomment-1061754671)
- [ ] Smooth Installation
- [x] Smooth Installation [issue#12]() [pull#13]()
- [x] Excel code quality [pull#11](https://github.com/MohammadForouhesh/crf-pos-persian/pull/11)

- [ ] Adding documentation and flowchart of the code.
## Installation: <a name="install"></a>
### Using Pip <a name="pip"></a>
```shell
pip install crf_pos
```

### From Source <a name="source"></a>
```shell
$ git clone https://github.com/MohammadForouhesh/crf-pos-persian
$ cd crf-pos-persian
$ python setup.py install
```
### on CoLab <a name="colab"></a>
### On CoLab <a name="colab"></a>
```shell
! pip install git+https://github.com/MohammadForouhesh/crf-pos-persian.git
```
@@ -61,18 +69,19 @@ $ python setup.py install
```python
from crf_pos.pos_tagger.wapiti import WapitiPosTagger
pos_tagger = WapitiPosTagger()
tokens = text = 'او رئیس‌جمهور حجتالاسلاموالمسلمین ابرهیم رئیسی رئیس جمهور می باشد'.split()
tokens = text = 'او رئیس‌جمهور حجتالاسلاموالمسلمین ابرهیم رئیسی رئیس جمهور ایران اسلامی می باشد'.split()
pos_tagger[tokens]
[1]:
[('ابراهیم', 'N'),
('رپیسی', 'N'),
('ریپس', 'ADJ'),
('جمهور', 'N'),
('جمهوری', 'N'),
('اسلامی', 'ADJ'),
[('او', 'PRO'),
('رئیس\u200cجمهور', 'N'),
('حجت\u200cالاسلام\u200cوالمسلمین', 'N'),
('ابرهیم', 'N'),
('رئیسی', 'N'),
('رئیس\u200cجمهور', 'N'),
('ایران', 'N'),
('میباشد', 'V')]
('اسلامی', 'ADJ'),
('می\u200cباشد', 'V')]
```
## Evaluation <a name="eval"></a>
|Part-of-Speech| precision| recall| f1-score| support|
21 changes: 18 additions & 3 deletions crf_pos/api.py
@@ -11,16 +11,18 @@
supported http links are:
- https://github.com/MohammadForouhesh/crf-pos-persian/releases/download/v2.0.0.alpha/UPC_full_model_wapiti
- https://github.com/MohammadForouhesh/crf-pos-persian/releases/download/v2.0.0.alpha/perpos.model
- https://raw.githubusercontent.com/MohammadForouhesh/Parsivar/master/parsivar/resource/normalizer/model/normalizer/Dic1_new.txt
- https://raw.githubusercontent.com/MohammadForouhesh/Parsivar/master/parsivar/resource/normalizer/model/normalizer/Dic2_new.txt
- https://raw.githubusercontent.com/MohammadForouhesh/Parsivar/master/parsivar/resource/normalizer/model/normalizer/Dic3_new.txt
- https://github.com/MohammadForouhesh/crf-pos-persian/releases/download/v2.0.0.alpha/corrections.txt
"""

import os
from typing import Union

import requests

http_dict = {'UPC_full_model_wapiti': 'https://github.com/MohammadForouhesh/crf-pos-persian/releases/download/v2.0.0.alpha/UPC_full_model_wapiti',
'perpos.model': 'https://github.com/MohammadForouhesh/crf-pos-persian/releases/download/v2.0.0.alpha/perpos.model',
'corrections.txt': 'https://github.com/MohammadForouhesh/crf-pos-persian/releases/download/v2.0.0.alpha/corrections.txt'}


def downloader(path: str, save_path: str, mode: str) -> Union[int, None]:
"""
@@ -44,3 +46,16 @@ def downloader(path: str, save_path: str, mode: str) -> Union[int, None]:
resource.write(model_bin.content)
except Exception:
raise Exception('not a proper webpage')


def get_resources(dir_path: str, resource_name: str) -> str:
"""
A tool to download a required resource over the internet.
:param dir_path: Local directory under which the `resources/` folder is created.
:param resource_name: Resource name.
:return: Path to the downloaded resource.
"""
save_dir = dir_path + '/resources/'
os.makedirs(save_dir, exist_ok=True)
downloader(path=http_dict[resource_name], save_path=save_dir + resource_name, mode='wb')
return str(save_dir + resource_name)
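The save layout that `get_resources` builds can be sketched without touching the network; `save_resource` below is a hypothetical stand-in for the `downloader` call, writing bytes it is handed instead of a downloaded release asset:

```python
import os
import tempfile

def save_resource(dir_path: str, resource_name: str, content: bytes) -> str:
    # Same layout as get_resources: files land under <dir_path>/resources/.
    save_dir = os.path.join(dir_path, 'resources')
    os.makedirs(save_dir, exist_ok=True)   # idempotent, like the original
    target = os.path.join(save_dir, resource_name)
    with open(target, 'wb') as resource:   # mode='wb', as in downloader()
        resource.write(content)
    return target

with tempfile.TemporaryDirectory() as tmp:
    path = save_resource(tmp, 'corrections.txt', b'stub contents')
    assert os.path.exists(path)
```

Because `os.makedirs(..., exist_ok=True)` is idempotent, repeated calls simply overwrite the cached file rather than fail.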
53 changes: 35 additions & 18 deletions crf_pos/normalization/normalizer.py
@@ -11,36 +11,25 @@
helps with detecting
half-spaces.
"""
import itertools
from re import sub

import os
from re import sub
from typing import Dict, List, Generator

from crf_pos.api import downloader
from crf_pos.api import get_resources
from crf_pos.normalization.tokenizer import clean_text


class Normalizer:
"""
A native Persian text normalizer that helps detect half-spaces.
"""
def __init__(self, downloading: bool = False) -> None:
self.dir_path = os.path.dirname(
def __init__(self, downloading: bool = True) -> None:
dir_path = os.path.dirname(
os.path.dirname(
os.path.dirname(
os.path.realpath(__file__)))) + "/"
if downloading: self.get_resources()
self.corrections = self.load_dictionary(self.dir_path + 'model/normalizer/corrections.txt')

def get_resources(self) -> None:
"""
A tool to download required resources over internet.
:return: None.
"""
load_dir = 'https://github.com/MohammadForouhesh/crf-pos-persian/releases/download/v2.0.0.alpha/corrections.txt'
save_dir = self.dir_path + '/model/normalizer/'
os.makedirs(save_dir, exist_ok=True)
downloader(path=load_dir, save_path=save_dir + 'corrections.txt', mode='wb')
if downloading: get_resources(dir_path, resource_name='corrections.txt')
self.corrections = self.load_dictionary(dir_path + 'resources/corrections.txt')

@staticmethod
def load_dictionary(file_path: str) -> Dict[str, str]:
@@ -76,12 +65,29 @@ def space_correction(text: str) -> str:

@staticmethod
def window_sampling(tokens: List[str], window_length: int) -> Generator[str, None, None]:
"""
Sample a sentence by moving a window of length `window_length` over it. e.g.
>>> list(Normalizer.window_sampling(tokens=['Hi', 'Hello', 'Hallo'], window_length=2))
['Hi Hello', 'Hello Hallo']
:param tokens: A list of tokens, i.e. words.
:param window_length: An integer, the length of the sampling window.
:return: A generator of space-joined token windows.
"""

if len(tokens) < window_length: yield ' '.join(tokens)
while True:
try: yield ' '.join([tokens.pop(0)] + [tokens[_] for _ in range(window_length - 1)])
except IndexError: break
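The pop-until-IndexError loop above can be sketched as a plain sliding window; this is a simplified, non-destructive reimplementation (the original consumes `tokens` in place) with the same outputs:

```python
from typing import Iterator, List

def window_sampling(tokens: List[str], window_length: int) -> Iterator[str]:
    """Yield every contiguous run of `window_length` tokens, joined by spaces."""
    if len(tokens) < window_length:
        # Input shorter than the window: fall back to the whole sentence.
        yield ' '.join(tokens)
        return
    for start in range(len(tokens) - window_length + 1):
        yield ' '.join(tokens[start:start + window_length])

# Matches the docstring example above.
assert list(window_sampling(['Hi', 'Hello', 'Hallo'], 2)) == ['Hi Hello', 'Hello Hallo']
```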

def vector_mavericks(self, text: str, window_length: int) -> Generator[str, None, None]:
"""
A generative recursive function that substitutes a concatenated string of length `window_length` with its
half-space correction.
:param text: The input text (str).
:param window_length: The order (scope) of the correction; considers n-grams when correcting.
:return: A generator of tokens that are half-space corrected up to order `window_length`.
"""
iter_sample = iter(self.window_sampling(text.replace('\u200c', ' ').split(), window_length))
for word in iter_sample:
try:
@@ -92,10 +98,21 @@ def vector_mavericks(self, text: str, window_length: int) -> Generator[str, None
except: yield word.split()[0]

def moving_mavericks(self, text: str, scope: int = 4) -> Generator[str, None, None]:
"""
Cascading the generation of half-space correction for a variety of different scopes (n-grams).
:param text: An input text.
:param scope: The maximum n-gram length to consider.
:return: A generator of generators.
"""
yield self.vector_mavericks(text, scope)
if scope > 1: yield from self.moving_mavericks(text, scope - 1)

def collapse_mavericks(self, text: str) -> str:
"""
Choose the best output among all of the corrections.
:param text: Input text (str).
:return: The half-space corrected text (str).
"""
mavericks_cascades = list(map(lambda item: ' '.join(item), self.moving_mavericks(text)))
return sorted(mavericks_cascades, key=lambda item: item.count('\u200c'))[-1]
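The selection rule used here — keep the candidate containing the most half-space (ZWNJ, U+200C) characters — can be illustrated in isolation:

```python
# Two candidate renderings of the same phrase; the second uses a half-space (ZWNJ).
candidates = ['می باشد', 'می\u200cباشد']

# collapse_mavericks sorts by ZWNJ count and takes the last (largest) element.
best = sorted(candidates, key=lambda item: item.count('\u200c'))[-1]
assert best == 'می\u200cباشد'
```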

7 changes: 5 additions & 2 deletions crf_pos/pos_tagger/crf.py
@@ -10,18 +10,21 @@
This Module contains the implementation and encapsulation for Conditional Random Field classifier.
"""

import pickle
from typing import List, Tuple, Any
from crf_pos.api import get_resources
from crf_pos.pos_tagger.meta_tagger import MetaTagger
from crf_pos.pos_tagger.utils import token2features
import pickle


class CrfPosTagger(MetaTagger):
"""
Wapiti Part-of-Speech tagger encapsulation.
"""
def __init__(self, model_path) -> None:
def __init__(self) -> None:
super().__init__()

model_path = get_resources(self.dir_path, resource_name='perpos.model')
with open(model_path, 'rb') as resource:
self.tagger = pickle.load(resource)

7 changes: 5 additions & 2 deletions crf_pos/pos_tagger/meta_tagger.py
@@ -10,6 +10,7 @@
This module sole purpose is abstraction, it contains a meta class for Wapiti and CRF classifier.
"""

import os
from typing import Union, List, Any, Tuple, Generator
from crf_pos.normalization.normalizer import Normalizer

@@ -19,6 +20,10 @@ class MetaTagger:
Part-of-Speech taggers meta class abstraction.
"""
def __init__(self) -> None:
self.dir_path = os.path.dirname(
os.path.dirname(
os.path.dirname(
os.path.realpath(__file__)))) + "/"
self.tagger = None
self.norm = Normalizer(downloading=True)

@@ -31,8 +36,6 @@ def __getitem__(self, item: Union[list, str]) -> List[Tuple[str, str]]:
if isinstance(item, str): item = self.norm.normalize(item).split()
return self.parse([item])[0]

## ToDo get_resources.

def parse(self, token_list: List[str]) -> List[List[Tuple[Any, Any]]]:
"""
An abstract method, to be overwritten by its descendants.
5 changes: 4 additions & 1 deletion crf_pos/pos_tagger/wapiti.py
@@ -12,6 +12,8 @@

from typing import List, Any, Tuple
from wapiti import Model

from crf_pos.api import get_resources
from crf_pos.pos_tagger.meta_tagger import MetaTagger
from crf_pos.pos_tagger.utils import remove_after_underline

@@ -20,8 +22,9 @@ class WapitiPosTagger(MetaTagger):
"""
Wapiti Part-of-Speech tagger encapsulation.
"""
def __init__(self, model_path: str = 'model/UPC_full_model_wapiti') -> None:
def __init__(self) -> None:
super().__init__()
model_path = get_resources(self.dir_path, resource_name='UPC_full_model_wapiti')
self.tagger = Model(model=model_path)

def parse(self, token_list: List[str]) -> List[List[Tuple[Any, Any]]]: