16.0.0, making things super simple
imahdimir committed Oct 16, 2023
1 parent 31b7bfb commit 76fdc5a
Showing 6 changed files with 58 additions and 91 deletions.
34 changes: 17 additions & 17 deletions .github/workflows/publish-on-pip.yml
@@ -10,7 +10,7 @@ name: Upload Python Package

on:
release:
types: [published]
types: [ published ]

permissions:
contents: read
@@ -21,19 +21,19 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v3
with:
python-version: '3.x'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install build
- name: Build package
run: python -m build
- name: Publish package
uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
with:
user: __token__
password: ${{ secrets.PYPI_API_TOKEN }}
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v3
with:
python-version: '3.x'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install build
- name: Build package
run: python -m build
- name: Publish package
uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
with:
user: __token__
password: ${{ secrets.PYPI_API_TOKEN }}
11 changes: 6 additions & 5 deletions README.md
@@ -7,12 +7,13 @@ pip install githubdata
# Quick Start

```python
from githubdata.utils import get_data_fr_github_without_double_clone
from githubdata import get_data_wo_double_clone

# Github data repository url
url = 'https://github.com/imahdimir/d-TSETMC_ID-2-FirmTicker'
# GitHub "Data Repository" url/path
url = 'imahdimir/d-TSETMC_ID-2-FirmTicker'

df = get_data_fr_github_without_double_clone(url)
# get the data as a pandas DataFrame
df = get_data_wo_double_clone(url)
```

- Easy as That!
***Easy as that!***
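
For context (not part of the committed README text), the renamed `get_data_wo_double_clone` also accepts a `remove_cache` flag, visible in the `src/githubdata/utils.py` diff further down. A minimal sketch of using it, reusing the same example repository:

```python
from githubdata import get_data_wo_double_clone

# Read the data, then drop the local clone (kept under 'GitHubData/' by default)
df = get_data_wo_double_clone('imahdimir/d-TSETMC_ID-2-FirmTicker',
                              remove_cache=True)
print(df.shape)
```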
22 changes: 7 additions & 15 deletions pyproject.toml
@@ -4,43 +4,35 @@ build-backend = "hatchling.build"

[project]
name = "githubdata"
version = "15.0.0"
version = "16.0.0"
authors = [{ name = "Mahdi Mir", email = "[email protected]" }]
description = "A simple Python package to easily download from and manage a GitHub \"Data repository\""
readme = "README.md"
license = { file = "LICENSE" }
dependencies = [
"giteasy",
"pandas",
"fastparquet",
"pyarrow",
"openpyxl",
"mirutil",
"persiantools"
"pandas",
]
classifiers = [
"Programming Language :: Python :: 3",
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Programming Language :: Python",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: Implementation :: PyPy",
"Topic :: Software Development :: Libraries :: Python Modules",
"Topic :: Utilities"]
"Topic :: Utilities"
]
keywords = [
"TSE",
"Finance",
"data",
"cleaning",
"data_cleaning",
"Tehran",
"Stocks"]
"Stocks"
]

[project.urls]
"Homepage" = "https://github.com/imahdimir/githubdata"
4 changes: 3 additions & 1 deletion src/githubdata/__init__.py
@@ -1,2 +1,4 @@
from .github_data_repo import default_containing_dir
from .github_data_repo import default_githubdata_dir
from .github_data_repo import GitHubDataRepo
from .utils import clone_overwrite_a_repo__ret_gdr_obj
from .utils import get_data_wo_double_clone
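
A short usage sketch (not from the commit) of the four names the new `__init__.py` exports, assuming the published 16.0.0 package matches this file:

```python
from githubdata import (
    GitHubDataRepo,
    clone_overwrite_a_repo__ret_gdr_obj,
    default_githubdata_dir,
    get_data_wo_double_clone,
)

print(default_githubdata_dir)  # Path('GitHubData/'), the default clone location

# Force a fresh clone and keep the repo object around for further calls
gdr = clone_overwrite_a_repo__ret_gdr_obj('imahdimir/d-TSETMC_ID-2-FirmTicker')
df = gdr.read_data()
```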
53 changes: 21 additions & 32 deletions src/githubdata/github_data_repo.py
@@ -4,23 +4,16 @@

from pathlib import Path

import pandas as pd
from giteasy import GitHubRepo
from mirutil.df import read_data_according_to_type as rdatt

data_file_suffixes = {
'.xlsx' : None ,
'.prq' : None ,
'.csv' : None ,
}

default_containing_dir = Path('GitHubData/')
default_githubdata_dir = Path('GitHubData/')

class GitHubDataRepo(GitHubRepo) :

def __init__(self ,
repo_url ,
local_path = None ,
containing_dir = default_containing_dir ,
containing_dir = default_githubdata_dir ,
committing_usr = None ,
token = None
) :
@@ -30,34 +23,30 @@ def __init__(self ,
committing_usr = committing_usr ,
token = token)

self.data_suf: str | None = None
"""
"""

self.data_fp: Path | None = None

self.set_data_fps()
# run on init
self.set_data_fp()

def clone_overwrite(self , depth = 1) :
super().clone_overwrite(depth = depth)
self.set_data_fps()

def ret_sorted_fpns_by_suf(self , suffix) :
ls = list(self.local_path.glob(f'*{suffix}'))
return sorted(ls)

def _set_defualt_data_suffix(self) :
for ky in data_file_suffixes.keys() :
fps = self.ret_sorted_fpns_by_suf(ky)
if len(fps) != 0 :
self.data_suf = ky
return

def set_data_fps(self) :
self._set_defualt_data_suffix()
if self.data_suf is None :
return
fps = self.ret_sorted_fpns_by_suf(self.data_suf)
self.data_fp = fps[0]
self.set_data_fp()

def set_data_fp(self) :
fps = self.local_path.glob('*.parquet')
# get the first fp, or None if no parquet file exists
self.data_fp = next(fps , None)

def read_data(self) :
"""
Reads the data from the local path if it exists; otherwise clones the repo and reads the data.
:return: pandas.DataFrame
"""
if not self.local_path.exists() :
self.clone_overwrite()
return rdatt(self.data_fp)
df = pd.read_parquet(self.data_fp)
return df
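
The rewritten `set_data_fp` replaces the old suffix-scanning helpers with a single `next()` over a glob. A standalone illustration of that idiom, with a hypothetical local directory name:

```python
from pathlib import Path

import pandas as pd

# Hypothetical local clone path, for illustration only
repo_dir = Path('GitHubData/d-TSETMC_ID-2-FirmTicker')

# Path.glob is lazy; next(..., None) returns the first *.parquet match,
# or None when no parquet file exists (a missing directory also yields nothing)
data_fp = next(repo_dir.glob('*.parquet'), None)

if data_fp is not None:
    df = pd.read_parquet(data_fp)  # the same call read_data() now makes
    print(df.head())
```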
25 changes: 4 additions & 21 deletions src/githubdata/utils.py
@@ -3,18 +3,14 @@
"""

import shutil

import pandas as pd
from persiantools.jdatetime import JalaliDateTime

from .github_data_repo import GitHubDataRepo

def get_data_fr_github_without_double_clone(github_url , remove_cache = False
) -> pd.DataFrame :
def get_data_wo_double_clone(github_url , remove_cache = False
) -> pd.DataFrame :
"""
gets data from a GitHub data repo, without cloning it twice.
if it is already cloned, it will read the data from the local path.
Gets data from a GitHub data repo without cloning it twice; if it is already cloned, it reads the data from the local path.
:param github_url:
:param remove_cache: if True, it will remove the cloned repo after reading the data.
@@ -26,20 +22,7 @@ def get_data_fr_github_without_double_clone(github_url , remove_cache = False
gd.rmdir()
return df

def clone_overwrite_a_repo_return_gdr_obj(gd_url) :
def clone_overwrite_a_repo__ret_gdr_obj(gd_url) :
gdr = GitHubDataRepo(gd_url)
gdr.clone_overwrite()
return gdr

def replace_old_data_with_new_and_iso_jdate_title(gdt , df_fpn) :
gdt.data_fp.unlink()

tjd = JalaliDateTime.now().strftime('%Y-%m-%d')
fp = gdt.local_path / f'{tjd}.prq'

shutil.copy(df_fpn , fp)
print(f'Replaced {df_fpn} to {fp}')

def push_to_github_by_code_url(gdt , github_url) :
msg = 'Updated by ' + github_url
gdt.commit_and_push(msg , branch = 'main')
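
Pieced together from the hunks above (so a sketch, not a verbatim copy of `utils.py`), the renamed helper is roughly equivalent to:

```python
import pandas as pd

from githubdata import GitHubDataRepo


def get_data_wo_double_clone_sketch(github_url, remove_cache=False) -> pd.DataFrame:
    gd = GitHubDataRepo(github_url)
    df = gd.read_data()  # read_data() clones the repo only if no local copy exists
    if remove_cache:
        gd.rmdir()       # delete the local clone after reading
    return df
```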
