diff --git a/.github/workflows/publish-on-pip.yml b/.github/workflows/publish-on-pip.yml index bdaab28..4ba9ac4 100644 --- a/.github/workflows/publish-on-pip.yml +++ b/.github/workflows/publish-on-pip.yml @@ -10,7 +10,7 @@ name: Upload Python Package on: release: - types: [published] + types: [ published ] permissions: contents: read @@ -21,19 +21,19 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v3 - with: - python-version: '3.x' - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install build - - name: Build package - run: python -m build - - name: Publish package - uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 - with: - user: __token__ - password: ${{ secrets.PYPI_API_TOKEN }} + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + - name: Build package + run: python -m build + - name: Publish package + uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/README.md b/README.md index 6389b57..9eb91f3 100644 --- a/README.md +++ b/README.md @@ -7,12 +7,13 @@ pip install githubdata # Quick Start ```python -from githubdata.utils import get_data_fr_github_without_double_clone +from githubdata import get_data_wo_double_clone -# Github data repository url -url = 'https://github.com/imahdimir/d-TSETMC_ID-2-FirmTicker' +# GitHub "Data Repository" url/path +url = 'imahdimir/d-TSETMC_ID-2-FirmTicker' -df = get_data_fr_github_without_double_clone(url) +# get the data as a pandas DataFrame +df = get_data_wo_double_clone(url) ``` -- Easy as That! +***Easy as that!*** diff --git a/pyproject.toml b/pyproject.toml index f307847..4a73736 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,35 +4,26 @@ build-backend = "hatchling.build" [project] name = "githubdata" -version = "15.0.0" +version = "16.0.0" authors = [{ name = "Mahdi Mir", email = "imahdimir@gmail.com" }] description = "A simple Python package to easily download from and manage a GitHub \"Data repository\"" readme = "README.md" license = { file = "LICENSE" } dependencies = [ "giteasy", - "pandas", "fastparquet", "pyarrow", - "openpyxl", - "mirutil", - "persiantools" + "pandas", ] classifiers = [ - "Programming Language :: Python :: 3", "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", - "Programming Language :: Python", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3", "Programming Language :: Python :: Implementation :: PyPy", "Topic :: Software Development :: Libraries :: Python Modules", - "Topic :: Utilities"] + "Topic :: Utilities" +] keywords = [ "TSE", "Finance", @@ -40,7 +31,8 @@ keywords = [ "cleaning", "data_cleaning", "Tehran", - "Stocks"] + "Stocks" +] [project.urls] "Homepage" = "https://github.com/imahdimir/githubdata" diff --git a/src/githubdata/__init__.py b/src/githubdata/__init__.py index 70af0b6..d3fef45 100644 --- a/src/githubdata/__init__.py +++ b/src/githubdata/__init__.py @@ -1,2 +1,4 @@ -from .github_data_repo import default_containing_dir +from .github_data_repo import default_githubdata_dir from .github_data_repo import GitHubDataRepo +from .utils import clone_overwrite_a_repo__ret_gdr_obj +from .utils import get_data_wo_double_clone diff --git a/src/githubdata/github_data_repo.py b/src/githubdata/github_data_repo.py index 8921f5c..fcaa9d0 100644 --- a/src/githubdata/github_data_repo.py +++ b/src/githubdata/github_data_repo.py @@ -4,23 +4,16 @@ from pathlib import Path +import pandas as pd from giteasy import GitHubRepo -from mirutil.df import read_data_according_to_type as rdatt -data_file_suffixes = { - '.xlsx' : None , - '.prq' : None , - '.csv' : None , - } - -default_containing_dir = Path('GitHubData/') +default_githubdata_dir = Path('GitHubData/') class GitHubDataRepo(GitHubRepo) : - def __init__(self , repo_url , local_path = None , - containing_dir = default_containing_dir , + containing_dir = default_githubdata_dir , committing_usr = None , token = None ) : @@ -30,34 +23,30 @@ def __init__(self , committing_usr = committing_usr , token = token) - self.data_suf: str | None = None + """ + + """ + self.data_fp: Path | None = None - self.set_data_fps() + # run on init + self.set_data_fp() def clone_overwrite(self , depth = 1) : super().clone_overwrite(depth = depth) - self.set_data_fps() - - def ret_sorted_fpns_by_suf(self , suffix) : - ls = list(self.local_path.glob(f'*{suffix}')) - return sorted(ls) - - def _set_defualt_data_suffix(self) : - for ky in data_file_suffixes.keys() : - fps = self.ret_sorted_fpns_by_suf(ky) - if len(fps) != 0 : - self.data_suf = ky - return - - def set_data_fps(self) : - self._set_defualt_data_suffix() - if self.data_suf is None : - return - fps = self.ret_sorted_fpns_by_suf(self.data_suf) - self.data_fp = fps[0] + self.set_data_fp() + + def set_data_fp(self) : + fps = self.local_path.glob('*.parquet') + # get the first fp or none if no parquet file exists + self.data_fp = next(fps , None) def read_data(self) : + """ + reads the data from the local path if it exists, otherwise clones the repo and reads the data. + :return: pandas.DataFrame + """ if not self.local_path.exists() : self.clone_overwrite() - return rdatt(self.data_fp) + df = pd.read_parquet(self.data_fp) + return df diff --git a/src/githubdata/utils.py b/src/githubdata/utils.py index 753cf25..38c0dec 100644 --- a/src/githubdata/utils.py +++ b/src/githubdata/utils.py @@ -3,18 +3,14 @@ """ -import shutil - import pandas as pd -from persiantools.jdatetime import JalaliDateTime from .github_data_repo import GitHubDataRepo -def get_data_fr_github_without_double_clone(github_url , remove_cache = False - ) -> pd.DataFrame : +def get_data_wo_double_clone(github_url , remove_cache = False + ) -> pd.DataFrame : """ - gets data from a GitHub data repo, without cloning it twice. - if it is already cloned, it will read the data from the local path. + gets data from a GitHub data repo, without cloning it twice. if it is already cloned, it will read the data from the local path. :param: github_url :remove_cache: if True, it will remove the cloned repo after reading the data. @@ -26,20 +22,7 @@ def get_data_fr_github_without_double_clone(github_url , remove_cache = False gd.rmdir() return df -def clone_overwrite_a_repo_return_gdr_obj(gd_url) : +def clone_overwrite_a_repo__ret_gdr_obj(gd_url) : gdr = GitHubDataRepo(gd_url) gdr.clone_overwrite() return gdr - -def replace_old_data_with_new_and_iso_jdate_title(gdt , df_fpn) : - gdt.data_fp.unlink() - - tjd = JalaliDateTime.now().strftime('%Y-%m-%d') - fp = gdt.local_path / f'{tjd}.prq' - - shutil.copy(df_fpn , fp) - print(f'Replaced {df_fpn} to {fp}') - -def push_to_github_by_code_url(gdt , github_url) : - msg = 'Updated by ' + github_url - gdt.commit_and_push(msg , branch = 'main')