From 99a099efcb22d921253dc6fe603fc4054aed88a2 Mon Sep 17 00:00:00 2001
From: Arne Binder
Date: Wed, 8 Nov 2023 01:52:58 +0100
Subject: [PATCH] remove workflow to upload datasets to the Huggingface hub
 (in favor of https://github.com/ArneBinder/pie-datasets/pull/27)

---
 .github/hub/requirements.txt                   |   4 -
 .github/hub/update_hub_repositories.py         | 324 ------------------
 .../workflows/update-hub-repositories.yaml     |  34 --
 3 files changed, 362 deletions(-)
 delete mode 100644 .github/hub/requirements.txt
 delete mode 100644 .github/hub/update_hub_repositories.py
 delete mode 100644 .github/workflows/update-hub-repositories.yaml

diff --git a/.github/hub/requirements.txt b/.github/hub/requirements.txt
deleted file mode 100644
index 0dd1e1e4..00000000
--- a/.github/hub/requirements.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-GitPython==3.1.30
-python-dotenv==0.19.2
-requests==2.25.1
-tqdm==4.62.3
diff --git a/.github/hub/update_hub_repositories.py b/.github/hub/update_hub_repositories.py
deleted file mode 100644
index d6bfb105..00000000
--- a/.github/hub/update_hub_repositories.py
+++ /dev/null
@@ -1,324 +0,0 @@
-# adapted from https://github.com/huggingface/datasets/blob/master/.github/hub/update_hub_repositories.py
-
-import base64
-import distutils.dir_util
-import logging
-import os
-import re
-import sys
-from itertools import islice
-from pathlib import Path
-from typing import Dict, Optional, Set, Tuple
-
-import requests
-from dotenv import load_dotenv
-from git import Repo
-from tqdm.contrib.concurrent import thread_map
-
-load_dotenv()
-logger = logging.getLogger(__name__)
-ROOT = Path()
-
-# General environment variables accepted values for booleans
-ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
-ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
-
-if os.environ.get("HF_USE_PROD", "AUTO") in ENV_VARS_TRUE_VALUES:
-    HUB_ENDPOINT = "https://huggingface.co"
-else:
-    HUB_ENDPOINT = "https://moon-staging.huggingface.co"
-
-HUB_CANONICAL_WHOAMI = HUB_ENDPOINT + "/api/whoami-v2"
-HUB_CANONICAL_CREATE_URL = HUB_ENDPOINT + "/api/repos/create"
-HUB_CANONICAL_INFO_URL = HUB_ENDPOINT + "/api/datasets/{organization}/{dataset_name}"
-HUB_CANONICAL_DATASET_GIT_URL = (
-    HUB_ENDPOINT.replace("https://", "https://user:{token}@")
-    + "/datasets/{organization}/{dataset_name}.git"
-)
-HUB_API_GH_TO_HF = HUB_ENDPOINT + "/api/gh-to-hf/{github_username}"
-DATASETS_LIB_CATALOG_DIR_NAME = "dataset_builders"
-DATASETS_LIB_COMMIT_URL = "https://github.com/huggingface/datasets/{organization}/commit/{hexsha}"
-CANONICAL_DATASET_REPO_MAIN_BRANCH = "main"
-HUB_DIR_NAME = "hub"
-
-
-def hf_retrieve_author(author_name, author_email) -> Tuple[str, str]:
-    # Some HF members have enabled email address privacy on GitHub
-    # This is here just to be able to link the commits to their HF accounts
-    if author_email.endswith("@users.noreply.github.com"):
-        try:
-            github_username = author_email[: -len("@users.noreply.github.com")].split("+", 1)[-1]
-            response = requests.get(HUB_API_GH_TO_HF.format(github_username=github_username))
-            author_email = response.json()["user"] + "@users.noreply.huggingface.co"
-        except Exception:
-            pass
-    return author_name, author_email
-
-
-class UnauthorizedError(ConnectionError):
-    pass
-
-
-class UpdateFailed(RuntimeError):
-    pass
-
-
-def src_canonical_dataset_path(datasets_lib_path: Path, dataset_name: str) -> Path:
-    return datasets_lib_path / DATASETS_LIB_CATALOG_DIR_NAME / dataset_name
-
-
-def canonical_dataset_path(dataset_name: str) -> Path:
-    return ROOT / HUB_DIR_NAME / dataset_name
-
-
-def canonical_dataset_git_url(dataset_name: str, token: str, organization: str) -> str:
-    return HUB_CANONICAL_DATASET_GIT_URL.format(
-        dataset_name=dataset_name, token=token, organization=organization
-    )
-
-
-def canonical_dataset_info_url(dataset_name: str, organization: str) -> str:
-    return HUB_CANONICAL_INFO_URL.format(dataset_name=dataset_name, organization=organization)
-
-
-def create_remote_repo(dataset_name: str, token: str, organization: str):
-    response = requests.post(
-        HUB_CANONICAL_CREATE_URL,
-        headers={"authorization": f"Bearer {token}"},
-        json={
-            "name": dataset_name,
-            "organization": organization,
-            # "canonical": True,
-            "type": "dataset",
-        },
-    )
-    response.raise_for_status()
-
-
-def whoami(token: str) -> str:
-    response = requests.get(HUB_CANONICAL_WHOAMI, headers={"authorization": f"Bearer {token}"})
-    response.raise_for_status()
-    user_info = response.json()
-    return user_info
-
-
-def check_authorizations(user_info: dict, organization: str):
-    if organization not in [org["name"] for org in user_info["orgs"] if org["type"] == "org"]:
-        raise UnauthorizedError(
-            f"User {user_info['name']} is not part of the 'trusted-committers' org: "
-            "it can't push to canonical repositories"
-        )
-
-
-def apply_hacks_for_moon_landing(dataset_repo_path: Path):
-    if (dataset_repo_path / "README.md").is_file():
-        with (dataset_repo_path / "README.md").open() as f:
-            readme_content = f.read()
-        if readme_content.count("---\n") > 1:
-            _, tags, content = readme_content.split("---\n", 2)
-            tags = tags.replace("\nlicense:", "\nlicenses:").replace(".", "-").replace("$", "%")
-            with (dataset_repo_path / "README.md").open("w") as f:
-                f.write("---\n".join(["", tags, content]))
-
-
-class update_main:
-    def __init__(
-        self,
-        organization: str,
-        datasets_lib_path: str,
-        commit_args: Tuple[str],
-        token: str,
-        deleted_files: Dict[str, Set[str]],
-        tag_name: Optional[str] = None,
-    ) -> None:
-        self.organization = organization
-        self.datasets_lib_path = datasets_lib_path
-        self.commit_args = commit_args
-        self.token = token
-        self.deleted_files = (
-            deleted_files  # dict dataset_name -> set of relative paths of the deleted files
-        )
-        self.tag_name = tag_name
-
-    def __call__(self, dataset_name: str) -> bool:
-        try:
-            create_remote_repo(dataset_name, self.token, self.organization)
-        except requests.exceptions.HTTPError as e:
-            if "409 Client Error: Conflict for url:" not in repr(
-                e
-            ):  # don't log if repo already exists
-                logger.warning(f"[{dataset_name}] " + repr(e))
-        if not canonical_dataset_path(dataset_name).is_dir():
-            repo = Repo.clone_from(
-                canonical_dataset_git_url(dataset_name, self.token, self.organization),
-                to_path=canonical_dataset_path(dataset_name),
-            )
-        else:
-            repo = Repo(canonical_dataset_path(dataset_name))
-
-        logs = []
-        logs.append(repo.git.reset("--hard"))
-        logs.append(repo.git.clean("-f", "-d"))
-        logs.append(repo.git.checkout(CANONICAL_DATASET_REPO_MAIN_BRANCH))
-        logs.append(repo.remote().pull())
-        # Copy the changes and commit
-        distutils.dir_util.copy_tree(
-            str(src_canonical_dataset_path(datasets_lib_path, dataset_name)),
-            str(canonical_dataset_path(dataset_name)),
-        )
-        for filepath_to_delete in self.deleted_files.get(dataset_name, []):
-            try:
-                (canonical_dataset_path(dataset_name) / filepath_to_delete).unlink()
-            except Exception as e:
-                logger.warning(
-                    f"[{dataset_name}] Couldn't delete file at {filepath_to_delete}: {repr(e)}"
-                )
-        apply_hacks_for_moon_landing(canonical_dataset_path(dataset_name))
-        logs.append(repo.git.add("."))
-        if "Changes to be committed:" in repo.git.status():
-            logs.append(repo.git.commit(*self.commit_args))
-        try:
-            logs.append(repo.git.push())
-            if self.tag_name:
-                # If the dataset repository hasn't been tagged for this release yet,
-                # it means that the new version of the datasets lib just got released.
-                # In this case we have to tag the new commit with this release name
-                logs.append(
-                    repo.git.tag(
-                        self.tag_name, f"-m Add tag from dataset_builders {self.tag_name}"
-                    )
-                )
-                logs.append(repo.git.push("--tags"))
-        except Exception as e:
-            logs.append("push failed !")
-            logs.append(repr(e))
-        if "Your branch is up to date with" not in repo.git.status():
-            logs.append(repo.git.status())
-            logs = "\n".join(str(log) for log in logs)
-            logger.warning(f"[{dataset_name}] Push failed")
-            logger.warning(f"[{dataset_name}] Git logs: \n{logs}")
-            return False
-        else:
-            return True
-
-
-if __name__ == "__main__":
-    logging.basicConfig(level=logging.INFO)
-    token = os.environ["HF_TOKEN"]
-    organization = os.environ["ORGANIZATION"]
-    datasets_lib_path = Path(os.environ["DATASETS_LIB_PATH"]).expanduser().resolve()
-
-    if Path(token).expanduser().is_file():
-        with Path(token).expanduser().open() as f:
-            token = f.read().strip()
-    user_info = whoami(token)
-    check_authorizations(user_info, organization)
-
-    datasets_lib_repo = Repo(datasets_lib_path)
-    current_commit, prev_commit = list(islice(datasets_lib_repo.iter_commits(), 2))
-    author_name, author_email = current_commit.author.name, current_commit.author.email
-    author_name, author_email = hf_retrieve_author(author_name, author_email)
-    commit_args = (f"-m {current_commit.message}",)
-    commit_args += (
-        f"-m Commit from {DATASETS_LIB_COMMIT_URL.format(organization=organization, hexsha=current_commit.hexsha)}",
-    )
-    commit_args += (f"--author={author_name} <{author_email}>",)
-
-    for _tag in datasets_lib_repo.tags:
-        # Add a new tag if this is a `datasets` release
-        if _tag.commit == current_commit and re.match(r"^v[0-9]+\.[0-9]+\.[0-9]+$", _tag.name):
-            new_tag = _tag
-            break
-    else:
-        new_tag = None
-
-    changed_files_since_last_commit = [
-        path
-        for diff in datasets_lib_repo.index.diff(prev_commit)
-        for path in [diff.a_path, diff.b_path]
-        if path.startswith(DATASETS_LIB_CATALOG_DIR_NAME) and path.count("/") >= 2
-    ]
-
-    changed_datasets_names_since_last_commit = {
-        path.split("/")[1] for path in changed_files_since_last_commit
-    }
-    # ignore json, csv etc.
-    changed_datasets_names_since_last_commit = {
-        dataset_name
-        for dataset_name in changed_datasets_names_since_last_commit
-        if (
-            datasets_lib_path
-            / DATASETS_LIB_CATALOG_DIR_NAME
-            / dataset_name
-            / (dataset_name + ".py")
-        ).is_file()
-    }
-
-    deleted_files = {
-        dataset_name: set() for dataset_name in changed_datasets_names_since_last_commit
-    }
-    for path in changed_files_since_last_commit:
-        _, dataset_name, rel_path = path.split("/", 2)
-        if (
-            dataset_name in changed_datasets_names_since_last_commit
-            and not (datasets_lib_path / path).is_file()
-        ):
-            deleted_files[dataset_name].add(rel_path)
-
-    dataset_names = sys.argv[1:]
-    if dataset_names:
-        if dataset_names[0] == "--all":
-            dataset_names = sorted(
-                d.name
-                for d in (datasets_lib_path / DATASETS_LIB_CATALOG_DIR_NAME).glob("*")
-                if d.is_dir() and (d / (d.name + ".py")).is_file()  # ignore json, csv etc.
-            )
-        if dataset_names[0] == "--auto":
-            if new_tag:
-                logger.info(
-                    "All the datasets will be updated since --auto was used and "
-                    f"this is a new release {new_tag.name} of the `datasets` library."
-                )
-                dataset_names = sorted(
-                    d.name for d in (ROOT / HUB_DIR_NAME).glob("*") if d.is_dir()
-                )
-                dataset_names = sorted(
-                    d.name
-                    for d in (datasets_lib_path / DATASETS_LIB_CATALOG_DIR_NAME).glob("*")
-                    if d.is_dir() and (d / (d.name + ".py")).is_file()  # ignore json, csv etc.
-                )
-            else:
-                logger.info(
-                    "All the datasets that have been changed in the latest commit of `datasets` will be updated "
-                    "since --auto was used."
-                )
-                dataset_names = sorted(changed_datasets_names_since_last_commit)
-    if dataset_names:
-        logger.info(
-            f"Updating the '{CANONICAL_DATASET_REPO_MAIN_BRANCH}' branch of those datasets: {' '.join(dataset_names)}"
-        )
-        successes = thread_map(
-            update_main(
-                organization=organization,
-                datasets_lib_path=datasets_lib_path,
-                commit_args=commit_args,
-                token=token,
-                deleted_files=deleted_files,
-                tag_name=new_tag.name if new_tag else None,
-            ),
-            dataset_names,
-        )
-        datasets_with_errors = [
-            dataset_name
-            for success, dataset_name in zip(successes, dataset_names)
-            if not success
-        ]
-        if datasets_with_errors:
-            raise UpdateFailed(
-                f"Those datasets couldn't be updated: {' '.join(datasets_with_errors)}\n"
-                "Please check the logs to see what went wrong.\n"
-                "Once you fixed the errors, you can re-run this script:\n\n"
-                f"\tpython update_main.py {' '.join(datasets_with_errors)}"
-            )
    else:
-        logger.info("No changes detected -- nothing to update !")
diff --git a/.github/workflows/update-hub-repositories.yaml b/.github/workflows/update-hub-repositories.yaml
deleted file mode 100644
index 837bcb40..00000000
--- a/.github/workflows/update-hub-repositories.yaml
+++ /dev/null
@@ -1,34 +0,0 @@
-name: Update Hub repositories
-
-on: workflow_dispatch
-
-jobs:
-  update-hub-repositories:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v2
-        with:
-          fetch-depth: 0
-      - name: Set up Python
-        uses: actions/setup-python@v2
-        with:
-          python-version: "3.9"
-      - name: Set up default Git config
-        run: |
-          git config --global user.name system
-          git config --global user.email christophalt@posteo.de
-      - name: Install dependencies
-        working-directory: ./.github/hub
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r requirements.txt
-      - name: Update Hub repositories
-        working-directory: ./.github/hub
-        run: |
-          export HF_TOKEN=${{ secrets.HUB_TRUSTED_COMMITER_TOKEN }}
-          export ORGANIZATION=pie
-          export DATASETS_LIB_PATH=$GITHUB_WORKSPACE
-          export HF_USE_PROD=1
-          export GIT_LFS_SKIP_SMUDGE=1
-          python update_hub_repositories.py --all
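
Note: the functionality deleted above (create a Hub dataset repo via a raw POST to /api/repos/create, then clone, copy, commit, and push it with GitPython) now lives in pie-datasets; see the PR linked in the subject line. For reference, a minimal sketch of the same create-and-upload step using the huggingface_hub client library instead of raw git plumbing. The repo id, local path, and token handling below are illustrative assumptions, not details taken from that PR:

# Sketch only, not the code from the referenced PR. Assumes HF_TOKEN is set;
# "pie/example-dataset" and the local builder path are hypothetical placeholders.
import os

from huggingface_hub import HfApi

api = HfApi(token=os.environ["HF_TOKEN"])

# Create the dataset repository if it does not exist yet (no-op otherwise),
# replacing the removed script's manual POST to HUB_CANONICAL_CREATE_URL.
api.create_repo(repo_id="pie/example-dataset", repo_type="dataset", exist_ok=True)

# Upload the builder directory as a single commit; this avoids the local clone
# and the git reset/clean/checkout/pull/add/commit/push sequence entirely.
api.upload_folder(
    folder_path="dataset_builders/example_dataset",
    repo_id="pie/example-dataset",
    repo_type="dataset",
    commit_message="update dataset builder",
)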