From 8b029a17968cfff7c8dac5a767d64c886e5b7e36 Mon Sep 17 00:00:00 2001 From: Carlos Garcia Jurado Suarez Date: Wed, 13 Dec 2023 15:39:17 -0800 Subject: [PATCH 1/6] Load models from huggingface instead of blob storage --- .github/workflows/publish-to-pypi.yml | 3 ++- README.md | 22 +++++++++++++--------- azureml/eval.yml | 10 ++++++---- azureml/generate.yml | 10 ++++++---- pyproject.toml | 1 + 5 files changed, 28 insertions(+), 18 deletions(-) diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml index 3147bde..fb90505 100644 --- a/.github/workflows/publish-to-pypi.yml +++ b/.github/workflows/publish-to-pypi.yml @@ -17,7 +17,6 @@ permissions: jobs: deploy: - runs-on: ubuntu-latest permissions: id-token: write @@ -35,3 +34,5 @@ jobs: run: python -m build - name: Publish package uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/README.md b/README.md index e2e66ad..8640545 100644 --- a/README.md +++ b/README.md @@ -31,13 +31,14 @@ Once you have created a new environment, you can install this project for local development using the following commands: ``` ->> pip install -e .'[dev]' +>> pip install -e .'[dev,train]' >> pre-commit install >> conda install pandoc ``` Notes: 1) The single quotes around `'[dev]'` may not be required for your operating system. +3) Look at `pyproject.toml` for other optional dependencies, e.g. you can do `pip install -e ."[dev,train,cuda]"` if you want to use CUDA. 2) `pre-commit install` will initialize pre-commit for this local repository, so that a set of tests will be run prior to completing a local commit. For more information, see the Python Project Template documentation on @@ -69,21 +70,24 @@ az account set --subscription "" az configure --defaults workspace= group= location= ``` -### Uploading data - -Example: -```sh -az storage blob upload --account-name --container > --file data/data.jsonl -n data/sweetpea/data.jsonl -``` ### Running jobs Prediction ```sh -az ml job create -f azureml/eval.yml --set display_name="Test prediction job" --web +az ml job create -f azureml/eval.yml --set display_name="Test prediction job" --set environment_variables.HF_TOKEN= --web ``` Notes: - `--name` will set the mlflow run id - `--display_name` becomes the name in the experiment dashboard -- `--web` argument will pop-up a browser window for tracking the job. \ No newline at end of file +- `--web` argument will pop-up a browser window for tracking the job. +- The `HF_TOKEN` is required for gated repos, which need authentication + + +### Uploading data + +Example: +```sh +az storage blob upload --account-name --container > --file data/data.jsonl -n data/sweetpea/data.jsonl + ``` \ No newline at end of file diff --git a/azureml/eval.yml b/azureml/eval.yml index ea6953b..551bd9c 100644 --- a/azureml/eval.yml +++ b/azureml/eval.yml @@ -2,7 +2,7 @@ $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json command: > python -m autora.doc.pipelines.main eval ${{inputs.data_dir}}/data.jsonl - --model-path ${{inputs.model_dir}}/llama-2-7b-chat-hf + --model-path ${{inputs.model_path}} --sys-id ${{inputs.sys_id}} --instruc-id ${{inputs.instruc_id}} --param temperature=${{inputs.temperature}} @@ -13,9 +13,11 @@ inputs: data_dir: type: uri_folder path: azureml://datastores/workspaceblobstore/paths/data/sweetpea/ - model_dir: - type: uri_folder - path: azureml://datastores/workspaceblobstore/paths/base_models + # Currently models are loading faster directly from HuggingFace vs Azure Blob Storage + # model_dir: + # type: uri_folder + # path: azureml://datastores/workspaceblobstore/paths/base_models + model_path: meta-llama/Llama-2-7b-chat-hf temperature: 0.7 top_p: 0.95 top_k: 40 diff --git a/azureml/generate.yml b/azureml/generate.yml index d849fcd..fedf7f5 100644 --- a/azureml/generate.yml +++ b/azureml/generate.yml @@ -1,7 +1,7 @@ $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json command: > python -m autora.doc.pipelines.main generate - --model-path ${{inputs.model_dir}}/llama-2-7b-chat-hf + --model-path ${{inputs.model_path}} --output ./outputs/output.txt --sys-id ${{inputs.sys_id}} --instruc-id ${{inputs.instruc_id}} @@ -11,9 +11,11 @@ command: > autora/doc/pipelines/main.py code: ../src inputs: - model_dir: - type: uri_folder - path: azureml://datastores/workspaceblobstore/paths/base_models + # Currently models are loading faster directly from HuggingFace vs Azure Blob Storage + # model_dir: + # type: uri_folder + # path: azureml://datastores/workspaceblobstore/paths/base_models + model_path: meta-llama/Llama-2-7b-chat-hf temperature: 0.7 top_p: 0.95 top_k: 40 diff --git a/pyproject.toml b/pyproject.toml index 422c8ff..c28cb5a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,7 @@ dev = [ "ipython", # Also used in building notebooks into Sphinx "matplotlib", # Used in sample notebook intro_notebook.ipynb "ipykernel", + "hf_transfer", ] train = [ "jsonlines", From a312982e9059db764a02ae314c2feeeafe04fe56 Mon Sep 17 00:00:00 2001 From: Carlos Garcia Jurado Suarez Date: Fri, 5 Jan 2024 14:32:40 -0800 Subject: [PATCH 2/6] Use trusted pypi publisher --- .github/workflows/publish-to-pypi.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml index fb90505..279e8fc 100644 --- a/.github/workflows/publish-to-pypi.yml +++ b/.github/workflows/publish-to-pypi.yml @@ -33,6 +33,4 @@ jobs: - name: Build package run: python -m build - name: Publish package - uses: pypa/gh-action-pypi-publish@release/v1 - with: - password: ${{ secrets.PYPI_API_TOKEN }} + uses: pypa/gh-action-pypi-publish@release/v1 \ No newline at end of file From 50ae50b882a36ba097583365b13b32f43de9faef Mon Sep 17 00:00:00 2001 From: Carlos Garcia Jurado Suarez Date: Fri, 5 Jan 2024 14:42:47 -0800 Subject: [PATCH 3/6] fix line break --- .github/workflows/publish-to-pypi.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml index 279e8fc..a25fc7d 100644 --- a/.github/workflows/publish-to-pypi.yml +++ b/.github/workflows/publish-to-pypi.yml @@ -33,4 +33,4 @@ jobs: - name: Build package run: python -m build - name: Publish package - uses: pypa/gh-action-pypi-publish@release/v1 \ No newline at end of file + uses: pypa/gh-action-pypi-publish@release/v1 From 41f9a643005ded70f8e89de8a381a005dfd22ab3 Mon Sep 17 00:00:00 2001 From: Carlos Garcia Jurado Suarez Date: Fri, 5 Jan 2024 14:46:32 -0800 Subject: [PATCH 4/6] add formatting pre-commit checks --- .pre-commit-config.yaml | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 029a6e6..cc58e8e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,8 +1,20 @@ fail_fast: true repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + # - id: check-docstring-first + - id: check-json + - id: check-yaml + - id: pretty-format-json + exclude: \.ipy(n|nb)$ + args: ["--autofix", "--indent=2", "--no-sort-keys"] + # Compare the local template version to the latest remote template version - # This hook should always pass. It will print a message if the local version + # This hook should always pass. It will print a message if the local version # is out of date. - repo: https://github.com/lincc-frameworks/pre-commit-hooks rev: v0.1.1 @@ -82,7 +94,7 @@ repos: # Run unit tests, verify that they pass. Note that coverage is run against - # the ./src directory here because that is what will be committed. In the + # the ./src directory here because that is what will be committed. In the # github workflow script, the coverage is run against the installed package # and uploaded to Codecov by calling pytest like so: # `python -m pytest --cov= --cov-report=xml` @@ -95,9 +107,9 @@ repos: language: system pass_filenames: false always_run: true - # Make sure Sphinx can build the documentation while explicitly omitting - # notebooks from the docs, so users don't have to wait through the execution - # of each notebook or each commit. By default, these will be checked in the + # Make sure Sphinx can build the documentation while explicitly omitting + # notebooks from the docs, so users don't have to wait through the execution + # of each notebook or each commit. By default, these will be checked in the # GitHub workflows. - repo: local hooks: From ea4f6dc91e047fbfbdbd77b3250b49bc8440f2a2 Mon Sep 17 00:00:00 2001 From: Carlos Garcia Jurado Suarez Date: Fri, 5 Jan 2024 14:57:40 -0800 Subject: [PATCH 5/6] fix line endings --- .github/actions/deps/action.yaml | 6 +++--- .github/pull_request_template.md | 8 ++++---- .github/workflows/pre-commit-ci.yml | 6 +++--- .github/workflows/smoke-test.yml | 2 +- README.md | 10 +++++----- azureml/conda.yml | 2 +- azureml/eval.yml | 10 +++++----- azureml/generate.yml | 12 ++++++------ 8 files changed, 28 insertions(+), 28 deletions(-) diff --git a/.github/actions/deps/action.yaml b/.github/actions/deps/action.yaml index d38d141..04b8c5b 100644 --- a/.github/actions/deps/action.yaml +++ b/.github/actions/deps/action.yaml @@ -13,9 +13,9 @@ runs: python-version: ${{inputs.python-version}} - name: Setup pip shell: sh - run: | + run: | python3 -m ensurepip python3 -m pip install --upgrade pip - - name: Install project + - name: Install project shell: sh - run: pip install ".[dev,train]" \ No newline at end of file + run: pip install ".[dev,train]" diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 76e043c..2745f42 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,17 +1,17 @@ - ## Change Description - -This project was automatically generated using the LINCC-Frameworks -[python-project-template](https://github.com/lincc-frameworks/python-project-template). For more information about the project template see the +This project was automatically generated using the LINCC-Frameworks +[python-project-template](https://github.com/lincc-frameworks/python-project-template). For more information about the project template see the [documentation](https://lincc-ppt.readthedocs.io/en/latest/). ## Dev Guide - Getting Started @@ -41,7 +41,7 @@ Notes: 3) Look at `pyproject.toml` for other optional dependencies, e.g. you can do `pip install -e ."[dev,train,cuda]"` if you want to use CUDA. 2) `pre-commit install` will initialize pre-commit for this local repository, so that a set of tests will be run prior to completing a local commit. For more - information, see the Python Project Template documentation on + information, see the Python Project Template documentation on [pre-commit](https://lincc-ppt.readthedocs.io/en/latest/practices/precommit.html) 3) Install `pandoc` allows you to verify that automatic rendering of Jupyter notebooks into documentation for ReadTheDocs works as expected. For more information, see @@ -49,7 +49,7 @@ Notes: [Sphinx and Python Notebooks](https://lincc-ppt.readthedocs.io/en/latest/practices/sphinx.html#python-notebooks) -## Running AzureML pipelines +## Running AzureML pipelines This repo contains the evaluation and training pipelines for AutoDoc. @@ -90,4 +90,4 @@ Notes: Example: ```sh az storage blob upload --account-name --container > --file data/data.jsonl -n data/sweetpea/data.jsonl - ``` \ No newline at end of file + ``` diff --git a/azureml/conda.yml b/azureml/conda.yml index f772397..fab3656 100644 --- a/azureml/conda.yml +++ b/azureml/conda.yml @@ -15,4 +15,4 @@ dependencies: - xformers - scipy # This works, while installing from pytorch and cuda from conda does not - - torch==2.0.1 \ No newline at end of file + - torch==2.0.1 diff --git a/azureml/eval.yml b/azureml/eval.yml index 551bd9c..1f31ea4 100644 --- a/azureml/eval.yml +++ b/azureml/eval.yml @@ -1,6 +1,6 @@ $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json command: > - python -m autora.doc.pipelines.main eval + python -m autora.doc.pipelines.main eval ${{inputs.data_dir}}/data.jsonl --model-path ${{inputs.model_path}} --sys-id ${{inputs.sys_id}} @@ -11,12 +11,12 @@ command: > code: ../src inputs: data_dir: - type: uri_folder + type: uri_folder path: azureml://datastores/workspaceblobstore/paths/data/sweetpea/ # Currently models are loading faster directly from HuggingFace vs Azure Blob Storage # model_dir: - # type: uri_folder - # path: azureml://datastores/workspaceblobstore/paths/base_models + # type: uri_folder + # path: azureml://datastores/workspaceblobstore/paths/base_models model_path: meta-llama/Llama-2-7b-chat-hf temperature: 0.7 top_p: 0.95 @@ -37,4 +37,4 @@ environment: # azureml://registries/azureml/environments/acpt-pytorch-2.0-cuda11 display_name: autodoc_prediction compute: azureml:t4cluster experiment_name: evaluation -description: | \ No newline at end of file +description: | diff --git a/azureml/generate.yml b/azureml/generate.yml index fedf7f5..7e3f902 100644 --- a/azureml/generate.yml +++ b/azureml/generate.yml @@ -1,6 +1,6 @@ $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json command: > - python -m autora.doc.pipelines.main generate + python -m autora.doc.pipelines.main generate --model-path ${{inputs.model_path}} --output ./outputs/output.txt --sys-id ${{inputs.sys_id}} @@ -8,23 +8,23 @@ command: > --param temperature=${{inputs.temperature}} --param top_k=${{inputs.top_k}} --param top_p=${{inputs.top_p}} - autora/doc/pipelines/main.py + autora/doc/pipelines/main.py code: ../src inputs: # Currently models are loading faster directly from HuggingFace vs Azure Blob Storage # model_dir: - # type: uri_folder - # path: azureml://datastores/workspaceblobstore/paths/base_models + # type: uri_folder + # path: azureml://datastores/workspaceblobstore/paths/base_models model_path: meta-llama/Llama-2-7b-chat-hf temperature: 0.7 top_p: 0.95 top_k: 40 sys_id: SYS_1 instruc_id: INSTR_SWEETP_1 -environment: +environment: image: mcr.microsoft.com/azureml/curated/acpt-pytorch-2.0-cuda11.7:21 conda_file: conda.yml display_name: autodoc_prediction compute: azureml:t4cluster experiment_name: prediction -description: | \ No newline at end of file +description: | From 9f1bb61d0766d94309d3ea3905370dd46a5bce37 Mon Sep 17 00:00:00 2001 From: Carlos Garcia Jurado Suarez Date: Fri, 5 Jan 2024 15:05:06 -0800 Subject: [PATCH 6/6] fix line endings 2 --- .gitignore | 2 +- .mypy.ini | 2 +- docs/Makefile | 1 - pyproject.toml | 2 +- src/.amlignore | 2 +- 5 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 1ba51fa..cce3a02 100644 --- a/.gitignore +++ b/.gitignore @@ -147,4 +147,4 @@ _results/ _html/ # mlflow output -mlruns/ \ No newline at end of file +mlruns/ diff --git a/.mypy.ini b/.mypy.ini index b2565b1..d9e2214 100644 --- a/.mypy.ini +++ b/.mypy.ini @@ -7,4 +7,4 @@ explicit_package_bases = True ignore_missing_imports = True [mypy-mlflow.*] -ignore_missing_imports = True \ No newline at end of file +ignore_missing_imports = True diff --git a/docs/Makefile b/docs/Makefile index a5622f1..70d22f1 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -28,4 +28,3 @@ clean: # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - diff --git a/pyproject.toml b/pyproject.toml index c28cb5a..e1c4382 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,4 +101,4 @@ include = ["src/autora"] packages = ["src/autora"] [project.scripts] -autodoc = "autora.doc.pipelines.main:app" \ No newline at end of file +autodoc = "autora.doc.pipelines.main:app" diff --git a/src/.amlignore b/src/.amlignore index f1ec22a..8e62855 100644 --- a/src/.amlignore +++ b/src/.amlignore @@ -1,3 +1,3 @@ mlruns/ .mypy_cache/ -__pycache__/ \ No newline at end of file +__pycache__/