Load models from huggingface instead of blob storage (#22)
carlosgjs authored Jan 5, 2024
1 parent 45bd148 commit 89899bb
Showing 15 changed files with 70 additions and 51 deletions.
6 changes: 3 additions & 3 deletions .github/actions/deps/action.yaml
@@ -13,9 +13,9 @@ runs:
        python-version: ${{inputs.python-version}}
    - name: Setup pip
      shell: sh
      run: |
        python3 -m ensurepip
        python3 -m pip install --upgrade pip
    - name: Install project
      shell: sh
      run: pip install ".[dev,train]"
8 changes: 4 additions & 4 deletions .github/pull_request_template.md
@@ -1,17 +1,17 @@
<!--
Thank you for your contribution to the repo :)
Pull Request (PR) Instructions:
Provide a general summary of your changes in the Title above. Fill out each section of the template, and replace the space with an `x` in all the boxes that apply. If you're unsure about any of these, don't hesitate to ask. We're here to help! Once you are satisfied with the pull request, click the "Create pull request" button to submit it for review.
Before submitting this PR, please ensure that your input and responses are entered in the designated space provided below each section to keep all project-related information organized and easily accessible.
How to link to a PR:
https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue
-->

## Change Description
<!---
Describe your changes in detail. In your description, you should answer questions like "Why is this change required? What problem does it solve?".
If it fixes an open issue, please link to the issue here. If this PR closes an issue, put the word 'closes' before the issue link to auto-close the issue when the PR is merged.
6 changes: 3 additions & 3 deletions .github/workflows/pre-commit-ci.yml
@@ -10,12 +10,12 @@ on:
jobs:
  pre-commit-ci:
    runs-on: ubuntu-latest
    env:
      SKIP: "check-lincc-frameworks-template-version,pytest-check,no-commit-to-branch,validate-pyproject,check-added-large-files,sphinx-build"
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Setup Dependencies
        uses: ./.github/actions/deps
        with:
@@ -24,4 +24,4 @@ jobs:
        with:
          extra_args: --from-ref ${{ github.event.pull_request.base.sha }} --to-ref ${{ github.event.pull_request.head.sha }}
      - uses: pre-commit-ci/[email protected]
        if: always()
1 change: 0 additions & 1 deletion .github/workflows/publish-to-pypi.yml
@@ -17,7 +17,6 @@ permissions:

jobs:
  deploy:
    runs-on: ubuntu-latest
    permissions:
      id-token: write
2 changes: 1 addition & 1 deletion .github/workflows/smoke-test.yml
@@ -10,7 +10,7 @@ on:
  # Runs this workflow automatically
  schedule:
    - cron: 45 6 * * *

  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:
2 changes: 1 addition & 1 deletion .gitignore
@@ -147,4 +147,4 @@ _results/
_html/

# mlflow output
mlruns/
2 changes: 1 addition & 1 deletion .mypy.ini
@@ -7,4 +7,4 @@ explicit_package_bases = True
ignore_missing_imports = True

[mypy-mlflow.*]
ignore_missing_imports = True
22 changes: 17 additions & 5 deletions .pre-commit-config.yaml
@@ -1,8 +1,20 @@
fail_fast: true
repos:

+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.4.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      # - id: check-docstring-first
+      - id: check-json
+      - id: check-yaml
+      - id: pretty-format-json
+        exclude: \.ipy(n|nb)$
+        args: ["--autofix", "--indent=2", "--no-sort-keys"]
+
  # Compare the local template version to the latest remote template version
  # This hook should always pass. It will print a message if the local version
  # is out of date.
  - repo: https://github.com/lincc-frameworks/pre-commit-hooks
    rev: v0.1.1
@@ -82,7 +94,7 @@ repos:

  # Run unit tests, verify that they pass. Note that coverage is run against
  # the ./src directory here because that is what will be committed. In the
  # github workflow script, the coverage is run against the installed package
  # and uploaded to Codecov by calling pytest like so:
  # `python -m pytest --cov=<package_name> --cov-report=xml`
@@ -95,9 +107,9 @@ repos:
        language: system
        pass_filenames: false
        always_run: true
  # Make sure Sphinx can build the documentation while explicitly omitting
  # notebooks from the docs, so users don't have to wait through the execution
  # of each notebook or each commit. By default, these will be checked in the
  # GitHub workflows.
  - repo: local
    hooks:
30 changes: 17 additions & 13 deletions README.md
@@ -11,8 +11,8 @@
[![codecov](https://codecov.io/gh/AutoResearch/autodoc/branch/main/graph/badge.svg)](https://codecov.io/gh/AutoResearch/autodoc)
<!-- [![Read the Docs](https://img.shields.io/readthedocs/autora-doc)](https://autora-doc.readthedocs.io/) -->

This project was automatically generated using the LINCC-Frameworks
[python-project-template](https://github.com/lincc-frameworks/python-project-template). For more information about the project template see the
[documentation](https://lincc-ppt.readthedocs.io/en/latest/).

## Dev Guide - Getting Started
@@ -31,24 +31,25 @@ Once you have created a new environment, you can install this project for local
development using the following commands:

```
->> pip install -e .'[dev]'
+>> pip install -e .'[dev,train]'
>> pre-commit install
>> conda install pandoc
```

Notes:
1) The single quotes around `'[dev]'` may not be required for your operating system.
+3) Look at `pyproject.toml` for other optional dependencies, e.g. you can do `pip install -e ."[dev,train,cuda]"` if you want to use CUDA.
2) `pre-commit install` will initialize pre-commit for this local repository, so
   that a set of tests will be run prior to completing a local commit. For more
   information, see the Python Project Template documentation on
   [pre-commit](https://lincc-ppt.readthedocs.io/en/latest/practices/precommit.html)
3) Install `pandoc` allows you to verify that automatic rendering of Jupyter notebooks
   into documentation for ReadTheDocs works as expected. For more information, see
   the Python Project Template documentation on
   [Sphinx and Python Notebooks](https://lincc-ppt.readthedocs.io/en/latest/practices/sphinx.html#python-notebooks)
## Running AzureML pipelines
## Running AzureML pipelines
This repo contains the evaluation and training pipelines for AutoDoc.

@@ -69,21 +70,24 @@ az account set --subscription "<your subscription name>"
az configure --defaults workspace=<aml workspace> group=<resource group> location=<location, e.g. westus3>
```

-### Uploading data
-
-Example:
-```sh
-az storage blob upload --account-name <account> --container <container> --file data/data.jsonl -n data/sweetpea/data.jsonl
-```
-
### Running jobs

Prediction
```sh
-az ml job create -f azureml/eval.yml --set display_name="Test prediction job" --web
+az ml job create -f azureml/eval.yml --set display_name="Test prediction job" --set environment_variables.HF_TOKEN=<your huggingface token> --web
```

Notes:
- `--name` will set the mlflow run id
- `--display_name` becomes the name in the experiment dashboard
- `--web` argument will pop-up a browser window for tracking the job.
+- The `HF_TOKEN` is required for gated repos, which need authentication


+### Uploading data
+
+Example:
+```sh
+az storage blob upload --account-name <account> --container <container> --file data/data.jsonl -n data/sweetpea/data.jsonl
+```
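For reference, the `HF_TOKEN` passed to the job is a Hugging Face access token; gated repositories such as `meta-llama/Llama-2-7b-chat-hf` refuse anonymous downloads. A minimal sketch of what loading a gated model with that token looks like (not the repo's actual loading code; on older `transformers` versions the keyword is `use_auth_token` instead of `token`):

```python
import os

from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumption: HF_TOKEN was injected into the job environment, e.g. via
# --set environment_variables.HF_TOKEN=<your huggingface token> as shown above.
hf_token = os.environ.get("HF_TOKEN")

model_id = "meta-llama/Llama-2-7b-chat-hf"  # gated repo on the Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_id, token=hf_token)
```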
2 changes: 1 addition & 1 deletion azureml/conda.yml
@@ -15,4 +15,4 @@ dependencies:
- xformers
- scipy
# This works, while installing from pytorch and cuda from conda does not
- torch==2.0.1
16 changes: 9 additions & 7 deletions azureml/eval.yml
@@ -1,8 +1,8 @@
$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
command: >
  python -m autora.doc.pipelines.main eval
  ${{inputs.data_dir}}/data.jsonl
-  --model-path ${{inputs.model_dir}}/llama-2-7b-chat-hf
+  --model-path ${{inputs.model_path}}
  --sys-id ${{inputs.sys_id}}
  --instruc-id ${{inputs.instruc_id}}
  --param temperature=${{inputs.temperature}}
@@ -11,11 +11,13 @@ command: >
code: ../src
inputs:
  data_dir:
    type: uri_folder
    path: azureml://datastores/workspaceblobstore/paths/data/sweetpea/
-  model_dir:
-    type: uri_folder
-    path: azureml://datastores/workspaceblobstore/paths/base_models
+  # Currently models are loading faster directly from HuggingFace vs Azure Blob Storage
+  # model_dir:
+  #   type: uri_folder
+  #   path: azureml://datastores/workspaceblobstore/paths/base_models
+  model_path: meta-llama/Llama-2-7b-chat-hf
  temperature: 0.7
  top_p: 0.95
  top_k: 40
@@ -35,4 +37,4 @@ environment: # azureml://registries/azureml/environments/acpt-pytorch-2.0-cuda11
display_name: autodoc_prediction
compute: azureml:t4cluster
experiment_name: evaluation
description: |
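The switch from `model_dir` (a mounted blob-storage folder) to `model_path` (a Hub model ID) works because `transformers` resolves a local directory and a Hub ID through the same call. A minimal sketch of that equivalence, as an assumption about the pipeline internals rather than its actual code:

```python
from transformers import AutoModelForCausalLM

# Old wiring: the job mounted azureml://datastores/workspaceblobstore/paths/base_models
# and passed a local path like <mount>/llama-2-7b-chat-hf as --model-path.
# New wiring: the job passes the Hub ID meta-llama/Llama-2-7b-chat-hf directly.
# Either string can be handed to from_pretrained unchanged:
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
```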
18 changes: 10 additions & 8 deletions azureml/generate.yml
@@ -1,28 +1,30 @@
$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
command: >
  python -m autora.doc.pipelines.main generate
-  --model-path ${{inputs.model_dir}}/llama-2-7b-chat-hf
+  --model-path ${{inputs.model_path}}
  --output ./outputs/output.txt
  --sys-id ${{inputs.sys_id}}
  --instruc-id ${{inputs.instruc_id}}
  --param temperature=${{inputs.temperature}}
  --param top_k=${{inputs.top_k}}
  --param top_p=${{inputs.top_p}}
  autora/doc/pipelines/main.py
code: ../src
inputs:
-  model_dir:
-    type: uri_folder
-    path: azureml://datastores/workspaceblobstore/paths/base_models
+  # Currently models are loading faster directly from HuggingFace vs Azure Blob Storage
+  # model_dir:
+  #   type: uri_folder
+  #   path: azureml://datastores/workspaceblobstore/paths/base_models
+  model_path: meta-llama/Llama-2-7b-chat-hf
  temperature: 0.7
  top_p: 0.95
  top_k: 40
  sys_id: SYS_1
  instruc_id: INSTR_SWEETP_1
environment:
  image: mcr.microsoft.com/azureml/curated/acpt-pytorch-2.0-cuda11.7:21
  conda_file: conda.yml
display_name: autodoc_prediction
compute: azureml:t4cluster
experiment_name: prediction
description: |
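The sampling settings (`temperature`, `top_k`, `top_p`) reach the entry point as repeated `--param key=value` options. A hypothetical helper sketching how such strings can be folded into typed keyword arguments; the actual CLI may parse them differently:

```python
from typing import Any, Dict, List


def parse_params(params: List[str]) -> Dict[str, Any]:
    """Turn ["temperature=0.7", "top_k=40"] into {"temperature": 0.7, "top_k": 40}."""
    parsed: Dict[str, Any] = {}
    for item in params:
        key, _, raw = item.partition("=")
        try:
            value: Any = int(raw)  # integers such as top_k=40
        except ValueError:
            try:
                value = float(raw)  # floats such as temperature=0.7
            except ValueError:
                value = raw  # anything else stays a string
        parsed[key] = value
    return parsed


print(parse_params(["temperature=0.7", "top_k=40", "top_p=0.95"]))
# {'temperature': 0.7, 'top_k': 40, 'top_p': 0.95}
```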
1 change: 0 additions & 1 deletion docs/Makefile
@@ -28,4 +28,3 @@ clean:
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -43,6 +43,7 @@ dev = [
    "ipython", # Also used in building notebooks into Sphinx
    "matplotlib", # Used in sample notebook intro_notebook.ipynb
    "ipykernel",
+    "hf_transfer",
]
train = [
    "jsonlines",
@@ -100,4 +101,4 @@ include = ["src/autora"]
packages = ["src/autora"]

[project.scripts]
autodoc = "autora.doc.pipelines.main:app"
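The new `hf_transfer` dev dependency is the Rust-based download backend for the Hugging Face Hub; it only takes effect when `HF_HUB_ENABLE_HF_TRANSFER` is set before `huggingface_hub` reads its configuration. A minimal sketch of pre-fetching the model with it (assumes `huggingface_hub` is installed alongside `hf_transfer`):

```python
import os

# Set the flag before importing huggingface_hub, which reads it at import time.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from huggingface_hub import snapshot_download

# Pre-download the model into the local cache; the token is needed for gated repos.
snapshot_download("meta-llama/Llama-2-7b-chat-hf", token=os.environ.get("HF_TOKEN"))
```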
2 changes: 1 addition & 1 deletion src/.amlignore
@@ -1,3 +1,3 @@
mlruns/
.mypy_cache/
__pycache__/