Load models from huggingface instead of blob storage (#22)
carlosgjs authored Jan 5, 2024
1 parent 45bd148 commit 89899bb
Showing 15 changed files with 70 additions and 51 deletions.
6 changes: 3 additions & 3 deletions .github/actions/deps/action.yaml
@@ -13,9 +13,9 @@ runs:
        python-version: ${{inputs.python-version}}
    - name: Setup pip
      shell: sh
      run: |
        python3 -m ensurepip
        python3 -m pip install --upgrade pip
    - name: Install project
      shell: sh
      run: pip install ".[dev,train]"
8 changes: 4 additions & 4 deletions .github/pull_request_template.md
@@ -1,17 +1,17 @@
<!--
Thank you for your contribution to the repo :)
Pull Request (PR) Instructions:
Provide a general summary of your changes in the Title above. Fill out each section of the template, and replace the space with an `x` in all the boxes that apply. If you're unsure about any of these, don't hesitate to ask. We're here to help! Once you are satisfied with the pull request, click the "Create pull request" button to submit it for review.
Before submitting this PR, please ensure that your input and responses are entered in the designated space provided below each section to keep all project-related information organized and easily accessible.
How to link to a PR:
https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue
-->

## Change Description
<!---
Describe your changes in detail. In your description, you should answer questions like "Why is this change required? What problem does it solve?".
If it fixes an open issue, please link to the issue here. If this PR closes an issue, put the word 'closes' before the issue link to auto-close the issue when the PR is merged.
6 changes: 3 additions & 3 deletions .github/workflows/pre-commit-ci.yml
@@ -10,12 +10,12 @@ on:
jobs:
  pre-commit-ci:
    runs-on: ubuntu-latest
    env:
      SKIP: "check-lincc-frameworks-template-version,pytest-check,no-commit-to-branch,validate-pyproject,check-added-large-files,sphinx-build"
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Setup Dependencies
        uses: ./.github/actions/deps
        with:
@@ -24,4 +24,4 @@ jobs:
        with:
          extra_args: --from-ref ${{ github.event.pull_request.base.sha }} --to-ref ${{ github.event.pull_request.head.sha }}
      - uses: pre-commit-ci/[email protected]
        if: always()
1 change: 0 additions & 1 deletion .github/workflows/publish-to-pypi.yml
@@ -17,7 +17,6 @@ permissions:

jobs:
  deploy:
    runs-on: ubuntu-latest
    permissions:
      id-token: write
2 changes: 1 addition & 1 deletion .github/workflows/smoke-test.yml
@@ -10,7 +10,7 @@ on:
  # Runs this workflow automatically
  schedule:
    - cron: 45 6 * * *

  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:
2 changes: 1 addition & 1 deletion .gitignore
@@ -147,4 +147,4 @@ _results/
_html/

# mlflow output
mlruns/
2 changes: 1 addition & 1 deletion .mypy.ini
@@ -7,4 +7,4 @@ explicit_package_bases = True
ignore_missing_imports = True

[mypy-mlflow.*]
ignore_missing_imports = True
22 changes: 17 additions & 5 deletions .pre-commit-config.yaml
@@ -1,8 +1,20 @@
fail_fast: true
repos:

+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.4.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      # - id: check-docstring-first
+      - id: check-json
+      - id: check-yaml
+      - id: pretty-format-json
+        exclude: \.ipy(n|nb)$
+        args: ["--autofix", "--indent=2", "--no-sort-keys"]
+
  # Compare the local template version to the latest remote template version
  # This hook should always pass. It will print a message if the local version
  # is out of date.
  - repo: https://github.com/lincc-frameworks/pre-commit-hooks
    rev: v0.1.1
@@ -82,7 +94,7 @@ repos:

  # Run unit tests, verify that they pass. Note that coverage is run against
  # the ./src directory here because that is what will be committed. In the
  # github workflow script, the coverage is run against the installed package
  # and uploaded to Codecov by calling pytest like so:
  # `python -m pytest --cov=<package_name> --cov-report=xml`
@@ -95,9 +107,9 @@ repos:
        language: system
        pass_filenames: false
        always_run: true
  # Make sure Sphinx can build the documentation while explicitly omitting
  # notebooks from the docs, so users don't have to wait through the execution
  # of each notebook or each commit. By default, these will be checked in the
  # GitHub workflows.
  - repo: local
    hooks:
30 changes: 17 additions & 13 deletions README.md
@@ -11,8 +11,8 @@
[![codecov](https://codecov.io/gh/AutoResearch/autodoc/branch/main/graph/badge.svg)](https://codecov.io/gh/AutoResearch/autodoc)
<!-- [![Read the Docs](https://img.shields.io/readthedocs/autora-doc)](https://autora-doc.readthedocs.io/) -->

This project was automatically generated using the LINCC-Frameworks
[python-project-template](https://github.com/lincc-frameworks/python-project-template). For more information about the project template see the
[documentation](https://lincc-ppt.readthedocs.io/en/latest/).

## Dev Guide - Getting Started
@@ -31,24 +31,25 @@ Once you have created a new environment, you can install this project for local
development using the following commands:

```
->> pip install -e .'[dev]'
+>> pip install -e .'[dev,train]'
>> pre-commit install
>> conda install pandoc
```

Notes:
1) The single quotes around `'[dev]'` may not be required for your operating system.
+3) Look at `pyproject.toml` for other optional dependencies, e.g. you can do `pip install -e ."[dev,train,cuda]"` if you want to use CUDA.
2) `pre-commit install` will initialize pre-commit for this local repository, so
   that a set of tests will be run prior to completing a local commit. For more
   information, see the Python Project Template documentation on
   [pre-commit](https://lincc-ppt.readthedocs.io/en/latest/practices/precommit.html)
3) Install `pandoc` allows you to verify that automatic rendering of Jupyter notebooks
   into documentation for ReadTheDocs works as expected. For more information, see
   the Python Project Template documentation on
   [Sphinx and Python Notebooks](https://lincc-ppt.readthedocs.io/en/latest/practices/sphinx.html#python-notebooks)
## Running AzureML pipelines
## Running AzureML pipelines
This repo contains the evaluation and training pipelines for AutoDoc.

@@ -69,21 +70,24 @@ az account set --subscription "<your subscription name>"
az configure --defaults workspace=<aml workspace> group=<resource group> location=<location, e.g. westus3>
```

-### Uploading data
-
-Example:
-```sh
-az storage blob upload --account-name <account> --container <container> --file data/data.jsonl -n data/sweetpea/data.jsonl
-```
-
### Running jobs

Prediction
```sh
-az ml job create -f azureml/eval.yml --set display_name="Test prediction job" --web
+az ml job create -f azureml/eval.yml --set display_name="Test prediction job" --set environment_variables.HF_TOKEN=<your huggingface token> --web
```

Notes:
- `--name` will set the mlflow run id
- `--display_name` becomes the name in the experiment dashboard
- `--web` argument will pop-up a browser window for tracking the job.
+- The `HF_TOKEN` is required for gated repos, which need authentication


+### Uploading data
+
+Example:
+```sh
+az storage blob upload --account-name <account> --container <container> --file data/data.jsonl -n data/sweetpea/data.jsonl
+```
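For reference, the `HF_TOKEN` passed to the job is a Hugging Face access token; gated repositories such as `meta-llama/Llama-2-7b-chat-hf` refuse anonymous downloads. A minimal sketch of what loading a gated model with that token looks like (not the repo's actual loading code; on older `transformers` versions the keyword is `use_auth_token` instead of `token`):

```python
import os

from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumption: HF_TOKEN was injected into the job environment, e.g. via
# --set environment_variables.HF_TOKEN=<your huggingface token> as shown above.
hf_token = os.environ.get("HF_TOKEN")

model_id = "meta-llama/Llama-2-7b-chat-hf"  # gated repo on the Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_id, token=hf_token)
```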
2 changes: 1 addition & 1 deletion azureml/conda.yml
@@ -15,4 +15,4 @@ dependencies:
- xformers
- scipy
# This works, while installing from pytorch and cuda from conda does not
- torch==2.0.1
16 changes: 9 additions & 7 deletions azureml/eval.yml
@@ -1,8 +1,8 @@
$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
command: >
  python -m autora.doc.pipelines.main eval
  ${{inputs.data_dir}}/data.jsonl
-  --model-path ${{inputs.model_dir}}/llama-2-7b-chat-hf
+  --model-path ${{inputs.model_path}}
  --sys-id ${{inputs.sys_id}}
  --instruc-id ${{inputs.instruc_id}}
  --param temperature=${{inputs.temperature}}
@@ -11,11 +11,13 @@ command: >
code: ../src
inputs:
  data_dir:
    type: uri_folder
    path: azureml://datastores/workspaceblobstore/paths/data/sweetpea/
-  model_dir:
-    type: uri_folder
-    path: azureml://datastores/workspaceblobstore/paths/base_models
+  # Currently models are loading faster directly from HuggingFace vs Azure Blob Storage
+  # model_dir:
+  #   type: uri_folder
+  #   path: azureml://datastores/workspaceblobstore/paths/base_models
+  model_path: meta-llama/Llama-2-7b-chat-hf
  temperature: 0.7
  top_p: 0.95
  top_k: 40
@@ -35,4 +37,4 @@ environment: # azureml://registries/azureml/environments/acpt-pytorch-2.0-cuda11
display_name: autodoc_prediction
compute: azureml:t4cluster
experiment_name: evaluation
description: |
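The switch from `model_dir` (a mounted blob-storage folder) to `model_path` (a Hub model ID) works because `transformers` resolves a local directory and a Hub ID through the same call. A minimal sketch of that equivalence, as an assumption about the pipeline internals rather than its actual code:

```python
from transformers import AutoModelForCausalLM

# Old wiring: the job mounted azureml://datastores/workspaceblobstore/paths/base_models
# and passed a local path like <mount>/llama-2-7b-chat-hf as --model-path.
# New wiring: the job passes the Hub ID meta-llama/Llama-2-7b-chat-hf directly.
# Either string can be handed to from_pretrained unchanged:
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
```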
18 changes: 10 additions & 8 deletions azureml/generate.yml
@@ -1,28 +1,30 @@
$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
command: >
  python -m autora.doc.pipelines.main generate
-  --model-path ${{inputs.model_dir}}/llama-2-7b-chat-hf
+  --model-path ${{inputs.model_path}}
  --output ./outputs/output.txt
  --sys-id ${{inputs.sys_id}}
  --instruc-id ${{inputs.instruc_id}}
  --param temperature=${{inputs.temperature}}
  --param top_k=${{inputs.top_k}}
  --param top_p=${{inputs.top_p}}
  autora/doc/pipelines/main.py
code: ../src
inputs:
-  model_dir:
-    type: uri_folder
-    path: azureml://datastores/workspaceblobstore/paths/base_models
+  # Currently models are loading faster directly from HuggingFace vs Azure Blob Storage
+  # model_dir:
+  #   type: uri_folder
+  #   path: azureml://datastores/workspaceblobstore/paths/base_models
+  model_path: meta-llama/Llama-2-7b-chat-hf
  temperature: 0.7
  top_p: 0.95
  top_k: 40
  sys_id: SYS_1
  instruc_id: INSTR_SWEETP_1
environment:
  image: mcr.microsoft.com/azureml/curated/acpt-pytorch-2.0-cuda11.7:21
  conda_file: conda.yml
display_name: autodoc_prediction
compute: azureml:t4cluster
experiment_name: prediction
description: |
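The sampling settings (`temperature`, `top_k`, `top_p`) reach the entry point as repeated `--param key=value` options. A hypothetical helper sketching how such strings can be folded into typed keyword arguments; the actual CLI may parse them differently:

```python
from typing import Any, Dict, List


def parse_params(params: List[str]) -> Dict[str, Any]:
    """Turn ["temperature=0.7", "top_k=40"] into {"temperature": 0.7, "top_k": 40}."""
    parsed: Dict[str, Any] = {}
    for item in params:
        key, _, raw = item.partition("=")
        try:
            value: Any = int(raw)  # integers such as top_k=40
        except ValueError:
            try:
                value = float(raw)  # floats such as temperature=0.7
            except ValueError:
                value = raw  # anything else stays a string
        parsed[key] = value
    return parsed


print(parse_params(["temperature=0.7", "top_k=40", "top_p=0.95"]))
# {'temperature': 0.7, 'top_k': 40, 'top_p': 0.95}
```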
1 change: 0 additions & 1 deletion docs/Makefile
@@ -28,4 +28,3 @@ clean:
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -43,6 +43,7 @@ dev = [
    "ipython", # Also used in building notebooks into Sphinx
    "matplotlib", # Used in sample notebook intro_notebook.ipynb
    "ipykernel",
+    "hf_transfer",
]
train = [
    "jsonlines",
@@ -100,4 +101,4 @@ include = ["src/autora"]
packages = ["src/autora"]

[project.scripts]
autodoc = "autora.doc.pipelines.main:app"
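The new `hf_transfer` dev dependency is the Rust-based download backend for the Hugging Face Hub; it only takes effect when `HF_HUB_ENABLE_HF_TRANSFER` is set before `huggingface_hub` reads its configuration. A minimal sketch of pre-fetching the model with it (assumes `huggingface_hub` is installed alongside `hf_transfer`):

```python
import os

# Set the flag before importing huggingface_hub, which reads it at import time.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from huggingface_hub import snapshot_download

# Pre-download the model into the local cache; the token is needed for gated repos.
snapshot_download("meta-llama/Llama-2-7b-chat-hf", token=os.environ.get("HF_TOKEN"))
```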
2 changes: 1 addition & 1 deletion src/.amlignore
@@ -1,3 +1,3 @@
mlruns/
.mypy_cache/
__pycache__/