From 8b029a17968cfff7c8dac5a767d64c886e5b7e36 Mon Sep 17 00:00:00 2001
From: Carlos Garcia Jurado Suarez <carlosg@uw.edu>
Date: Wed, 13 Dec 2023 15:39:17 -0800
Subject: [PATCH 1/6] Load models from huggingface instead of blob storage

---
 .github/workflows/publish-to-pypi.yml |  3 ++-
 README.md                             | 22 +++++++++++++---------
 azureml/eval.yml                      | 10 ++++++----
 azureml/generate.yml                  | 10 ++++++----
 pyproject.toml                        |  1 +
 5 files changed, 28 insertions(+), 18 deletions(-)
diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml
index 3147bde..fb90505 100644
--- a/.github/workflows/publish-to-pypi.yml
+++ b/.github/workflows/publish-to-pypi.yml
@@ -17,7 +17,6 @@ permissions:
 
 jobs:
   deploy:
-
     runs-on: ubuntu-latest
     permissions:
       id-token: write
@@ -35,3 +34,5 @@ jobs:
       run: python -m build
     - name: Publish package
       uses: pypa/gh-action-pypi-publish@release/v1
+      with:
+        password: ${{ secrets.PYPI_API_TOKEN }}      
diff --git a/README.md b/README.md
index e2e66ad..8640545 100644
--- a/README.md
+++ b/README.md
@@ -31,13 +31,14 @@ Once you have created a new environment, you can install this project for local
 development using the following commands:
 
 ```
->> pip install -e .'[dev]'
+>> pip install -e .'[dev,train]'
 >> pre-commit install
 >> conda install pandoc
 ```
 
 Notes:
 1) The single quotes around `'[dev]'` may not be required for your operating system.
+3) Look at `pyproject.toml` for other optional dependencies, e.g. you can do `pip install -e ."[dev,train,cuda]"` if you want to use CUDA.
 2) `pre-commit install` will initialize pre-commit for this local repository, so
    that a set of tests will be run prior to completing a local commit. For more
    information, see the Python Project Template documentation on 
@@ -69,21 +70,24 @@ az account set --subscription "<your subscription name>"
 az configure --defaults workspace=<aml workspace> group=<resource group> location=<location, e.g. westus3>
 ```
 
-### Uploading data
-
-Example:
-```sh
-az storage blob upload  --account-name <account> --container <container>> --file data/data.jsonl -n data/sweetpea/data.jsonl
-```
 
 ### Running jobs
 
 Prediction
 ```sh
-az ml job create -f azureml/eval.yml  --set display_name="Test prediction job" --web
+az ml job create -f azureml/eval.yml  --set display_name="Test prediction job" --set environment_variables.HF_TOKEN=<your huggingface token> --web
 ```
 
 Notes:
 - `--name` will set the mlflow run id
 - `--display_name` becomes the name in the experiment dashboard
-- `--web` argument will pop-up a browser window for tracking the job.
\ No newline at end of file
+- `--web` argument will pop-up a browser window for tracking the job.
+- `HF_TOKEN` is your Hugging Face access token; it is required to download models from gated repos, which need authentication.
+
+
+### Uploading data
+
+Example:
+```sh
+az storage blob upload  --account-name <account> --container <container>> --file data/data.jsonl -n data/sweetpea/data.jsonl
+ ```
\ No newline at end of file
diff --git a/azureml/eval.yml b/azureml/eval.yml
index ea6953b..551bd9c 100644
--- a/azureml/eval.yml
+++ b/azureml/eval.yml
@@ -2,7 +2,7 @@ $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
 command: >
   python -m autora.doc.pipelines.main eval 
   ${{inputs.data_dir}}/data.jsonl
-  --model-path ${{inputs.model_dir}}/llama-2-7b-chat-hf
+  --model-path ${{inputs.model_path}}
   --sys-id ${{inputs.sys_id}}
   --instruc-id ${{inputs.instruc_id}}
   --param temperature=${{inputs.temperature}}
@@ -13,9 +13,11 @@ inputs:
   data_dir:
     type: uri_folder 
     path: azureml://datastores/workspaceblobstore/paths/data/sweetpea/
-  model_dir:
-    type: uri_folder 
-    path: azureml://datastores/workspaceblobstore/paths/base_models    
+  # Currently models are loading faster directly from HuggingFace vs Azure Blob Storage
+  # model_dir:
+  #   type: uri_folder 
+  #   path: azureml://datastores/workspaceblobstore/paths/base_models    
+  model_path: meta-llama/Llama-2-7b-chat-hf
   temperature: 0.7
   top_p: 0.95
   top_k: 40
diff --git a/azureml/generate.yml b/azureml/generate.yml
index d849fcd..fedf7f5 100644
--- a/azureml/generate.yml
+++ b/azureml/generate.yml
@@ -1,7 +1,7 @@
 $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
 command: >
   python -m autora.doc.pipelines.main generate 
-  --model-path ${{inputs.model_dir}}/llama-2-7b-chat-hf
+  --model-path ${{inputs.model_path}}
   --output ./outputs/output.txt
   --sys-id ${{inputs.sys_id}}
   --instruc-id ${{inputs.instruc_id}}
@@ -11,9 +11,11 @@ command: >
   autora/doc/pipelines/main.py    
 code: ../src
 inputs:
-  model_dir:
-    type: uri_folder 
-    path: azureml://datastores/workspaceblobstore/paths/base_models    
+  # Currently models are loading faster directly from HuggingFace vs Azure Blob Storage
+  # model_dir:
+  #   type: uri_folder 
+  #   path: azureml://datastores/workspaceblobstore/paths/base_models    
+  model_path: meta-llama/Llama-2-7b-chat-hf
   temperature: 0.7
   top_p: 0.95
   top_k: 40
diff --git a/pyproject.toml b/pyproject.toml
index 422c8ff..c28cb5a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,6 +43,7 @@ dev = [
     "ipython", # Also used in building notebooks into Sphinx
     "matplotlib", # Used in sample notebook intro_notebook.ipynb
     "ipykernel",
+    "hf_transfer",
 ]
 train = [
     "jsonlines",

From a312982e9059db764a02ae314c2feeeafe04fe56 Mon Sep 17 00:00:00 2001
From: Carlos Garcia Jurado Suarez <carlosg@uw.edu>
Date: Fri, 5 Jan 2024 14:32:40 -0800
Subject: [PATCH 2/6] Use trusted pypi publisher

---
 .github/workflows/publish-to-pypi.yml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml
index fb90505..279e8fc 100644
--- a/.github/workflows/publish-to-pypi.yml
+++ b/.github/workflows/publish-to-pypi.yml
@@ -33,6 +33,4 @@ jobs:
     - name: Build package
       run: python -m build
     - name: Publish package
-      uses: pypa/gh-action-pypi-publish@release/v1
-      with:
-        password: ${{ secrets.PYPI_API_TOKEN }}      
+      uses: pypa/gh-action-pypi-publish@release/v1
\ No newline at end of file

From 50ae50b882a36ba097583365b13b32f43de9faef Mon Sep 17 00:00:00 2001
From: Carlos Garcia Jurado Suarez <carlosg@uw.edu>
Date: Fri, 5 Jan 2024 14:42:47 -0800
Subject: [PATCH 3/6] Add missing newline at end of publish-to-pypi.yml

---
 .github/workflows/publish-to-pypi.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml
index 279e8fc..a25fc7d 100644
--- a/.github/workflows/publish-to-pypi.yml
+++ b/.github/workflows/publish-to-pypi.yml
@@ -33,4 +33,4 @@ jobs:
     - name: Build package
       run: python -m build
     - name: Publish package
-      uses: pypa/gh-action-pypi-publish@release/v1
\ No newline at end of file
+      uses: pypa/gh-action-pypi-publish@release/v1

From 41f9a643005ded70f8e89de8a381a005dfd22ab3 Mon Sep 17 00:00:00 2001
From: Carlos Garcia Jurado Suarez <carlosg@uw.edu>
Date: Fri, 5 Jan 2024 14:46:32 -0800
Subject: [PATCH 4/6] add formatting pre-commit checks

---
 .pre-commit-config.yaml | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 029a6e6..cc58e8e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,8 +1,20 @@
 fail_fast: true
 repos:
 
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.4.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      # - id: check-docstring-first
+      - id: check-json
+      - id: check-yaml
+      - id: pretty-format-json
+        exclude: \.ipy(n|nb)$
+        args: ["--autofix", "--indent=2", "--no-sort-keys"]
+
     # Compare the local template version to the latest remote template version
-    # This hook should always pass. It will print a message if the local version 
+    # This hook should always pass. It will print a message if the local version
     # is out of date.
   - repo: https://github.com/lincc-frameworks/pre-commit-hooks
     rev: v0.1.1
@@ -82,7 +94,7 @@ repos:
 
 
     # Run unit tests, verify that they pass. Note that coverage is run against
-    # the ./src directory here because that is what will be committed. In the 
+    # the ./src directory here because that is what will be committed. In the
     # github workflow script, the coverage is run against the installed package
     # and uploaded to Codecov by calling pytest like so:
     # `python -m pytest --cov=<package_name> --cov-report=xml`
@@ -95,9 +107,9 @@ repos:
         language: system
         pass_filenames: false
         always_run: true
-    # Make sure Sphinx can build the documentation while explicitly omitting 
-    # notebooks from the docs, so users don't have to wait through the execution 
-    # of each notebook or each commit. By default, these will be checked in the 
+    # Make sure Sphinx can build the documentation while explicitly omitting
+    # notebooks from the docs, so users don't have to wait through the execution
+    # of each notebook or each commit. By default, these will be checked in the
     # GitHub workflows.
   - repo: local
     hooks:

From ea4f6dc91e047fbfbdbd77b3250b49bc8440f2a2 Mon Sep 17 00:00:00 2001
From: Carlos Garcia Jurado Suarez <carlosg@uw.edu>
Date: Fri, 5 Jan 2024 14:57:40 -0800
Subject: [PATCH 5/6] Strip trailing whitespace and add missing end-of-file newlines

---
 .github/actions/deps/action.yaml    |  6 +++---
 .github/pull_request_template.md    |  8 ++++----
 .github/workflows/pre-commit-ci.yml |  6 +++---
 .github/workflows/smoke-test.yml    |  2 +-
 README.md                           | 10 +++++-----
 azureml/conda.yml                   |  2 +-
 azureml/eval.yml                    | 10 +++++-----
 azureml/generate.yml                | 12 ++++++------
 8 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/.github/actions/deps/action.yaml b/.github/actions/deps/action.yaml
index d38d141..04b8c5b 100644
--- a/.github/actions/deps/action.yaml
+++ b/.github/actions/deps/action.yaml
@@ -13,9 +13,9 @@ runs:
         python-version: ${{inputs.python-version}}
     - name: Setup pip
       shell: sh
-      run: |        
+      run: |
         python3 -m ensurepip
         python3 -m pip install --upgrade pip
-    - name: Install project 
+    - name: Install project
       shell: sh
-      run: pip install ".[dev,train]"
\ No newline at end of file
+      run: pip install ".[dev,train]"
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 76e043c..2745f42 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,17 +1,17 @@
-<!-- 
+<!--
 Thank you for your contribution to the repo :)
 
 Pull Request (PR) Instructions:
 Provide a general summary of your changes in the Title above. Fill out each section of the template, and replace the space with an `x` in all the boxes that apply. If you're unsure about any of these, don't hesitate to ask. We're here to help! Once you are satisfied with the pull request, click the "Create pull request" button to submit it for review.
 
 Before submitting this PR, please ensure that your input and responses are entered in the designated space provided below each section to keep all project-related information organized and easily accessible.
- 
+
 How to link to a PR:
-https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue 
+https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue
 -->
 
 ## Change Description
-<!--- 
+<!---
 Describe your changes in detail. In your description, you should answer questions like "Why is this change required? What problem does it solve?".
 
 If it fixes an open issue, please link to the issue here. If this PR closes an issue, put the word 'closes' before the issue link to auto-close the issue when the PR is merged.
diff --git a/.github/workflows/pre-commit-ci.yml b/.github/workflows/pre-commit-ci.yml
index dc6cdf6..936a9a4 100644
--- a/.github/workflows/pre-commit-ci.yml
+++ b/.github/workflows/pre-commit-ci.yml
@@ -10,12 +10,12 @@ on:
 jobs:
   pre-commit-ci:
     runs-on: ubuntu-latest
-    env: 
+    env:
       SKIP: "check-lincc-frameworks-template-version,pytest-check,no-commit-to-branch,validate-pyproject,check-added-large-files,sphinx-build"
     steps:
     - uses: actions/checkout@v3
       with:
-        fetch-depth: 0 
+        fetch-depth: 0
     - name: Setup Dependencies
       uses: ./.github/actions/deps
       with:
@@ -24,4 +24,4 @@ jobs:
       with:
         extra_args: --from-ref ${{ github.event.pull_request.base.sha }} --to-ref ${{ github.event.pull_request.head.sha }}
     - uses: pre-commit-ci/lite-action@v1.0.1
-      if: always()
\ No newline at end of file
+      if: always()
diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml
index ed5b25d..7fcc6bf 100644
--- a/.github/workflows/smoke-test.yml
+++ b/.github/workflows/smoke-test.yml
@@ -10,7 +10,7 @@ on:
   # Runs this workflow automatically
   schedule:
     - cron: 45 6 * * *
-    
+
   # Allows you to run this workflow manually from the Actions tab
   workflow_dispatch:
 
diff --git a/README.md b/README.md
index 8640545..d51e74b 100644
--- a/README.md
+++ b/README.md
@@ -11,8 +11,8 @@
 [![codecov](https://codecov.io/gh/AutoResearch/autodoc/branch/main/graph/badge.svg)](https://codecov.io/gh/AutoResearch/autodoc)
 <!-- [![Read the Docs](https://img.shields.io/readthedocs/autora-doc)](https://autora-doc.readthedocs.io/) -->
 
-This project was automatically generated using the LINCC-Frameworks 
-[python-project-template](https://github.com/lincc-frameworks/python-project-template). For more information about the project template see the 
+This project was automatically generated using the LINCC-Frameworks
+[python-project-template](https://github.com/lincc-frameworks/python-project-template). For more information about the project template see the
 [documentation](https://lincc-ppt.readthedocs.io/en/latest/).
 
 ## Dev Guide - Getting Started
@@ -41,7 +41,7 @@ Notes:
 3) Look at `pyproject.toml` for other optional dependencies, e.g. you can do `pip install -e ."[dev,train,cuda]"` if you want to use CUDA.
 2) `pre-commit install` will initialize pre-commit for this local repository, so
    that a set of tests will be run prior to completing a local commit. For more
-   information, see the Python Project Template documentation on 
+   information, see the Python Project Template documentation on
    [pre-commit](https://lincc-ppt.readthedocs.io/en/latest/practices/precommit.html)
 3) Install `pandoc` allows you to verify that automatic rendering of Jupyter notebooks
    into documentation for ReadTheDocs works as expected. For more information, see
@@ -49,7 +49,7 @@ Notes:
    [Sphinx and Python Notebooks](https://lincc-ppt.readthedocs.io/en/latest/practices/sphinx.html#python-notebooks)
 
 
-## Running AzureML pipelines 
+## Running AzureML pipelines
 
 This repo contains the evaluation and training pipelines for AutoDoc.
 
@@ -90,4 +90,4 @@ Notes:
 Example:
 ```sh
 az storage blob upload  --account-name <account> --container <container>> --file data/data.jsonl -n data/sweetpea/data.jsonl
- ```
\ No newline at end of file
+ ```
diff --git a/azureml/conda.yml b/azureml/conda.yml
index f772397..fab3656 100644
--- a/azureml/conda.yml
+++ b/azureml/conda.yml
@@ -15,4 +15,4 @@ dependencies:
     - xformers
     - scipy
     # This works, while installing from pytorch and cuda from conda does not
-    - torch==2.0.1    
\ No newline at end of file
+    - torch==2.0.1
diff --git a/azureml/eval.yml b/azureml/eval.yml
index 551bd9c..1f31ea4 100644
--- a/azureml/eval.yml
+++ b/azureml/eval.yml
@@ -1,6 +1,6 @@
 $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
 command: >
-  python -m autora.doc.pipelines.main eval 
+  python -m autora.doc.pipelines.main eval
   ${{inputs.data_dir}}/data.jsonl
   --model-path ${{inputs.model_path}}
   --sys-id ${{inputs.sys_id}}
@@ -11,12 +11,12 @@ command: >
 code: ../src
 inputs:
   data_dir:
-    type: uri_folder 
+    type: uri_folder
     path: azureml://datastores/workspaceblobstore/paths/data/sweetpea/
   # Currently models are loading faster directly from HuggingFace vs Azure Blob Storage
   # model_dir:
-  #   type: uri_folder 
-  #   path: azureml://datastores/workspaceblobstore/paths/base_models    
+  #   type: uri_folder
+  #   path: azureml://datastores/workspaceblobstore/paths/base_models
   model_path: meta-llama/Llama-2-7b-chat-hf
   temperature: 0.7
   top_p: 0.95
@@ -37,4 +37,4 @@ environment: # azureml://registries/azureml/environments/acpt-pytorch-2.0-cuda11
 display_name: autodoc_prediction
 compute: azureml:t4cluster
 experiment_name: evaluation
-description: |
\ No newline at end of file
+description: |
diff --git a/azureml/generate.yml b/azureml/generate.yml
index fedf7f5..7e3f902 100644
--- a/azureml/generate.yml
+++ b/azureml/generate.yml
@@ -1,6 +1,6 @@
 $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
 command: >
-  python -m autora.doc.pipelines.main generate 
+  python -m autora.doc.pipelines.main generate
   --model-path ${{inputs.model_path}}
   --output ./outputs/output.txt
   --sys-id ${{inputs.sys_id}}
@@ -8,23 +8,23 @@ command: >
   --param temperature=${{inputs.temperature}}
   --param top_k=${{inputs.top_k}}
   --param top_p=${{inputs.top_p}}
-  autora/doc/pipelines/main.py    
+  autora/doc/pipelines/main.py
 code: ../src
 inputs:
   # Currently models are loading faster directly from HuggingFace vs Azure Blob Storage
   # model_dir:
-  #   type: uri_folder 
-  #   path: azureml://datastores/workspaceblobstore/paths/base_models    
+  #   type: uri_folder
+  #   path: azureml://datastores/workspaceblobstore/paths/base_models
   model_path: meta-llama/Llama-2-7b-chat-hf
   temperature: 0.7
   top_p: 0.95
   top_k: 40
   sys_id: SYS_1
   instruc_id: INSTR_SWEETP_1
-environment: 
+environment:
   image: mcr.microsoft.com/azureml/curated/acpt-pytorch-2.0-cuda11.7:21
   conda_file: conda.yml
 display_name: autodoc_prediction
 compute: azureml:t4cluster
 experiment_name: prediction
-description: |
\ No newline at end of file
+description: |

From 9f1bb61d0766d94309d3ea3905370dd46a5bce37 Mon Sep 17 00:00:00 2001
From: Carlos Garcia Jurado Suarez <carlosg@uw.edu>
Date: Fri, 5 Jan 2024 15:05:06 -0800
Subject: [PATCH 6/6] Normalize end-of-file newlines in remaining files

---
 .gitignore     | 2 +-
 .mypy.ini      | 2 +-
 docs/Makefile  | 1 -
 pyproject.toml | 2 +-
 src/.amlignore | 2 +-
 5 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/.gitignore b/.gitignore
index 1ba51fa..cce3a02 100644
--- a/.gitignore
+++ b/.gitignore
@@ -147,4 +147,4 @@ _results/
 _html/
 
 # mlflow output
-mlruns/
\ No newline at end of file
+mlruns/
diff --git a/.mypy.ini b/.mypy.ini
index b2565b1..d9e2214 100644
--- a/.mypy.ini
+++ b/.mypy.ini
@@ -7,4 +7,4 @@ explicit_package_bases = True
 ignore_missing_imports = True
 
 [mypy-mlflow.*]
-ignore_missing_imports = True
\ No newline at end of file
+ignore_missing_imports = True
diff --git a/docs/Makefile b/docs/Makefile
index a5622f1..70d22f1 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -28,4 +28,3 @@ clean:
 # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
 %: Makefile
 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
diff --git a/pyproject.toml b/pyproject.toml
index c28cb5a..e1c4382 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -101,4 +101,4 @@ include = ["src/autora"]
 packages = ["src/autora"]
 
 [project.scripts]
-autodoc = "autora.doc.pipelines.main:app"
\ No newline at end of file
+autodoc = "autora.doc.pipelines.main:app"
diff --git a/src/.amlignore b/src/.amlignore
index f1ec22a..8e62855 100644
--- a/src/.amlignore
+++ b/src/.amlignore
@@ -1,3 +1,3 @@
 mlruns/
 .mypy_cache/
-__pycache__/
\ No newline at end of file
+__pycache__/