diff --git a/README.md b/README.md index 127c2f4..e2e66ad 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,15 @@ # AutoDoc +[![ssec](https://img.shields.io/badge/SSEC-Project-purple?logo=&style=plastic)](https://escience.washington.edu/software-engineering/ssec/) + [![Template](https://img.shields.io/badge/Template-LINCC%20Frameworks%20Python%20Project%20Template-brightgreen)](https://lincc-ppt.readthedocs.io/en/latest/) -[![PyPI](https://img.shields.io/pypi/v/autora-doc?color=blue&logo=pypi&logoColor=white)](https://pypi.org/project/autora-doc/) + [![GitHub Workflow Status](https://github.com/autoresearch/autodoc/actions/workflows/smoke-test.yml/badge.svg)](https://github.com/AutoResearch/autodoc/actions/workflows/smoke-test.yml) [![codecov](https://codecov.io/gh/AutoResearch/autodoc/branch/main/graph/badge.svg)](https://codecov.io/gh/AutoResearch/autodoc) -[![Read the Docs](https://img.shields.io/readthedocs/autora-doc)](https://autora-doc.readthedocs.io/) + This project was automatically generated using the LINCC-Frameworks [python-project-template](https://github.com/lincc-frameworks/python-project-template). For more information about the project template see the diff --git a/azureml/eval.yml b/azureml/eval.yml index a2f72b6..ea6953b 100644 --- a/azureml/eval.yml +++ b/azureml/eval.yml @@ -2,9 +2,12 @@ $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json command: > python -m autora.doc.pipelines.main eval ${{inputs.data_dir}}/data.jsonl - ${{inputs.model_dir}}/llama-2-7b-chat-hf - SYS_1 - INSTR_SWEETP_1 + --model-path ${{inputs.model_dir}}/llama-2-7b-chat-hf + --sys-id ${{inputs.sys_id}} + --instruc-id ${{inputs.instruc_id}} + --param temperature=${{inputs.temperature}} + --param top_k=${{inputs.top_k}} + --param top_p=${{inputs.top_p}} code: ../src inputs: data_dir: @@ -13,6 +16,11 @@ inputs: model_dir: type: uri_folder path: azureml://datastores/workspaceblobstore/paths/base_models + temperature: 0.7 + top_p: 0.95 + top_k: 40 + sys_id: SYS_1 + instruc_id: INSTR_SWEETP_1 # using a curated environment doesn't work because we need additional packages environment: # azureml://registries/azureml/environments/acpt-pytorch-2.0-cuda11.7/versions/21 image: mcr.microsoft.com/azureml/curated/acpt-pytorch-2.0-cuda11.7:21 @@ -25,6 +33,6 @@ environment: # azureml://registries/azureml/environments/acpt-pytorch-2.0-cuda11 # image: nvcr.io/nvidia/pytorch:23.10-py3 conda_file: conda.yml display_name: autodoc_prediction -compute: azureml:v100cluster -experiment_name: autodoc_prediction +compute: azureml:t4cluster +experiment_name: evaluation description: | \ No newline at end of file diff --git a/azureml/generate.yml b/azureml/generate.yml index c7df113..d849fcd 100644 --- a/azureml/generate.yml +++ b/azureml/generate.yml @@ -3,16 +3,26 @@ command: > python -m autora.doc.pipelines.main generate --model-path ${{inputs.model_dir}}/llama-2-7b-chat-hf --output ./outputs/output.txt + --sys-id ${{inputs.sys_id}} + --instruc-id ${{inputs.instruc_id}} + --param temperature=${{inputs.temperature}} + --param top_k=${{inputs.top_k}} + --param top_p=${{inputs.top_p}} autora/doc/pipelines/main.py code: ../src inputs: model_dir: type: uri_folder path: azureml://datastores/workspaceblobstore/paths/base_models + temperature: 0.7 + top_p: 0.95 + top_k: 40 + sys_id: SYS_1 + instruc_id: INSTR_SWEETP_1 environment: image: mcr.microsoft.com/azureml/curated/acpt-pytorch-2.0-cuda11.7:21 conda_file: conda.yml display_name: autodoc_prediction -compute: azureml:v100cluster -experiment_name: autodoc_prediction +compute: azureml:t4cluster +experiment_name: prediction description: | \ No newline at end of file diff --git a/notebooks/generate.ipynb b/notebooks/generate.ipynb new file mode 100644 index 0000000..06b3683 --- /dev/null +++ b/notebooks/generate.ipynb @@ -0,0 +1,126 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "from autora.doc.runtime.predict_hf import Predictor\n", + "from autora.doc.runtime.prompts import INSTR, SYS, InstructionPrompts, SystemPrompts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# model = \"../../models\" # if model has been previously downloaded via huggingface-cli\n", + "model = \"meta-llama/Llama-2-7b-chat-hf\"\n", + "pred = Predictor(model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "TEST_CODE = \"\"\"\n", + "from sweetpea import *\n", + "from sweetpea.primitives import *\n", + "\n", + "number_list = [125, 132, 139, 146, 160, 167, 174, 181]\n", + "letter_list = ['b', 'd', 'f', 'h', 's', 'u', 'w', 'y']\n", + "\n", + "number = Factor(\"number\", number_list)\n", + "letter = Factor(\"letter\", letter_list)\n", + "task = Factor(\"task\", [\"number task\", \"letter task\", \"free choice task\"])\n", + "\n", + "\n", + "def is_forced_trial_switch(task):\n", + " return (task[-1] == \"number task\" and task[0] == \"letter task\") or \\\n", + " (task[-1] == \"letter task\" and task[0] == \"number task\")\n", + "\n", + "\n", + "def is_forced_trial_repeat(task):\n", + " return (task[-1] == \"number task\" and task[0] == \"number task\") or \\\n", + " (task[-1] == \"letter task\" and task[0] == \"letter task\")\n", + "\n", + "\n", + "def is_free_trial_transition(task):\n", + " return task[-1] != \"free choice task\" and task[0] == \"free choice task\"\n", + "\n", + "\n", + "def is_free_trial_repeat(task):\n", + " return task[-1] == \"free choice task\" and task[0] == \"free choice task\"\n", + "\n", + "\n", + "def is_not_relevant_transition(task):\n", + " return not (is_forced_trial_repeat(task) or is_forced_trial_switch(task) or is_free_trial_repeat(\n", + " task) or is_free_trial_transition(task))\n", + "\n", + "\n", + "transit = Factor(\"task transition\", [\n", + " DerivedLevel(\"forced switch\", transition(is_forced_trial_switch, [task]), 3),\n", + " DerivedLevel(\"forced repeat\", transition(is_forced_trial_repeat, [task])),\n", + " DerivedLevel(\"free transition\", transition(is_free_trial_transition, [task]), 4),\n", + " DerivedLevel(\"free repeat\", transition(is_free_trial_repeat, [task]), 4),\n", + " DerivedLevel(\"forced first\", transition(is_not_relevant_transition, [task]), 4)\n", + "])\n", + "design = [letter, number, task, transit]\n", + "crossing = [[letter], [number], [transit]]\n", + "constraints = [MinimumTrials(256)]\n", + "\n", + "block = MultiCrossBlock(design, crossing, constraints)\n", + "\n", + "experiment = synthesize_trials(block, 1)\n", + "\n", + "save_experiments_csv(block, experiment, 'code_1_sequences/seq')\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output = pred.predict(\n", + " SYS[SystemPrompts.SYS_1],\n", + " INSTR[InstructionPrompts.INSTR_SWEETP_EXAMPLE],\n", + " [TEST_CODE],\n", + " temperature=0.05,\n", + " top_k=10,\n", + " num_ret_seq=3,\n", + ")[0]\n", + "for i, o in enumerate(output):\n", + " print(f\"******** Output {i} ********\\n{o}*************\\n\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "autodoc", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.18" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/autora/doc/pipelines/main.py b/src/autora/doc/pipelines/main.py index de7e906..5afc6bf 100644 --- a/src/autora/doc/pipelines/main.py +++ b/src/autora/doc/pipelines/main.py @@ -1,3 +1,4 @@ +import itertools import logging from timeit import default_timer as timer from typing import List @@ -16,13 +17,24 @@ logger = logging.getLogger(__name__) -@app.command() -def eval(data_file: str, model_path: str, sys_id: SystemPrompts, instruc_id: InstructionPrompts) -> List[str]: +@app.command(help="Evaluate model on a data file") +def eval( + data_file: str = typer.Argument(..., help="JSONL Data file to evaluate on"), + model_path: str = typer.Option("meta-llama/Llama-2-7b-chat-hf", help="Path to HF model"), + sys_id: SystemPrompts = typer.Option(SystemPrompts.SYS_1, help="System prompt ID"), + instruc_id: InstructionPrompts = typer.Option( + InstructionPrompts.INSTR_SWEETP_1, help="Instruction prompt ID" + ), + param: List[str] = typer.Option( + [], help="Additional float parameters to pass to the model as name=float pairs" + ), +) -> List[List[str]]: import jsonlines import mlflow mlflow.autolog() + param_dict = {pair[0]: float(pair[1]) for pair in [pair.split("=") for pair in param]} run = mlflow.active_run() sys_prompt = SYS[sys_id] @@ -33,6 +45,7 @@ def eval(data_file: str, model_path: str, sys_id: SystemPrompts, instruc_id: Ins logger.info(f"Active run_id: {run.info.run_id}") logger.info(f"running predict with {data_file}") logger.info(f"model path: {model_path}") + mlflow.log_params(param_dict) with jsonlines.open(data_file) as reader: items = [item for item in reader] @@ -41,16 +54,19 @@ def eval(data_file: str, model_path: str, sys_id: SystemPrompts, instruc_id: Ins pred = Predictor(model_path) timer_start = timer() - predictions = pred.predict(sys_prompt, instr_prompt, inputs) + predictions = pred.predict(sys_prompt, instr_prompt, inputs, **param_dict) timer_end = timer() pred_time = timer_end - timer_start mlflow.log_metric("prediction_time/doc", pred_time / (len(inputs))) for i in range(len(inputs)): mlflow.log_text(labels[i], f"label_{i}.txt") mlflow.log_text(inputs[i], f"input_{i}.py") - mlflow.log_text(predictions[i], f"prediction_{i}.txt") + for j in range(len(predictions[i])): + mlflow.log_text(predictions[i][j], f"prediction_{i}_{j}.txt") - tokens = pred.tokenize(predictions)["input_ids"] + # flatten predictions for counting tokens + predictions_flat = list(itertools.chain.from_iterable(predictions)) + tokens = pred.tokenize(predictions_flat)["input_ids"] total_tokens = sum([len(token) for token in tokens]) mlflow.log_metric("total_tokens", total_tokens) mlflow.log_metric("tokens/sec", total_tokens / pred_time) @@ -59,18 +75,28 @@ def eval(data_file: str, model_path: str, sys_id: SystemPrompts, instruc_id: Ins @app.command() def generate( - python_file: str, - model_path: str = "meta-llama/llama-2-7b-chat-hf", - output: str = "output.txt", - sys_id: SystemPrompts = SystemPrompts.SYS_1, - instruc_id: InstructionPrompts = InstructionPrompts.INSTR_SWEETP_1, + python_file: str = typer.Argument(..., help="Python file to generate documentation for"), + model_path: str = typer.Option("meta-llama/Llama-2-7b-chat-hf", help="Path to HF model"), + output: str = typer.Option("output.txt", help="Output file"), + sys_id: SystemPrompts = typer.Option(SystemPrompts.SYS_1, help="System prompt ID"), + instruc_id: InstructionPrompts = typer.Option( + InstructionPrompts.INSTR_SWEETP_1, help="Instruction prompt ID" + ), + param: List[str] = typer.Option( + [], help="Additional float parameters to pass to the model as name=float pairs" + ), ) -> None: + param_dict = {pair[0]: float(pair[1]) for pair in [pair.split("=") for pair in param]} + """ + Generate documentation from python file + """ with open(python_file, "r") as f: - inputs = [f.read()] + input = f.read() sys_prompt = SYS[sys_id] instr_prompt = INSTR[instruc_id] pred = Predictor(model_path) - predictions = pred.predict(sys_prompt, instr_prompt, inputs) + # grab first result since we only passed one input + predictions = pred.predict(sys_prompt, instr_prompt, [input], **param_dict)[0] assert len(predictions) == 1, f"Expected only one output, got {len(predictions)}" logger.info(f"Writing output to {output}") with open(output, "w") as f: diff --git a/src/autora/doc/runtime/predict_hf.py b/src/autora/doc/runtime/predict_hf.py index 23c484e..307c99e 100644 --- a/src/autora/doc/runtime/predict_hf.py +++ b/src/autora/doc/runtime/predict_hf.py @@ -27,22 +27,36 @@ def __init__(self, model_path: str): tokenizer=self.tokenizer, ) - def predict(self, sys: str, instr: str, inputs: List[str]) -> List[str]: - logger.info(f"Generating {len(inputs)} predictions") + def predict( + self, + sys: str, + instr: str, + inputs: List[str], + temperature: float = 0.6, + top_p: float = 0.95, + top_k: float = 40, + max_length: float = 2048, + num_ret_seq: float = 1, + ) -> List[List[str]]: + logger.info( + f"Generating {len(inputs)} predictions. Temperature: {temperature}, top_p: {top_p}, top_k: {top_k}, " + f"max_length: {max_length}" + ) prompts = [TEMP_LLAMA2.format(sys=sys, instr=instr, input=input) for input in inputs] - # TODO: Make these parameters configurable sequences = self.pipeline( prompts, do_sample=True, - temperature=0.6, - top_p=0.95, - top_k=40, - num_return_sequences=1, + temperature=temperature, + top_p=top_p, + top_k=int(top_k), + num_return_sequences=int(num_ret_seq), eos_token_id=self.tokenizer.eos_token_id, - max_length=2048, + max_length=int(max_length), ) - results = [Predictor.trim_prompt(sequence[0]["generated_text"]) for sequence in sequences] + results = [ + [Predictor.trim_prompt(seq["generated_text"]) for seq in sequence] for sequence in sequences + ] logger.info(f"Generated {len(results)} results") return results diff --git a/src/autora/doc/runtime/prompts.py b/src/autora/doc/runtime/prompts.py index 75019fc..4480fb8 100644 --- a/src/autora/doc/runtime/prompts.py +++ b/src/autora/doc/runtime/prompts.py @@ -23,6 +23,53 @@ INSTR_SWEETP_1 = """Please generate high-level two paragraph documentation for the following experiment. The first paragraph should explain the purpose and the second one the procedure, but don't use the word 'Paragraph'""" +# The following prompt uses an example (code, doc) to specify the desired behavior +EX_CODE = """ +from sweetpea import * + +color = Factor('color', ['red', 'green', 'blue', 'yellow']) +word = Factor('word', ['red', 'green', 'blue', 'yellow']) + +def is_congruent(word, color): + return (word == color) + +def is_not_congruent(word, color): + return not is_congruent(word, color) + +congruent = DerivedLevel('congruent', WithinTrial(is_congruent, [word, color])) +incongruent = DerivedLevel('incongruent', WithinTrial(is_not_congruent, [word, color])) + +congruency = Factor('congruency', [congruent, incongruent]) + +constraints = [MinimumTrials(48)] +design = [word, color, congruency] +crossing = [word, congruency] + +block = CrossBlock(design, crossing, constraints) + +experiment = synthesize_trials(block, 1) + +save_experiments_csv(block, experiment, 'code_1_sequences/seq') +""" + +EX_DOC = """There are two regular factors: color and word. The color factor consists of four levels: "red", "green", +"blue", and "yellow". The word factor also consists of the four levels: "red", "green", "blue", and "yellow". +There is another derived factor referred to as congruency. The congruency factor depends on the regular factors word +and color and has two levels: "congruent" and "incongruent". A trial is considered "congruent" if the word matches +the color, otherwise, it is considered "incongruent". We counterbalanced the word factor with the congruency factor. +All experiment sequences contained at least 48 trials.""" + +INSTR_SWEETP_EXAMPLE = f"""Consider the following experiment code: +--- +{EX_CODE} +--- +Here's a a good English description: +--- +{EX_DOC} +--- +Using the same style, please generate a high-level one paragraph description for the following experiment code: +""" + class SystemPrompts(str, Enum): SYS_1 = "SYS_1" @@ -30,7 +77,11 @@ class SystemPrompts(str, Enum): class InstructionPrompts(str, Enum): INSTR_SWEETP_1 = "INSTR_SWEETP_1" + INSTR_SWEETP_EXAMPLE = "INSTR_SWEETP_EXAMPLE" SYS = {SystemPrompts.SYS_1: SYS_1} -INSTR = {InstructionPrompts.INSTR_SWEETP_1: INSTR_SWEETP_1} +INSTR = { + InstructionPrompts.INSTR_SWEETP_1: INSTR_SWEETP_1, + InstructionPrompts.INSTR_SWEETP_EXAMPLE: INSTR_SWEETP_EXAMPLE, +} diff --git a/tests/test_main.py b/tests/test_main.py index 3e67bab..097e8c7 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -9,17 +9,19 @@ def test_predict() -> None: data = Path(__file__).parent.joinpath("../data/data.jsonl").resolve() - outputs = eval(str(data), TEST_HF_MODEL, SystemPrompts.SYS_1, InstructionPrompts.INSTR_SWEETP_1) + outputs = eval(str(data), TEST_HF_MODEL, SystemPrompts.SYS_1, InstructionPrompts.INSTR_SWEETP_1, []) assert len(outputs) == 3, "Expected 3 outputs" for output in outputs: - assert len(output) > 0, "Expected non-empty output" + assert len(output[0]) > 0, "Expected non-empty output" def test_generate() -> None: python_file = __file__ output = Path("output.txt") output.unlink(missing_ok=True) - generate(python_file, TEST_HF_MODEL, str(output), SystemPrompts.SYS_1, InstructionPrompts.INSTR_SWEETP_1) + generate( + python_file, TEST_HF_MODEL, str(output), SystemPrompts.SYS_1, InstructionPrompts.INSTR_SWEETP_1, [] + ) assert output.exists(), f"Expected output file {output} to exist" with open(str(output), "r") as f: assert len(f.read()) > 0, f"Expected non-empty output file {output}"