diff --git a/azureml/eval.yml b/azureml/eval.yml
index 1f31ea4..156351f 100644
--- a/azureml/eval.yml
+++ b/azureml/eval.yml
@@ -5,6 +5,7 @@ command: >
   --model-path ${{inputs.model_path}}
   --sys-id ${{inputs.sys_id}}
   --instruc-id ${{inputs.instruc_id}}
+  --param do_sample=${{inputs.do_sample}}
   --param temperature=${{inputs.temperature}}
   --param top_k=${{inputs.top_k}}
   --param top_p=${{inputs.top_p}}
@@ -18,9 +19,10 @@ inputs:
   # type: uri_folder
   # path: azureml://datastores/workspaceblobstore/paths/base_models
   model_path: meta-llama/Llama-2-7b-chat-hf
-  temperature: 0.7
+  temperature: 0.01
+  do_sample: 0
   top_p: 0.95
-  top_k: 40
+  top_k: 1
   sys_id: SYS_1
   instruc_id: INSTR_SWEETP_1
   # using a curated environment doesn't work because we need additional packages
diff --git a/azureml/generate.yml b/azureml/generate.yml
index 7e3f902..ce7eb59 100644
--- a/azureml/generate.yml
+++ b/azureml/generate.yml
@@ -5,6 +5,7 @@ command: >
   --output ./outputs/output.txt
   --sys-id ${{inputs.sys_id}}
   --instruc-id ${{inputs.instruc_id}}
+  --param do_sample=${{inputs.do_sample}}
   --param temperature=${{inputs.temperature}}
   --param top_k=${{inputs.top_k}}
   --param top_p=${{inputs.top_p}}
@@ -17,6 +18,7 @@ inputs:
   # path: azureml://datastores/workspaceblobstore/paths/base_models
   model_path: meta-llama/Llama-2-7b-chat-hf
   temperature: 0.7
+  do_sample: 0
   top_p: 0.95
   top_k: 40
   sys_id: SYS_1
diff --git a/data/autora/code1.txt b/data/autora/code1.txt
new file mode 100644
index 0000000..735bb20
--- /dev/null
+++ b/data/autora/code1.txt
@@ -0,0 +1,69 @@
+import numpy as np
+import pandas as pd
+import sympy as sp
+from autora.experiment_runner.synthetic.abstract.equation import equation_experiment
+from autora.experimentalist.random import random_pool
+from autora.state import StandardState, estimator_on_state, on_state
+from autora.theorist.bms import BMSRegressor
+from autora.variable import ValueType, Variable, VariableCollection
+
+####################################################################################
+## Define initial data
+####################################################################################
+
+#### Define variable data ####
+iv = Variable(name="x", value_range=(0, 2 * np.pi), allowed_values=np.linspace(0, 2 * np.pi, 30))
+dv = Variable(name="y", type=ValueType.REAL)
+variables = VariableCollection(independent_variables=[iv], dependent_variables=[dv])
+
+#### Define seed condition data ####
+conditions = random_pool(variables, num_samples=10, random_state=0)
+
+####################################################################################
+## Define experimentalist
+####################################################################################
+
+experimentalist = on_state(random_pool, output=["conditions"])
+
+####################################################################################
+## Define experiment runner
+####################################################################################
+
+sin_experiment = equation_experiment(
+    sp.simplify("sin(x)"), variables.independent_variables, variables.dependent_variables[0]
+)
+sin_runner = sin_experiment.experiment_runner
+
+experiment_runner = on_state(sin_runner, output=["experiment_data"])
+
+####################################################################################
+## Define theorist
+####################################################################################
+
+theorist = estimator_on_state(BMSRegressor(epochs=100))
+
+####################################################################################
+## Define state
+####################################################################################
+
+s = StandardState(
+    variables=variables, conditions=conditions, experiment_data=pd.DataFrame(columns=["x", "y"])
+)
+
+####################################################################################
+## Cycle through the state
+####################################################################################
+
+print("Pre-Defined State:")
+print(f"Number of datapoints collected: {len(s['experiment_data'])}")
+print(f"Derived models: {s['models']}")
+print("\n")
+
+for i in range(5):
+    s = experimentalist(s, num_samples=10, random_state=42)
+    s = experiment_runner(s, added_noise=1.0, random_state=42)
+    s = theorist(s)
+    print(f"\nCycle {i+1} Results:")
+    print(f"Number of datapoints collected: {len(s['experiment_data'])}")
+    print(f"Derived models: {s['models']}")
+    print("\n")
diff --git a/data/autora/data.jsonl b/data/autora/data.jsonl
new file mode 100644
index 0000000..b2a5702
--- /dev/null
+++ b/data/autora/data.jsonl
@@ -0,0 +1 @@
+{"instruction": "import numpy as np\nimport pandas as pd\nimport sympy as sp\nfrom autora.experiment_runner.synthetic.abstract.equation import equation_experiment\nfrom autora.experimentalist.random import random_pool\nfrom autora.state import StandardState, estimator_on_state, on_state\nfrom autora.theorist.bms import BMSRegressor\nfrom autora.variable import ValueType, Variable, VariableCollection\n\n####################################################################################\n## Define initial data\n####################################################################################\n\n#### Define variable data ####\niv = Variable(name=\"x\", value_range=(0, 2 * np.pi), allowed_values=np.linspace(0, 2 * np.pi, 30))\ndv = Variable(name=\"y\", type=ValueType.REAL)\nvariables = VariableCollection(independent_variables=[iv], dependent_variables=[dv])\n\n#### Define seed condition data ####\nconditions = random_pool(variables, num_samples=10, random_state=0)\n\n####################################################################################\n## Define experimentalist\n####################################################################################\n\nexperimentalist = on_state(random_pool, output=[\"conditions\"])\n\n####################################################################################\n## Define experiment runner\n####################################################################################\n\nsin_experiment = equation_experiment(\n    sp.simplify(\"sin(x)\"), variables.independent_variables, variables.dependent_variables[0]\n)\nsin_runner = sin_experiment.experiment_runner\n\nexperiment_runner = on_state(sin_runner, output=[\"experiment_data\"])\n\n####################################################################################\n## Define theorist\n####################################################################################\n\ntheorist = estimator_on_state(BMSRegressor(epochs=100))\n\n####################################################################################\n## Define state\n####################################################################################\n\ns = StandardState(\n    variables=variables, conditions=conditions, experiment_data=pd.DataFrame(columns=[\"x\", \"y\"])\n)\n\n####################################################################################\n## Cycle through the state\n####################################################################################\n\nprint(\"Pre-Defined State:\")\nprint(f\"Number of datapoints collected: {len(s['experiment_data'])}\")\nprint(f\"Derived models: {s['models']}\")\nprint(\"\\n\")\n\nfor i in range(5):\n    s = experimentalist(s, num_samples=10, random_state=42)\n    s = experiment_runner(s, added_noise=1.0, random_state=42)\n    s = theorist(s)\n    print(f\"\\nCycle {i+1} Results:\")\n    print(f\"Number of datapoints collected: {len(s['experiment_data'])}\")\n    print(f\"Derived models: {s['models']}\")\n    print(\"\\n\")\n", "output": "The following example demonstrates how to use AutoRA to automate the process of model discovery, experimental design, and data collection.\n\nThe discovery problem is defined by a single independent variable $x \\in [0, 2 \\pi]$ and dependent variable $y$.\nThe experiment amounts to a simple sine wave, $y = \\sin(x)$, which is the model we are trying to discover.\n\nThe discovery cycle iterates between the experimentalist, experiment runner, and theorist. Here, we use a \"random\" experimentalist, which samples novel experimental conditions for $x$ every cycle.\nThe experiment runner then collects data for the corresponding $y$ values. Finally, the theorist uses a [Bayesian Machine Scientist](https://autoresearch.github.io/autora/user-guide/theorists/bms/) (BMS; Guimerà et al., in Science Advances) to identify a scientific model that explains the data."}
diff --git a/data/autora/text1.txt b/data/autora/text1.txt
new file mode 100644
index 0000000..2da4e3b
--- /dev/null
+++ b/data/autora/text1.txt
@@ -0,0 +1,7 @@
+The following example demonstrates how to use AutoRA to automate the process of model discovery, experimental design, and data collection.
+
+The discovery problem is defined by a single independent variable $x \in [0, 2 \pi]$ and dependent variable $y$.
+The experiment amounts to a simple sine wave, $y = \sin(x)$, which is the model we are trying to discover.
+
+The discovery cycle iterates between the experimentalist, experiment runner, and theorist. Here, we use a "random" experimentalist, which samples novel experimental conditions for $x$ every cycle.
+The experiment runner then collects data for the corresponding $y$ values. Finally, the theorist uses a [Bayesian Machine Scientist](https://autoresearch.github.io/autora/user-guide/theorists/bms/) (BMS; Guimerà et al., in Science Advances) to identify a scientific model that explains the data.
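The two files above become a single Alpaca-style training record: the AutoRA program is stored under "instruction" and the prose description under "output". A minimal sketch of reading that record back, assuming the jsonlines package that the import_data command below also uses (the path is just the file added in this diff):

# Read data/autora/data.jsonl: one JSON object per line with "instruction"/"output" keys.
import jsonlines

with jsonlines.open("data/autora/data.jsonl") as reader:
    for record in reader:
        assert set(record) == {"instruction", "output"}
        print(record["instruction"].splitlines()[0])  # first line of the AutoRA code
        print(record["output"].splitlines()[0])       # first line of the description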
diff --git a/data/data.jsonl b/data/sweetpea/data.jsonl
similarity index 100%
rename from data/data.jsonl
rename to data/sweetpea/data.jsonl
diff --git a/src/autora/doc/pipelines/main.py b/src/autora/doc/pipelines/main.py
index 5afc6bf..a553153 100644
--- a/src/autora/doc/pipelines/main.py
+++ b/src/autora/doc/pipelines/main.py
@@ -108,6 +108,21 @@ def import_model(model_name: str) -> None:
     pass
 
 
+@app.command()
+def import_data(code_file: str, text_file: str, output_file: str = "data.jsonl") -> None:
+    from pathlib import Path
+
+    import jsonlines
+
+    # alpaca jsonl format:
+    def read_text(file: str) -> str:
+        return Path(file).read_text()
+
+    d = {"instruction": read_text(code_file), "output": read_text(text_file)}
+    with jsonlines.open(output_file, "a") as file:
+        file.write(d)
+
+
 if __name__ == "__main__":
     logger.info(f"Torch version: {torch.__version__} , Cuda available: {torch.cuda.is_available()}")
diff --git a/src/autora/doc/runtime/predict_hf.py b/src/autora/doc/runtime/predict_hf.py
index 307c99e..85e6919 100644
--- a/src/autora/doc/runtime/predict_hf.py
+++ b/src/autora/doc/runtime/predict_hf.py
@@ -32,20 +32,23 @@ def predict(
         sys: str,
         instr: str,
         inputs: List[str],
-        temperature: float = 0.6,
+        do_sample: float = 0.0,
+        temperature: float = 0.01,
         top_p: float = 0.95,
-        top_k: float = 40,
+        top_k: float = 1,
         max_length: float = 2048,
         num_ret_seq: float = 1,
     ) -> List[List[str]]:
+        # convert to bool in case it came in as a generate float param from the CLI
+        do_sample = bool(do_sample)
         logger.info(
-            f"Generating {len(inputs)} predictions. Temperature: {temperature}, top_p: {top_p}, top_k: {top_k}, "
-            f"max_length: {max_length}"
+            f"Generating {len(inputs)} predictions. do_sample: {do_sample}, temperature: {temperature}, top_p: {top_p},"
+            f" top_k: {top_k}, max_length: {max_length}"
         )
         prompts = [TEMP_LLAMA2.format(sys=sys, instr=instr, input=input) for input in inputs]
         sequences = self.pipeline(
             prompts,
-            do_sample=True,
+            do_sample=do_sample,
             temperature=temperature,
             top_p=top_p,
             top_k=int(top_k),
diff --git a/tests/test_main.py b/tests/test_main.py
index 097e8c7..f5a283e 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -1,6 +1,6 @@
 from pathlib import Path
 
-from autora.doc.pipelines.main import eval, generate
+from autora.doc.pipelines.main import eval, generate, import_data
 from autora.doc.runtime.prompts import InstructionPrompts, SystemPrompts
 
 # dummy HF model for testing
@@ -8,7 +8,7 @@
 
 
 def test_predict() -> None:
-    data = Path(__file__).parent.joinpath("../data/data.jsonl").resolve()
+    data = Path(__file__).parent.joinpath("../data/sweetpea/data.jsonl").resolve()
     outputs = eval(str(data), TEST_HF_MODEL, SystemPrompts.SYS_1, InstructionPrompts.INSTR_SWEETP_1, [])
     assert len(outputs) == 3, "Expected 3 outputs"
     for output in outputs:
@@ -25,3 +25,12 @@ def test_generate() -> None:
     assert output.exists(), f"Expected output file {output} to exist"
     with open(str(output), "r") as f:
         assert len(f.read()) > 0, f"Expected non-empty output file {output}"
+
+
+def test_import(tmp_path: Path) -> None:
+    data = tmp_path.joinpath("data.jsonl")
+    code = Path(__file__).parent.joinpath("../data/autora/code1.txt").resolve()
+    text = Path(__file__).parent.joinpath("../data/autora/text1.txt").resolve()
+    import_data(str(code), str(text), str(data))
+    new_lines = data.read_text().splitlines()
+    assert len(new_lines) == 1, "Expected one new line"
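A usage sketch for the new import_data command; only the import_data signature comes from this diff, and the paths simply point at the files it adds. Each call appends one {"instruction": <code>, "output": <text>} line, so repeated calls over further code/text pairs grow the jsonl training set incrementally. If the app object is a Typer app exposed as a console script, the same operation is likely available on the command line as an import-data subcommand, though that entry point is not shown in this diff.

from autora.doc.pipelines.main import import_data

# Pair the AutoRA example program with its natural-language description and
# append the resulting Alpaca-style record to the jsonl file.
import_data("data/autora/code1.txt", "data/autora/text1.txt", "data/autora/data.jsonl")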