Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Do deterministic inference #25

Merged
merged 7 commits into from
Jan 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions azureml/eval.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ command: >
--model-path ${{inputs.model_path}}
--sys-id ${{inputs.sys_id}}
--instruc-id ${{inputs.instruc_id}}
--param do_sample=${{inputs.do_sample}}
--param temperature=${{inputs.temperature}}
--param top_k=${{inputs.top_k}}
--param top_p=${{inputs.top_p}}
Expand All @@ -18,9 +19,10 @@ inputs:
# type: uri_folder
# path: azureml://datastores/workspaceblobstore/paths/base_models
model_path: meta-llama/Llama-2-7b-chat-hf
temperature: 0.7
temperature: 0.01
do_sample: 0
top_p: 0.95
top_k: 40
top_k: 1
sys_id: SYS_1
instruc_id: INSTR_SWEETP_1
# using a curated environment doesn't work because we need additional packages
Expand Down
2 changes: 2 additions & 0 deletions azureml/generate.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ command: >
--output ./outputs/output.txt
--sys-id ${{inputs.sys_id}}
--instruc-id ${{inputs.instruc_id}}
--param do_sample=${{inputs.do_sample}}
--param temperature=${{inputs.temperature}}
--param top_k=${{inputs.top_k}}
--param top_p=${{inputs.top_p}}
Expand All @@ -17,6 +18,7 @@ inputs:
# path: azureml://datastores/workspaceblobstore/paths/base_models
model_path: meta-llama/Llama-2-7b-chat-hf
temperature: 0.7
do_sample: 0
top_p: 0.95
top_k: 40
sys_id: SYS_1
Expand Down
69 changes: 69 additions & 0 deletions data/autora/code1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import numpy as np
import pandas as pd
import sympy as sp
from autora.experiment_runner.synthetic.abstract.equation import equation_experiment
from autora.experimentalist.random import random_pool
from autora.state import StandardState, estimator_on_state, on_state
from autora.theorist.bms import BMSRegressor
from autora.variable import ValueType, Variable, VariableCollection

# Example closed-loop discovery script: an experimentalist proposes conditions for x,
# an experiment runner produces y observations from a synthetic sin(x) experiment, and
# a theorist (BMSRegressor) fits a model to the collected data. The loop at the bottom
# repeats these three steps five times and prints the state after each cycle.

####################################################################################
## Define initial data
####################################################################################

#### Define variable data ####
# Independent variable x on [0, 2*pi], restricted to 30 evenly spaced allowed values.
iv = Variable(name="x", value_range=(0, 2 * np.pi), allowed_values=np.linspace(0, 2 * np.pi, 30))
# Dependent variable y, declared as real-valued.
dv = Variable(name="y", type=ValueType.REAL)
variables = VariableCollection(independent_variables=[iv], dependent_variables=[dv])

#### Define seed condition data ####
# Initial conditions: 10 samples requested from random_pool with a fixed seed (0).
conditions = random_pool(variables, num_samples=10, random_state=0)

####################################################################################
## Define experimentalist
####################################################################################

# Wrap random_pool so it can be called with the state object.
# presumably the result is stored in the state's "conditions" field — confirm against on_state
experimentalist = on_state(random_pool, output=["conditions"])

####################################################################################
## Define experiment runner
####################################################################################

# Synthetic experiment whose ground truth is y = sin(x), built from a sympy expression.
sin_experiment = equation_experiment(
    sp.simplify("sin(x)"), variables.independent_variables, variables.dependent_variables[0]
)
sin_runner = sin_experiment.experiment_runner

# Wrap the runner so it can be called with the state object.
# presumably the result is stored in the state's "experiment_data" field — confirm against on_state
experiment_runner = on_state(sin_runner, output=["experiment_data"])

####################################################################################
## Define theorist
####################################################################################

# Theorist: a BMSRegressor configured with epochs=100, adapted to operate on the state.
theorist = estimator_on_state(BMSRegressor(epochs=100))

####################################################################################
## Define state
####################################################################################

# Start from the seed conditions and an empty experiment_data frame with columns x and y.
s = StandardState(
    variables=variables, conditions=conditions, experiment_data=pd.DataFrame(columns=["x", "y"])
)

####################################################################################
## Cycle through the state
####################################################################################

# Report the state before any cycle has run.
print("Pre-Defined State:")
print(f"Number of datapoints collected: {len(s['experiment_data'])}")
print(f"Derived models: {s['models']}")
print("\n")

# Five discovery cycles: propose conditions -> run experiment -> fit model -> report.
# Each step returns a new state that is fed into the next step.
# NOTE(review): the same random_state=42 is passed on every cycle; if random_pool and the
# runner are deterministic for a given seed, each cycle presumably repeats the same
# conditions and noise — confirm this is intended.
for i in range(5):
    s = experimentalist(s, num_samples=10, random_state=42)
    s = experiment_runner(s, added_noise=1.0, random_state=42)
    s = theorist(s)
    print(f"\nCycle {i+1} Results:")
    print(f"Number of datapoints collected: {len(s['experiment_data'])}")
    print(f"Derived models: {s['models']}")
    print("\n")
1 change: 1 addition & 0 deletions data/autora/data.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"instruction": "import numpy as np\nimport pandas as pd\nimport sympy as sp\nfrom autora.experiment_runner.synthetic.abstract.equation import equation_experiment\nfrom autora.experimentalist.random import random_pool\nfrom autora.state import StandardState, estimator_on_state, on_state\nfrom autora.theorist.bms import BMSRegressor\nfrom autora.variable import ValueType, Variable, VariableCollection\n\n####################################################################################\n## Define initial data\n####################################################################################\n\n#### Define variable data ####\niv = Variable(name=\"x\", value_range=(0, 2 * np.pi), allowed_values=np.linspace(0, 2 * np.pi, 30))\ndv = Variable(name=\"y\", type=ValueType.REAL)\nvariables = VariableCollection(independent_variables=[iv], dependent_variables=[dv])\n\n#### Define seed condition data ####\nconditions = random_pool(variables, num_samples=10, random_state=0)\n\n####################################################################################\n## Define experimentalist\n####################################################################################\n\nexperimentalist = on_state(random_pool, output=[\"conditions\"])\n\n####################################################################################\n## Define experiment runner\n####################################################################################\n\nsin_experiment = equation_experiment(\n sp.simplify(\"sin(x)\"), variables.independent_variables, variables.dependent_variables[0]\n)\nsin_runner = sin_experiment.experiment_runner\n\nexperiment_runner = on_state(sin_runner, output=[\"experiment_data\"])\n\n####################################################################################\n## Define theorist\n####################################################################################\n\ntheorist = 
estimator_on_state(BMSRegressor(epochs=100))\n\n####################################################################################\n## Define state\n####################################################################################\n\ns = StandardState(\n variables=variables, conditions=conditions, experiment_data=pd.DataFrame(columns=[\"x\", \"y\"])\n)\n\n####################################################################################\n## Cycle through the state\n####################################################################################\n\nprint(\"Pre-Defined State:\")\nprint(f\"Number of datapoints collected: {len(s['experiment_data'])}\")\nprint(f\"Derived models: {s['models']}\")\nprint(\"\\n\")\n\nfor i in range(5):\n s = experimentalist(s, num_samples=10, random_state=42)\n s = experiment_runner(s, added_noise=1.0, random_state=42)\n s = theorist(s)\n print(f\"\\nCycle {i+1} Results:\")\n print(f\"Number of datapoints collected: {len(s['experiment_data'])}\")\n print(f\"Derived models: {s['models']}\")\n print(\"\\n\")\n", "output": "The following example demonstrates how to use AutoRA to automate the process of model discovery, experimental design, and data collection. \n\nThe discovery problem is defined by a single independent variable $x \\in [0, 2 \\pi]$ and dependent variable $y$.\nThe experiment amounts to a simple sine wave, $y = \\sin(x)$, which is the model we are trying to discover.\n\nThe discovery cycle iterates between the experimentalist, experiment runner, and theorist. Here, we us a \"random\" experimentalist, which samples novel experimental conditions for $x$ every cycle. \nThe experiment runner then collects data for the corresponding $y$ values. Finally, the theorist uses a [Bayesian Machine Scientist](https://autoresearch.github.io/autora/user-guide/theorists/bms/) (BMS; Guimerà et al., in Science Advances) to identify a scientific model that explains the data. "}
7 changes: 7 additions & 0 deletions data/autora/text1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
The following example demonstrates how to use AutoRA to automate the process of model discovery, experimental design, and data collection.

The discovery problem is defined by a single independent variable $x \in [0, 2 \pi]$ and dependent variable $y$.
The experiment amounts to a simple sine wave, $y = \sin(x)$, which is the model we are trying to discover.

The discovery cycle iterates between the experimentalist, experiment runner, and theorist. Here, we use a "random" experimentalist, which samples novel experimental conditions for $x$ every cycle.
The experiment runner then collects data for the corresponding $y$ values. Finally, the theorist uses a [Bayesian Machine Scientist](https://autoresearch.github.io/autora/user-guide/theorists/bms/) (BMS; Guimerà et al., in Science Advances) to identify a scientific model that explains the data.
File renamed without changes.
15 changes: 15 additions & 0 deletions src/autora/doc/pipelines/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,21 @@ def import_model(model_name: str) -> None:
pass


@app.command()
def import_data(code_file: str, text_file: str, output_file: str = "data.jsonl") -> None:
    """Append one code/documentation pair to a JSON Lines dataset file.

    Args:
        code_file: Path of the file whose full text becomes the record's "instruction".
        text_file: Path of the file whose full text becomes the record's "output".
        output_file: Path of the JSON Lines file to append to. It is opened in
            append mode, so existing records are kept.
    """
    from pathlib import Path

    import jsonlines

    def read_text(file: str) -> str:
        # Decode explicitly as UTF-8 so non-ASCII characters in the inputs are read the
        # same way on every platform instead of depending on the locale's default encoding.
        return Path(file).read_text(encoding="utf-8")

    # alpaca jsonl format: one object per line with "instruction" and "output" keys
    record = {"instruction": read_text(code_file), "output": read_text(text_file)}
    with jsonlines.open(output_file, "a") as writer:
        writer.write(record)


if __name__ == "__main__":
logger.info(f"Torch version: {torch.__version__} , Cuda available: {torch.cuda.is_available()}")

Expand Down
13 changes: 8 additions & 5 deletions src/autora/doc/runtime/predict_hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,23 @@ def predict(
sys: str,
instr: str,
inputs: List[str],
temperature: float = 0.6,
do_sample: float = 0.0,
temperature: float = 0.01,
top_p: float = 0.95,
top_k: float = 40,
top_k: float = 1,
max_length: float = 2048,
num_ret_seq: float = 1,
) -> List[List[str]]:
# convert to bool in case it came in as a generate float param from the CLI
do_sample = bool(do_sample)
logger.info(
f"Generating {len(inputs)} predictions. Temperature: {temperature}, top_p: {top_p}, top_k: {top_k}, "
f"max_length: {max_length}"
f"Generating {len(inputs)} predictions. do_sample: {do_sample}, temperature: {temperature}, top_p: {top_p},"
f" top_k: {top_k}, max_length: {max_length}"
)
prompts = [TEMP_LLAMA2.format(sys=sys, instr=instr, input=input) for input in inputs]
sequences = self.pipeline(
prompts,
do_sample=True,
do_sample=do_sample,
temperature=temperature,
top_p=top_p,
top_k=int(top_k),
Expand Down
13 changes: 11 additions & 2 deletions tests/test_main.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
from pathlib import Path

from autora.doc.pipelines.main import eval, generate
from autora.doc.pipelines.main import eval, generate, import_data
from autora.doc.runtime.prompts import InstructionPrompts, SystemPrompts

# dummy HF model for testing
TEST_HF_MODEL = "hf-internal-testing/tiny-random-FalconForCausalLM"


def test_predict() -> None:
data = Path(__file__).parent.joinpath("../data/data.jsonl").resolve()
data = Path(__file__).parent.joinpath("../data/sweetpea/data.jsonl").resolve()
outputs = eval(str(data), TEST_HF_MODEL, SystemPrompts.SYS_1, InstructionPrompts.INSTR_SWEETP_1, [])
assert len(outputs) == 3, "Expected 3 outputs"
for output in outputs:
Expand All @@ -25,3 +25,12 @@ def test_generate() -> None:
assert output.exists(), f"Expected output file {output} to exist"
with open(str(output), "r") as f:
assert len(f.read()) > 0, f"Expected non-empty output file {output}"


def test_import(tmp_path: Path) -> None:
    """Importing one code/text pair into a fresh file must yield exactly one JSONL line."""
    # Arrange: fixture inputs shipped with the repository, output in pytest's tmp dir.
    tests_dir = Path(__file__).parent
    code_path = (tests_dir / "../data/autora/code1.txt").resolve()
    text_path = (tests_dir / "../data/autora/text1.txt").resolve()
    out_path = tmp_path / "data.jsonl"

    # Act
    import_data(str(code_path), str(text_path), str(out_path))

    # Assert: the output file did not exist before, so every line in it is new.
    written = out_path.read_text().splitlines()
    assert len(written) == 1, "Expected one new line"