Skip to content

Commit

Permalink
Merge pull request #49 from e-p-armstrong/usability-overhaul
Browse files Browse the repository at this point in the history
Augmentoolkit 3.0
  • Loading branch information
e-p-armstrong authored Sep 12, 2024
2 parents aced615 + ff07fe1 commit 228114c
Show file tree
Hide file tree
Showing 195 changed files with 12,504 additions and 170,144 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
.DS_Store
**/*.pyc
output*
hidden*
log.txt
output
**/simplified_data*
Expand All @@ -24,4 +25,5 @@ website_scrape*
twitter_data.*
_model_training_configs/autometa*
_model_training_configs/intelorca*
editing_output*
editing_output*
**/*EIGHTY_SIX*.txt
5 changes: 5 additions & 0 deletions BOILERPLATE_TO_MAKE_YOUR_OWN_PIPELINE/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# This is an empty test pipeline that exists to give you a starting point, with Augmentoolkit's conventions and abstractions already in place, to start building out your own pipelines for your own usecases.

Please consider opening a PR and contributing it if you make something cool! Or just use it yourself; that is OK too.

# If you run into problems while making a pipeline, consider creating an issue!
25 changes: 25 additions & 0 deletions BOILERPLATE_TO_MAKE_YOUR_OWN_PIPELINE/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Boilerplate pipeline configuration. processing.py reads the PHASES, SYSTEM and
# PATH sections shown here; the API values are imported from the pipeline's steps module.
API:
  # Never commit real credentials: keep placeholders here and fill them in locally.
  API_KEY_A: key
  API_KEY_B: key
  BASE_URL_A: https://api.together.xyz
  BASE_URL_B: https://api.fireworks.ai/inference/v1
  LOGICAL_MODEL_A: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo
  LOGICAL_MODEL_B: accounts/fireworks/models/llama-v3p1-8b-instruct
  MODE_A: api
  MODE_B: api
PATH:
  DEFAULT_PROMPTS: ./prompts
  INPUT: ./raw_txt_input
  OUTPUT: ./output
  PROMPTS: ./prompts
PHASES:
  PHASE_INDEX: 2
  WORK_IN_PHASES: True
SYSTEM:
  COMPLETION_MODE: False
  CONCURRENCY_LIMIT: 3
  STOP: True
  SUBSET_SIZE: 3
  USE_MIN_P: False
  USE_SUBSET: True # you will probably want to have use_subset on during testing and development to save money.
  CHUNK_SIZE: 2000
90 changes: 90 additions & 0 deletions BOILERPLATE_TO_MAKE_YOUR_OWN_PIPELINE/processing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import random
import traceback
from augmentoolkit.generation_functions.engine_wrapper_class import EngineWrapper
from augmentoolkit.utils.write_output_to_file import write_output_to_file
from BOILERPLATE_TO_MAKE_YOUR_OWN_PIPELINE.steps import API_KEY_A, API_KEY_B, BASE_URL_A, BASE_URL_B, CONCURRENCY_LIMIT, LOGICAL_MODEL_A, LOGICAL_MODEL_B, MODE_A, MODE_B, add_key, chunking_algorithm, count_tokens, make_id


import nltk
from tqdm import asyncio as tqdmasyncio


import asyncio
import glob
import logging
import os
import sys
import time
import yaml

# Pipeline settings are read from the YAML file named by the CONFIG_PATH
# environment variable. A missing variable raises KeyError at import time.
config_path = os.environ["CONFIG_PATH"]
# Read the file as UTF-8 explicitly so parsing does not depend on the
# platform's default locale encoding (which is not UTF-8 on every system).
with open(config_path, "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

# Values are coerced so later code can rely on plain bool/int types.
WORK_IN_PHASES = bool(config["PHASES"]["WORK_IN_PHASES"])
PHASE_INDEX = int(config["PHASES"]["PHASE_INDEX"])
USE_SUBSET = bool(config["SYSTEM"]["USE_SUBSET"])
SUBSET_SIZE = int(config["SYSTEM"]["SUBSET_SIZE"])
CHUNK_SIZE = int(config["SYSTEM"]["CHUNK_SIZE"])
INPUT = config["PATH"]["INPUT"]


async def main():
    """Run the boilerplate pipeline from input discovery to a sample printout.

    Steps, in order:
      1. Recursively collect ``.txt`` and ``.md`` paths under ``INPUT``.
      2. Build the two ``EngineWrapper`` clients from the A/B API settings.
      3. Pass each path to ``chunking_algorithm`` with ``CHUNK_SIZE`` as the
         token limit and gather the returned chunks.
      4. When ``USE_SUBSET`` is set, keep only the first ``SUBSET_SIZE`` chunks.
      5. Run ``add_key`` on every chunk, with at most ``CONCURRENCY_LIMIT``
         tasks awaited at once.
      6. Print the elapsed time and, if any exist, the first collected result.
    """
    # NOTE Load the source texts
    print("Welcome to your test pipeline!")
    print(f"Input folder: {INPUT}")
    start_time = time.time()
    print("Begun")

    # Set up rate-limit-conscious functions
    semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT)

    async def run_task_with_limit(task):
        # Each awaitable only starts being awaited once a semaphore slot is free.
        async with semaphore:
            return await task

    extensions = [".txt", ".md"]

    source_texts = []
    for extension in extensions:
        path = f"{INPUT}/**/*" + extension
        source_texts.extend(glob.glob(path, recursive=True))

    if source_texts:
        print(source_texts)
    else:
        print(f"No source texts found in: {INPUT}")

    # NOTE Initialize the Engine (or API client)
    engine_wrapper = EngineWrapper(
        model=LOGICAL_MODEL_A,
        api_key=API_KEY_A,
        base_url=BASE_URL_A,
        mode=MODE_A,
    )

    engine_wrapper_large = EngineWrapper(
        model=LOGICAL_MODEL_B,
        api_key=API_KEY_B,
        base_url=BASE_URL_B,
        mode=MODE_B,
    )

    sentence_chunks = []
    for source_text in source_texts:
        sentence_chunks += chunking_algorithm(source_text, max_token_length=CHUNK_SIZE)

    # Honor the subset settings from the config: without this, USE_SUBSET and
    # SUBSET_SIZE are read but ignored and every chunk is sent to the API.
    if USE_SUBSET:
        sentence_chunks = sentence_chunks[:SUBSET_SIZE]

    # NOTE Generate the data
    # output_list is handed to add_key; presumably add_key appends its result
    # there (verify against the steps module).
    output_list = []
    data_generations_tasks = [
        add_key(
            input_data=chunk,
            engine_wrapper=engine_wrapper_large,
            idx=idx,
            output_list=output_list,
        )
        for idx, chunk in enumerate(sentence_chunks)
    ]
    coroutines = [run_task_with_limit(task) for task in data_generations_tasks]
    for future in tqdmasyncio.tqdm.as_completed(coroutines):
        await future

    print(f"Time taken: {time.time() - start_time}")
    # Guard the sample printout: with no inputs (or no successful generations)
    # output_list is empty and indexing it would raise IndexError.
    if output_list:
        print("You generated some data! Check the output folder for the results.")
        print("here's one of the results: ")
        print(output_list[0])
    else:
        print("No results were generated.")

# Script entry point: start an event loop and run the pipeline to completion.
# There is no __main__ guard, so this also executes if the module is imported.
asyncio.run(main())
44 changes: 44 additions & 0 deletions BOILERPLATE_TO_MAKE_YOUR_OWN_PIPELINE/prompts/test_prompt.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
- role: system
content: |
You are an expert poet. Write a poem inspired by what you are shown.
- role: user
content: |
Input:
When life gives you lemons, don't make lemonade. Make life TAKE THE LEMONS BACK. GET MAD!
- role: assistant
content: | # You should always have few-shot examples
Defiance's Zest
In life's orchard, sour fruits abound,
Yellow globes fall, scatter the ground.
Convention whispers, "Sweeten the tart,"
But rebellion stirs within my heart.
No sugared drink shall I concoct,
No lemonade stand will be stocked.
These citrus spheres, I'll not accept,
A different path, I have now stepped.
With fire in eyes and strength in hand,
I gather lemons from this land.
Back to life, I'll hurl each one,
A citrus revolt has now begun.
"Take them back!" I shout with might,
"Your sour gifts, I'll no more sight!"
For I am not a passive receiver,
But a bold and brazen achiever.
Let life know, I write my tale,
No bitter fruit shall make me quail.
In face of sourness, I'll stand tall,
And rewrite the rules of nature's call.
So when life's lemons come your way,
Remember you have the power to say:
"Not today, life, not this time,"
And make defiance your paradigm.
- role: user
content: | # Note how interpolation is done here. The template names line up with the keys of the input object.
Passage:
{chunk}
Loading

0 comments on commit 228114c

Please sign in to comment.