chore: Refactor code to remove unnecessary conditional statement, add readme, and tiny programs example
ncoop57 committed Sep 11, 2024
1 parent 34bf0b7 commit 9e46486
Showing 6 changed files with 1,231 additions and 72 deletions.
69 changes: 63 additions & 6 deletions README.md
@@ -3,9 +3,6 @@

<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->

This file will become your README and also the index of your
documentation.

## Developer Guide

If you are new to using `nbdev` here are some useful pointers to get you
@@ -58,10 +55,70 @@ find package manager specific guidelines on

## How to use

First you need to define the structure of the data you want to generate. `instructor`, the library that fastdata uses to generate data, requires you to define this schema as a Pydantic model.

``` python
from pydantic import BaseModel, Field

class Translation(BaseModel):
    english: str = Field(description="An english phrase")
    german: str = Field(description="An equivalent german phrase that is a translation of the english phrase")
```
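
Since `instructor` sends the field descriptions to the model along with the schema, more precise descriptions tend to yield better data. A single record conforming to this schema looks like the following (a hand-written illustration, not generated output):

``` python
example = Translation(english="Hello, world!", german="Hallo, Welt!")
print(example)
# english='Hello, world!' german='Hallo, Welt!'
```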

Next, you need to define the prompt that will be used to generate the
data and any inputs you want to pass to the prompt.

``` python
prompt_template = """\
Generate English and German translations on the following topic:
{topic}
"""

inputs = [{"topic": "Otters are cute"}, {"topic": "I love programming"}]
```
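
Each dictionary in `inputs` fills the placeholders in the template, producing one prompt per dictionary. Conceptually the substitution behaves like Python's `str.format` (an illustrative sketch, not fastdata's internal code):

``` python
for i in inputs:
    print(prompt_template.format(**i))
# Generate English and German translations on the following topic:
# Otters are cute
# ...
```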

Finally, we can generate some data with fastdata.

> [!NOTE]
>
> We only support Anthropic models at the moment. Therefore, make sure
> you have an API key for the model you want to use and the proper
> environment variables set or pass the api key to the
> [`FastData`](https://AnswerDotAI.github.io/fastdata/core.html#fastdata)
> class `FastData(api_key="sk-ant-api03-...")`.
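
For example, a minimal sketch of supplying the key via the environment, assuming the standard `ANTHROPIC_API_KEY` variable read by Anthropic's SDK:

``` python
import os

# Assumption: Anthropic's SDK reads this environment variable;
# alternatively, pass the key explicitly with FastData(api_key="sk-ant-api03-...")
os.environ["ANTHROPIC_API_KEY"] = "sk-ant-api03-..."
```
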
``` python
from fastdata.core import FastData

import pprint

# Create a pretty printer object with custom settings
pp = pprint.PrettyPrinter(indent=4, width=100, compact=False)

fast_data = FastData()
translations = fast_data.generate(
    prompt_template=prompt_template,
    inputs=inputs,
    response_model=Translation,
    model="claude-3-haiku-20240307"
)

# Pretty print the translations
print("Translations:")
pp.pprint(translations)
```

100%|██████████| 2/2 [00:00<00:00, 2.21it/s]

Translations:
[ {'english': 'Otters are cute', 'german': 'Otter sind süß'},
{'english': 'I love programming', 'german': 'Ich liebe das Programmieren'}]
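
The results come back as plain dictionaries keyed by the schema's fields, so you can index into them directly:

``` python
# Each generated record is a dict with the schema's fields
print(translations[0]["german"])  # e.g. 'Otter sind süß'
```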

If you’d like to see how best to generate data with fastdata, check out
our blog post [here](https://www.answer.ai/blog/introducing-fastdata)
and some of the examples in the
[examples](https://github.com/AnswerDotAI/fastdata/tree/main/examples)
directory.
222 changes: 222 additions & 0 deletions examples/tiny_programs.py
@@ -0,0 +1,222 @@
from datasets import Dataset, load_dataset
from fastdata.core import FastData
from pydantic import BaseModel, Field
from typing import Literal

class TinyProgram(BaseModel):
    requirements: str = Field(description="A description of the requirements for the program to help the persona.")
    code: str = Field(description="The code that satisfies the requirements. Ensure it is well written and documented.")

class TranslationCritique(BaseModel):
    critique: str = Field(description="A critique of the code.")
    score: Literal[1, 2, 3, 4, 5] = Field(description="A score of the code from 1 to 5.")

examples = [
    TinyProgram(
        requirements="A Python-based data aggregation and analysis tool that scrapes key Salvadoran news websites and government portals for the latest political updates, election results, and policy changes. The program would use standard libraries like requests for web scraping, re for text parsing, and pandas for data manipulation. It would store the collected information in a structured format, perform basic sentiment analysis on news articles, and generate a daily summary report highlighting significant political events, trending topics, and shifts in public opinion. The tool could also track mentions of key political figures and parties, providing a quick overview of their media presence and associated sentiments.",
        code="""\
```python
import requests
from bs4 import BeautifulSoup
import pandas as pd
from textblob import TextBlob
from collections import Counter
import datetime

def scrape_news(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    articles = soup.find_all('article', class_='article-item')
    news_data = []
    for article in articles:
        title = article.find('h2', class_='article-title').text.strip()
        summary = article.find('p', class_='article-summary').text.strip()
        news_data.append({'title': title, 'summary': summary})
    return news_data

def analyze_sentiment(text):
    return TextBlob(text).sentiment.polarity

def generate_report(data):
    df = pd.DataFrame(data)
    df['sentiment'] = df['summary'].apply(analyze_sentiment)
    # Calculate average sentiment
    avg_sentiment = df['sentiment'].mean()
    # Find most mentioned words
    all_words = ' '.join(df['title'] + ' ' + df['summary']).lower().split()
    word_freq = Counter(word for word in all_words if len(word) > 3)
    top_words = word_freq.most_common(5)
    # Generate report
    report = f"Daily Political Analysis Report for El Salvador - {datetime.date.today()}\n\n"
    report += f"Number of articles analyzed: {len(df)}\n"
    report += f"Average sentiment: {'Positive' if avg_sentiment > 0 else 'Negative'} ({avg_sentiment:.2f})\n\n"
    report += "Top mentioned words:\n"
    for word, count in top_words:
        report += f"- {word}: {count} times\n"
    report += "\nMost positive article:\n"
    pos_article = df.loc[df['sentiment'].idxmax()]
    report += f"Title: {pos_article['title']}\nSentiment: {pos_article['sentiment']:.2f}\n\n"
    report += "Most negative article:\n"
    neg_article = df.loc[df['sentiment'].idxmin()]
    report += f"Title: {neg_article['title']}\nSentiment: {neg_article['sentiment']:.2f}\n"
    return report

def main():
    url = "https://www.elsalvador.com/noticias/nacional/"  # Example Salvadoran news website
    news_data = scrape_news(url)
    report = generate_report(news_data)
    print(report)
    # Optionally, save the report to a file
    with open(f"el_salvador_political_report_{datetime.date.today()}.txt", "w") as f:
        f.write(report)

if __name__ == "__main__":
    main()
```
"""
    ),
    TinyProgram(
        requirements="A \"Joke Personalizer\" program that takes a comedian's standard jokes as input and automatically modifies them to include inside references and shared memories from their school days. The program should use a dictionary of preset keywords (e.g., \"cafeteria\", \"Mr. Johnson's class\") and replace generic terms in the jokes with these personalized references. It should also have a \"nostalgia meter\" that rates how many childhood callbacks are in each joke, and a \"groan factor\" estimator based on the corniness of the puns. Bonus feature: a \"detention probability calculator\" that humorously predicts how likely the joke would have gotten them in trouble back in school.",
        code="""\
```python
import random
import re

class JokePersonalizer:
    def __init__(self):
        self.inside_references = {
            "restaurant": "cafeteria",
            "teacher": "Mr. Johnson",
            "friend": "Danny 'Braces' Smith",
            "car": "rusty old bike",
            "mall": "corner store",
            "party": "detention",
            "cool": "totally radical",
            "phone": "Gameboy",
            "computer": "TI-83 calculator",
            "internet": "library encyclopedia"
        }
        self.pun_words = ["cheesy", "corny", "hilarious", "side-splitting", "knee-slapping"]

    def personalize_joke(self, joke):
        for generic, personal in self.inside_references.items():
            joke = re.sub(r'\b' + generic + r'\b', personal, joke, flags=re.IGNORECASE)
        return joke

    def nostalgia_meter(self, joke):
        count = sum(1 for ref in self.inside_references.values() if ref.lower() in joke.lower())
        return min(count * 20, 100)  # 20 points per reference, max 100

    def groan_factor(self, joke):
        pun_count = sum(1 for word in self.pun_words if word.lower() in joke.lower())
        return min(pun_count * 25, 100)  # 25 points per pun word, max 100

    def detention_probability(self, joke):
        naughty_words = ["detention", "trouble", "principal's office", "suspended"]
        probability = sum(10 for word in naughty_words if word.lower() in joke.lower())
        return min(probability, 100)  # 10% per naughty word, max 100%

    def process_joke(self, original_joke):
        personalized_joke = self.personalize_joke(original_joke)
        nostalgia = self.nostalgia_meter(personalized_joke)
        groan = self.groan_factor(personalized_joke)
        detention_prob = self.detention_probability(personalized_joke)
        return {
            "original": original_joke,
            "personalized": personalized_joke,
            "nostalgia_rating": nostalgia,
            "groan_factor": groan,
            "detention_probability": detention_prob
        }

# Example usage
personalizer = JokePersonalizer()
jokes = [
    "I went to a restaurant last night and had the best meal ever!",
    "My teacher asked me to stay after class, it was so cool!",
    "I threw a party and nobody came. It was a real phone-y situation!",
]

for joke in jokes:
    result = personalizer.process_joke(joke)
    print(f"Original: {result['original']}")
    print(f"Personalized: {result['personalized']}")
    print(f"Nostalgia Rating: {result['nostalgia_rating']}%")
    print(f"Groan Factor: {result['groan_factor']}%")
    print(f"Detention Probability: {result['detention_probability']}%")
    print()
```
"""
    ),
]
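# Render the example programs as a bulleted list that can be spliced into the prompt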
examples = "\n".join(f"- {example}" for example in examples)

# Load personas
personas = load_dataset("proj-persona/PersonaHub", "persona", split='train').select(range(1_000))['persona']

prompt_template = """\
Here are some examples:
{examples}
Create requirements and the python program that satisfies them for the following persona: {persona}
"""

# Generate tiny programs
fast_data = FastData()
tiny_programs = fast_data.generate(
    prompt_template=prompt_template,
    inputs=[{"persona": persona, "examples": examples} for persona in personas],
    response_model=TinyProgram,
    model="claude-3-haiku-20240307"
)
# remove Nones
tiny_programs = [t for t in tiny_programs if t is not None]

critique_template = """\
Below is a code snippet. Evaluate its educational value for teaching programming to beginners in this language, using the additive 5-point scoring system described below. Points are accumulated based on the satisfaction of each criterion:
- Add 1 point if the code is syntactically correct and runs without errors, providing a basic example of working code in the language.
- Add another point if the code demonstrates fundamental programming concepts (e.g., variables, control structures, functions) in a straightforward manner, even if it's not optimized or doesn't follow all best practices.
- Award a third point if the code is well-commented, explaining key concepts and the purpose of different code sections. It should be readable and illustrate good naming conventions, making it easier for beginners to understand.
- Grant a fourth point if the code showcases language-specific features or common programming patterns in an accessible way. It should provide clear examples of how to apply these concepts practically.
- Bestow a fifth point if the code is an exemplary teaching tool, striking an excellent balance between simplicity and real-world applicability. It should inspire further learning, possibly including deliberate mistakes or opportunities for improvement that a teacher could use as discussion points.
The code snippet:
```python
{code}
```
After examining the code:
- Briefly justify your total score, up to 100 words, focusing on its effectiveness as a teaching tool for beginners.
- Conclude with the score.
"""

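# Have a stronger model (Claude 3.5 Sonnet) critique and score each generated program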
critiques = fast_data.generate(
    prompt_template=critique_template,
    inputs=[{"code": f"{t['code']}"} for t in tiny_programs],
    response_model=TranslationCritique,
    model="claude-3-5-sonnet-20240620"
)

# Update tiny_programs with critiques
for program, critique in zip(tiny_programs, critiques):
    if program is None or critique is None:
        continue
    program['critique'] = critique['critique']
    program['score'] = critique['score']


ds = Dataset.from_list(tiny_programs)
ds.push_to_hub("answerdotai/tiny_programs", private=True)
3 changes: 1 addition & 2 deletions fastdata/core.py
@@ -59,7 +59,6 @@ def process_input(input_data):
futures = [executor.submit(process_input, input_data) for input_data in inputs]
for future in tqdm(concurrent.futures.as_completed(futures), total=len(inputs)):
    result = future.result()
-    if result:
-        results.append(result)
+    results.append(result)

return results
25 changes: 6 additions & 19 deletions nbs/00_core.ipynb
@@ -30,7 +30,7 @@
},
{
"cell_type": "code",
-"execution_count": 1,
+"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -45,7 +45,7 @@
},
{
"cell_type": "code",
-"execution_count": 2,
+"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -95,15 +95,14 @@
" futures = [executor.submit(process_input, input_data) for input_data in inputs]\n",
" for future in tqdm(concurrent.futures.as_completed(futures), total=len(inputs)):\n",
" result = future.result()\n",
-" if result:\n",
-" results.append(result)\n",
+" results.append(result)\n",
" \n",
" return results"
]
},
{
"cell_type": "code",
-"execution_count": 7,
+"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -178,7 +177,7 @@
},
{
"cell_type": "code",
-"execution_count": 9,
+"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -284,7 +283,7 @@
},
{
"cell_type": "code",
-"execution_count": 10,
+"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -298,18 +297,6 @@
"display_name": "python3",
"language": "python",
"name": "python3"
-},
-"language_info": {
-"codemirror_mode": {
-"name": "ipython",
-"version": 3
-},
-"file_extension": ".py",
-"mimetype": "text/x-python",
-"name": "python",
-"nbconvert_exporter": "python",
-"pygments_lexer": "ipython3",
-"version": "3.10.12"
-}
+}
},
"nbformat": 4,