chore: Refactor code to remove unnecessary conditional statement, add readme, and tiny programs example
ncoop57 committed Sep 11, 2024
1 parent 34bf0b7 commit 9e46486
Showing 6 changed files with 1,231 additions and 72 deletions.
69 changes: 63 additions & 6 deletions README.md
@@ -3,9 +3,6 @@

<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->

This file will become your README and also the index of your
documentation.

## Developer Guide

If you are new to using `nbdev` here are some useful pointers to get you
@@ -58,10 +55,70 @@ find package manager specific guidelines on

## How to use

First you need to define the structure of the data you want to generate. `instructor`, the library that fastdata uses to generate data, requires you to define this schema as a Pydantic model.

``` python
from pydantic import BaseModel, Field

class Translation(BaseModel):
    english: str = Field(description="An english phrase")
    german: str = Field(description="An equivalent german phrase that is a translation of the english phrase")
```
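
Since `instructor` sends the field descriptions to the model along with the schema, more precise descriptions tend to yield better data. A single record conforming to this schema looks like the following (a hand-written illustration, not generated output):

``` python
example = Translation(english="Hello, world!", german="Hallo, Welt!")
print(example)
# english='Hello, world!' german='Hallo, Welt!'
```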

Next, you need to define the prompt that will be used to generate the
data and any inputs you want to pass to the prompt.

``` python
prompt_template = """\
Generate English and German translations on the following topic:
{topic}
"""

inputs = [{"topic": "Otters are cute"}, {"topic": "I love programming"}]
```
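
Each dictionary in `inputs` fills the placeholders in the template, producing one prompt per dictionary. Conceptually the substitution behaves like Python's `str.format` (an illustrative sketch, not fastdata's internal code):

``` python
for i in inputs:
    print(prompt_template.format(**i))
# Generate English and German translations on the following topic:
# Otters are cute
# ...
```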

Finally, we can generate some data with fastdata.

> [!NOTE]
>
> We only support Anthropic models at the moment. Therefore, make sure
> you have an API key for the model you want to use and the proper
> environment variables set or pass the api key to the
> [`FastData`](https://AnswerDotAI.github.io/fastdata/core.html#fastdata)
> class `FastData(api_key="sk-ant-api03-...")`.
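
For example, a minimal sketch of supplying the key via the environment, assuming the standard `ANTHROPIC_API_KEY` variable read by Anthropic's SDK:

``` python
import os

# Assumption: Anthropic's SDK reads this environment variable;
# alternatively, pass the key explicitly with FastData(api_key="sk-ant-api03-...")
os.environ["ANTHROPIC_API_KEY"] = "sk-ant-api03-..."
```
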
``` python
from fastdata.core import FastData

import pprint

# Create a pretty printer object with custom settings
pp = pprint.PrettyPrinter(indent=4, width=100, compact=False)

fast_data = FastData()
translations = fast_data.generate(
    prompt_template=prompt_template,
    inputs=inputs,
    response_model=Translation,
    model="claude-3-haiku-20240307"
)

# Pretty print the translations
print("Translations:")
pp.pprint(translations)
```

100%|██████████| 2/2 [00:00<00:00, 2.21it/s]

Translations:
[ {'english': 'Otters are cute', 'german': 'Otter sind süß'},
{'english': 'I love programming', 'german': 'Ich liebe das Programmieren'}]
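
The results come back as plain dictionaries keyed by the schema's fields, so you can index into them directly:

``` python
# Each generated record is a dict with the schema's fields
print(translations[0]["german"])  # e.g. 'Otter sind süß'
```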

If you’d like to see how best to generate data with fastdata, check out
our blog post [here](https://www.answer.ai/blog/introducing-fastdata)
and some of the examples in the
[examples](https://github.com/AnswerDotAI/fastdata/tree/main/examples)
directory.
222 changes: 222 additions & 0 deletions examples/tiny_programs.py
@@ -0,0 +1,222 @@
from datasets import Dataset, load_dataset
from fastdata.core import FastData
from pydantic import BaseModel, Field
from typing import Literal

class TinyProgram(BaseModel):
    requirements: str = Field(description="A description of the requirements for the program to help the persona.")
    code: str = Field(description="The code that satisfies the requirements. Ensure it is well written and documented.")

class TranslationCritique(BaseModel):
    critique: str = Field(description="A critique of the code.")
    score: Literal[1, 2, 3, 4, 5] = Field(description="A score of the code from 1 to 5.")

examples = [
    TinyProgram(
        requirements="A Python-based data aggregation and analysis tool that scrapes key Salvadoran news websites and government portals for the latest political updates, election results, and policy changes. The program would use standard libraries like requests for web scraping, re for text parsing, and pandas for data manipulation. It would store the collected information in a structured format, perform basic sentiment analysis on news articles, and generate a daily summary report highlighting significant political events, trending topics, and shifts in public opinion. The tool could also track mentions of key political figures and parties, providing a quick overview of their media presence and associated sentiments.",
        code="""\
```python
import requests
from bs4 import BeautifulSoup
import pandas as pd
from textblob import TextBlob
from collections import Counter
import datetime

def scrape_news(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    articles = soup.find_all('article', class_='article-item')
    news_data = []
    for article in articles:
        title = article.find('h2', class_='article-title').text.strip()
        summary = article.find('p', class_='article-summary').text.strip()
        news_data.append({'title': title, 'summary': summary})
    return news_data

def analyze_sentiment(text):
    return TextBlob(text).sentiment.polarity

def generate_report(data):
    df = pd.DataFrame(data)
    df['sentiment'] = df['summary'].apply(analyze_sentiment)
    # Calculate average sentiment
    avg_sentiment = df['sentiment'].mean()
    # Find most mentioned words
    all_words = ' '.join(df['title'] + ' ' + df['summary']).lower().split()
    word_freq = Counter(word for word in all_words if len(word) > 3)
    top_words = word_freq.most_common(5)
    # Generate report
    report = f"Daily Political Analysis Report for El Salvador - {datetime.date.today()}\n\n"
    report += f"Number of articles analyzed: {len(df)}\n"
    report += f"Average sentiment: {'Positive' if avg_sentiment > 0 else 'Negative'} ({avg_sentiment:.2f})\n\n"
    report += "Top mentioned words:\n"
    for word, count in top_words:
        report += f"- {word}: {count} times\n"
    report += "\nMost positive article:\n"
    pos_article = df.loc[df['sentiment'].idxmax()]
    report += f"Title: {pos_article['title']}\nSentiment: {pos_article['sentiment']:.2f}\n\n"
    report += "Most negative article:\n"
    neg_article = df.loc[df['sentiment'].idxmin()]
    report += f"Title: {neg_article['title']}\nSentiment: {neg_article['sentiment']:.2f}\n"
    return report

def main():
    url = "https://www.elsalvador.com/noticias/nacional/"  # Example Salvadoran news website
    news_data = scrape_news(url)
    report = generate_report(news_data)
    print(report)
    # Optionally, save the report to a file
    with open(f"el_salvador_political_report_{datetime.date.today()}.txt", "w") as f:
        f.write(report)

if __name__ == "__main__":
    main()
```
"""
    ),
    TinyProgram(
        requirements="A \"Joke Personalizer\" program that takes a comedian's standard jokes as input and automatically modifies them to include inside references and shared memories from their school days. The program should use a dictionary of preset keywords (e.g., \"cafeteria\", \"Mr. Johnson's class\") and replace generic terms in the jokes with these personalized references. It should also have a \"nostalgia meter\" that rates how many childhood callbacks are in each joke, and a \"groan factor\" estimator based on the corniness of the puns. Bonus feature: a \"detention probability calculator\" that humorously predicts how likely the joke would have gotten them in trouble back in school.",
        code="""\
```python
import random
import re

class JokePersonalizer:
    def __init__(self):
        self.inside_references = {
            "restaurant": "cafeteria",
            "teacher": "Mr. Johnson",
            "friend": "Danny 'Braces' Smith",
            "car": "rusty old bike",
            "mall": "corner store",
            "party": "detention",
            "cool": "totally radical",
            "phone": "Gameboy",
            "computer": "TI-83 calculator",
            "internet": "library encyclopedia"
        }
        self.pun_words = ["cheesy", "corny", "hilarious", "side-splitting", "knee-slapping"]

    def personalize_joke(self, joke):
        for generic, personal in self.inside_references.items():
            joke = re.sub(r'\b' + generic + r'\b', personal, joke, flags=re.IGNORECASE)
        return joke

    def nostalgia_meter(self, joke):
        count = sum(1 for ref in self.inside_references.values() if ref.lower() in joke.lower())
        return min(count * 20, 100)  # 20 points per reference, max 100

    def groan_factor(self, joke):
        pun_count = sum(1 for word in self.pun_words if word.lower() in joke.lower())
        return min(pun_count * 25, 100)  # 25 points per pun word, max 100

    def detention_probability(self, joke):
        naughty_words = ["detention", "trouble", "principal's office", "suspended"]
        probability = sum(10 for word in naughty_words if word.lower() in joke.lower())
        return min(probability, 100)  # 10% per naughty word, max 100%

    def process_joke(self, original_joke):
        personalized_joke = self.personalize_joke(original_joke)
        nostalgia = self.nostalgia_meter(personalized_joke)
        groan = self.groan_factor(personalized_joke)
        detention_prob = self.detention_probability(personalized_joke)
        return {
            "original": original_joke,
            "personalized": personalized_joke,
            "nostalgia_rating": nostalgia,
            "groan_factor": groan,
            "detention_probability": detention_prob
        }

# Example usage
personalizer = JokePersonalizer()
jokes = [
    "I went to a restaurant last night and had the best meal ever!",
    "My teacher asked me to stay after class, it was so cool!",
    "I threw a party and nobody came. It was a real phone-y situation!",
]

for joke in jokes:
    result = personalizer.process_joke(joke)
    print(f"Original: {result['original']}")
    print(f"Personalized: {result['personalized']}")
    print(f"Nostalgia Rating: {result['nostalgia_rating']}%")
    print(f"Groan Factor: {result['groan_factor']}%")
    print(f"Detention Probability: {result['detention_probability']}%")
    print()
```
"""
    ),
]
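# Render the example programs as a bulleted list that can be spliced into the prompt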
examples = "\n".join(f"- {example}" for example in examples)

# Load personas
personas = load_dataset("proj-persona/PersonaHub", "persona", split='train').select(range(1_000))['persona']

prompt_template = """\
Here are some examples:
{examples}
Create requirements and the python program that satisfies them for the following persona: {persona}
"""

# Generate tiny programs
fast_data = FastData()
tiny_programs = fast_data.generate(
    prompt_template=prompt_template,
    inputs=[{"persona": persona, "examples": examples} for persona in personas],
    response_model=TinyProgram,
    model="claude-3-haiku-20240307"
)
# remove Nones
tiny_programs = [t for t in tiny_programs if t is not None]

critique_template = """\
Below is a code snippet. Evaluate its educational value for teaching programming to beginners in this language, using the additive 5-point scoring system described below. Points are accumulated based on the satisfaction of each criterion:
- Add 1 point if the code is syntactically correct and runs without errors, providing a basic example of working code in the language.
- Add another point if the code demonstrates fundamental programming concepts (e.g., variables, control structures, functions) in a straightforward manner, even if it's not optimized or doesn't follow all best practices.
- Award a third point if the code is well-commented, explaining key concepts and the purpose of different code sections. It should be readable and illustrate good naming conventions, making it easier for beginners to understand.
- Grant a fourth point if the code showcases language-specific features or common programming patterns in an accessible way. It should provide clear examples of how to apply these concepts practically.
- Bestow a fifth point if the code is an exemplary teaching tool, striking an excellent balance between simplicity and real-world applicability. It should inspire further learning, possibly including deliberate mistakes or opportunities for improvement that a teacher could use as discussion points.
The code snippet:
```python
{code}
```
After examining the code:
- Briefly justify your total score, up to 100 words, focusing on its effectiveness as a teaching tool for beginners.
- Conclude with the score.
"""

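# Have a stronger model (Claude 3.5 Sonnet) critique and score each generated program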
critiques = fast_data.generate(
    prompt_template=critique_template,
    inputs=[{"code": f"{t['code']}"} for t in tiny_programs],
    response_model=TranslationCritique,
    model="claude-3-5-sonnet-20240620"
)

# Update tiny_programs with critiques
for program, critique in zip(tiny_programs, critiques):
    if program is None or critique is None:
        continue
    program['critique'] = critique['critique']
    program['score'] = critique['score']


ds = Dataset.from_list(tiny_programs)
ds.push_to_hub("answerdotai/tiny_programs", private=True)
3 changes: 1 addition & 2 deletions fastdata/core.py
@@ -59,7 +59,6 @@ def process_input(input_data):
futures = [executor.submit(process_input, input_data) for input_data in inputs]
for future in tqdm(concurrent.futures.as_completed(futures), total=len(inputs)):
    result = future.result()
-    if result:
-        results.append(result)
+    results.append(result)

return results
25 changes: 6 additions & 19 deletions nbs/00_core.ipynb
@@ -30,7 +30,7 @@
},
{
"cell_type": "code",
-"execution_count": 1,
+"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -45,7 +45,7 @@
},
{
"cell_type": "code",
-"execution_count": 2,
+"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -95,15 +95,14 @@
" futures = [executor.submit(process_input, input_data) for input_data in inputs]\n",
" for future in tqdm(concurrent.futures.as_completed(futures), total=len(inputs)):\n",
" result = future.result()\n",
-" if result:\n",
-" results.append(result)\n",
+" results.append(result)\n",
" \n",
" return results"
]
},
{
"cell_type": "code",
-"execution_count": 7,
+"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -178,7 +177,7 @@
},
{
"cell_type": "code",
-"execution_count": 9,
+"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -284,7 +283,7 @@
},
{
"cell_type": "code",
-"execution_count": 10,
+"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -298,18 +297,6 @@
"display_name": "python3",
"language": "python",
"name": "python3"
-},
-"language_info": {
-"codemirror_mode": {
-"name": "ipython",
-"version": 3
-},
-"file_extension": ".py",
-"mimetype": "text/x-python",
-"name": "python",
-"nbconvert_exporter": "python",
-"pygments_lexer": "ipython3",
-"version": "3.10.12"
-}
+}
},
"nbformat": 4,