"""Proof of concept: ask an OpenAI chat model for topics of community notes.

Usage::

    python add_topic_poc.py INPUT_JSON OUTPUT_JSON

``INPUT_JSON`` is a JSON array of objects carrying ``noteId``, ``tweetBody``
and ``noteBody``.  For every element one chat-completion request is sent and
the parsed JSON answers are written, in input order, to ``OUTPUT_JSON``.
"""

import json
import os
from argparse import ArgumentParser
from typing import TYPE_CHECKING, Any, Dict, List, Optional

if TYPE_CHECKING:  # annotation-only; the runtime import happens in main()
    from openai import OpenAI

# Example of the expected response shape; embedded verbatim in the system prompt.
response_sample = """
{
  "1700958646329": {
    "topics": ["医療", "福祉", "政治"],
    "language": "en"
  }
}
"""

# Few-shot example shipped next to this script.
_FEWSHOT_PATH = os.path.join(os.path.dirname(__file__), "fewshot_sample.json")


def _load_fewshot_sample() -> Dict[str, Any]:
    """Read the few-shot example (keys: ``tweet``, ``note``, ``topics``)."""
    # Explicit UTF-8: the file holds Japanese text and the platform default
    # encoding is not guaranteed to be UTF-8.
    with open(_FEWSHOT_PATH, "r", encoding="utf-8") as f:
        return json.load(f)


def _build_messages(
    fewshot_sample: Dict[str, Any], note_id: int, tweet: str, note: str
) -> List[Dict[str, str]]:
    """Build the system (instructions + few-shot) and user (target) messages."""
    sample_tweet = fewshot_sample["tweet"]
    sample_note = fewshot_sample["note"]
    sample_topics = " ".join(fewshot_sample["topics"])

    system_prompt = f"""以下はツイートと、それに追加されたコミュニティノートです。
ツイート:
```
{sample_tweet}
```
コミュニティノート:
```
{sample_note}
```
このセットに対してのトピックは「{sample_topics}」です。
これを踏まえて、以下のセットに対して同じ粒度で複数のトピック(少なくとも3つ)を提示してください。形式はJSONで、キーをtopicsとして値にトピックを配列で格納してください。また、ツイートに用いられている言語も推定し、キーをlanguageとしてiso 639-1に準拠した言語コードを格納してください。topicsとlanguageを格納するオブジェクトはnote idをキーとした値に格納してください
レスポンスの例 (1700958646329はnote id):
```
{response_sample}
```
"""
    user_prompt = f"""
note id: {note_id}
ツイート:
```
{tweet}
```
コミュニティノート:
```
{note}
```
"""
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]


def _parse_topic_response(content: Optional[str]) -> Dict[str, Dict[str, Any]]:
    """Decode the model's reply into a dict, tolerating a Markdown code fence.

    Raises:
        ValueError: if the reply is empty, or (as ``json.JSONDecodeError``)
            if it is not valid JSON.
    """
    if content is None or not content.strip():
        raise ValueError("OpenAI response contained no message content")

    text = content.strip()
    # The prompt shows its example inside a ``` fence, so the model may answer
    # the same way; peel the fence off before decoding.
    if text.startswith("```"):
        newline_at = text.find("\n")
        # Drop the whole opening fence line, including any language tag.
        text = text[newline_at + 1:] if newline_at != -1 else text[3:]
        text = text.rstrip()
        if text.endswith("```"):
            text = text[:-3]
    return json.loads(text)


def get_topic(
    client: "OpenAI",
    note_id: int,
    tweet: str,
    note: str,
    fewshot_sample: Optional[Dict[str, Any]] = None,
    model: str = "gpt-3.5-turbo",
) -> Dict[str, Dict[str, Any]]:
    """Ask the chat model for topics and language of one tweet/note pair.

    Args:
        client: Object exposing ``chat.completions.create`` (an ``OpenAI``
            client in production).
        note_id: Identifier of the note; echoed to the model, which is asked
            to use it as the key of its JSON answer.
        tweet: Body of the tweet.
        note: Body of the community note attached to the tweet.
        fewshot_sample: Pre-loaded few-shot example with ``tweet``, ``note``
            and ``topics`` keys.  When omitted it is read from
            ``fewshot_sample.json`` next to this script; callers processing
            many notes should load it once and pass it in.
        model: Chat model name.

    Returns:
        The decoded JSON reply.  The prompt requests the shape
        ``{"<note id>": {"topics": [...], "language": "<code>"}}``, but the
        reply is not validated against it.

    Raises:
        ValueError: if the reply is empty or not valid JSON.
    """
    print(f"note id: {note_id}")
    if fewshot_sample is None:
        fewshot_sample = _load_fewshot_sample()

    chat_completion = client.chat.completions.create(
        messages=_build_messages(fewshot_sample, note_id, tweet, note),
        model=model,
        temperature=0.0,
    )
    return _parse_topic_response(chat_completion.choices[0].message.content)


def main() -> None:
    """Command-line entry point: annotate every note in the input file."""
    # Third-party imports are deferred to the entry point so the helpers above
    # can be imported (e.g. for tests) without the SDK installed.
    from dotenv import load_dotenv
    from openai import OpenAI

    parser = ArgumentParser(
        description="Annotate community notes with topics via the OpenAI API."
    )
    parser.add_argument(
        "input_file",
        help="JSON array of notes with noteId, tweetBody and noteBody keys",
    )
    parser.add_argument("output_file", help="path of the JSON file to write")
    args = parser.parse_args()

    # Load .env before constructing the client so values defined there are
    # already in the environment (presumably the API key - confirm).
    load_dotenv()
    client = OpenAI()

    with open(args.input_file, "r", encoding="utf-8") as f:
        notes = json.load(f)

    fewshot_sample = _load_fewshot_sample()
    # Collect every result before touching the output path, so a failing
    # request does not leave a truncated, empty output file behind.
    results = [
        get_topic(
            client,
            note["noteId"],
            note["tweetBody"],
            note["noteBody"],
            fewshot_sample=fewshot_sample,
        )
        for note in notes
    ]

    with open(args.output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)


if __name__ == "__main__":
    main()