From ccdc6dc65df3b2ea321abd96beb09333c8fc5996 Mon Sep 17 00:00:00 2001 From: osoken Date: Tue, 21 Nov 2023 22:12:14 +0900 Subject: [PATCH 1/6] feat(package): add openai package --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 150b2bb..d5db495 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,8 @@ dependencies = [ "sqlalchemy", "pydantic_settings", "fastapi", - "JSON-log-formatter" + "JSON-log-formatter", + "openai" ] [project.urls] From 32013c173c6618d2e863e54bb8ede632f90cb34a Mon Sep 17 00:00:00 2001 From: osoken Date: Tue, 21 Nov 2023 22:14:01 +0900 Subject: [PATCH 2/6] feat(poc): add topic extraction script using gpt-4 --- scripts/add_topic_poc.py | 66 +++++++++++++++++++++++++++++++++++++ scripts/fewshot_sample.json | 7 ++++ 2 files changed, 73 insertions(+) create mode 100644 scripts/add_topic_poc.py create mode 100644 scripts/fewshot_sample.json diff --git a/scripts/add_topic_poc.py b/scripts/add_topic_poc.py new file mode 100644 index 0000000..fc250c5 --- /dev/null +++ b/scripts/add_topic_poc.py @@ -0,0 +1,66 @@ +import json +import os +from argparse import ArgumentParser + +from dotenv import load_dotenv +from openai import OpenAI + + +def get_topic(client: OpenAI, tweet: str, note: str) -> str: + with open(os.path.join(os.path.dirname(__file__), "fewshot_sample.json"), "r") as f: + fewshot_sample = json.load(f) + + chat_completion = client.chat.completions.create( + messages=[ + { + "role": "system", + "content": f"""以下はツイートと、それに追加されたコミュニティノートです。 +ツイート: +``` +{fewshot_sample["tweet"]} +``` +コミュニティノート: +``` +{fewshot_sample["note"]} +``` +このセットに対してのトピックは「{" ".join(fewshot_sample["topics"])}」です。 +これを踏まえて、以下のツイートとコミュニティノートに対して同じ粒度で複数のトピックを提示してください。形式はJSONで、キーをtopicsとして値にトピックを配列で格納してください。 +""", + }, + { + "role": "user", + "content": f"""ツイート: +``` +{tweet} +``` +コミュニティノート: +``` +{note} +``` +""", + }, + ], + model="gpt-4", + ) + + return json.loads(chat_completion.choices[0].message.content) + + +if __name__ == "__main__": + parser = ArgumentParser() + # parser.add_argument("input_file") + # parser.add_argument("output_file") + args = parser.parse_args() + load_dotenv() + client = OpenAI() + # with open(args.input_file, "r", encoding="utf-8") as f: + # tweets = json.load(f) + print( + get_topic( + client, + """Tweet content goes here. +""", + """Community note goes here +""", + ) + ) diff --git a/scripts/fewshot_sample.json b/scripts/fewshot_sample.json new file mode 100644 index 0000000..25991c8 --- /dev/null +++ b/scripts/fewshot_sample.json @@ -0,0 +1,7 @@ +{ + "tweet": "test twwet", + "note": "test note", + "topics": [ + "test topic" + ] +} \ No newline at end of file From b7b59f3138f9b246e47f7b08aaca3d4d366045ab Mon Sep 17 00:00:00 2001 From: osoken Date: Tue, 21 Nov 2023 22:21:43 +0900 Subject: [PATCH 3/6] fix(poc): lower temperature and fix type annotation --- scripts/add_topic_poc.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/add_topic_poc.py b/scripts/add_topic_poc.py index fc250c5..74a0afd 100644 --- a/scripts/add_topic_poc.py +++ b/scripts/add_topic_poc.py @@ -1,12 +1,13 @@ import json import os from argparse import ArgumentParser +from typing import Dict, List from dotenv import load_dotenv from openai import OpenAI -def get_topic(client: OpenAI, tweet: str, note: str) -> str: +def get_topic(client: OpenAI, tweet: str, note: str) -> Dict[str, List[str]]: with open(os.path.join(os.path.dirname(__file__), "fewshot_sample.json"), "r") as f: fewshot_sample = json.load(f) @@ -41,6 +42,7 @@ def get_topic(client: OpenAI, tweet: str, note: str) -> str: }, ], model="gpt-4", + temperature=0.0, ) return json.loads(chat_completion.choices[0].message.content) From 22a837ed6ea9d6fe549616503238b4ce03738274 Mon Sep 17 00:00:00 2001 From: kota-yata Date: Wed, 6 Dec 2023 16:05:26 +0900 Subject: [PATCH 4/6] add response sample to the prompt message --- scripts/add_topic_poc.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/add_topic_poc.py b/scripts/add_topic_poc.py index 74a0afd..4d60fb4 100644 --- a/scripts/add_topic_poc.py +++ b/scripts/add_topic_poc.py @@ -26,6 +26,12 @@ def get_topic(client: OpenAI, tweet: str, note: str) -> Dict[str, List[str]]: ``` このセットに対してのトピックは「{" ".join(fewshot_sample["topics"])}」です。 これを踏まえて、以下のツイートとコミュニティノートに対して同じ粒度で複数のトピックを提示してください。形式はJSONで、キーをtopicsとして値にトピックを配列で格納してください。 +レスポンスの例: +``` +{ + "topics": ["医療", "福祉", ...] +} +``` """, }, { @@ -41,7 +47,7 @@ def get_topic(client: OpenAI, tweet: str, note: str) -> Dict[str, List[str]]: """, }, ], - model="gpt-4", + model="gpt-3.5-turbo", temperature=0.0, ) From 51d4b387889833e1c587e2a0edca854c3b64fdbd Mon Sep 17 00:00:00 2001 From: kota-yata Date: Mon, 11 Dec 2023 15:32:42 +0900 Subject: [PATCH 5/6] modify prompt --- scripts/add_topic_poc.py | 47 +++++++++++++++++++++++-------------- scripts/data_sample.json | 44 ++++++++++++++++++++++++++++++++++ scripts/fewshot_sample.json | 6 ++--- 3 files changed, 76 insertions(+), 21 deletions(-) create mode 100644 scripts/data_sample.json diff --git a/scripts/add_topic_poc.py b/scripts/add_topic_poc.py index 4d60fb4..c5d8123 100644 --- a/scripts/add_topic_poc.py +++ b/scripts/add_topic_poc.py @@ -6,8 +6,16 @@ from dotenv import load_dotenv from openai import OpenAI +response_sample = """ +{ + "1700958646329": { + "topics": ["医療, 福祉"...], + "language": "en" + } +} +""" -def get_topic(client: OpenAI, tweet: str, note: str) -> Dict[str, List[str]]: +def get_topic(client: OpenAI, note_id: int, tweet: str, note: str) -> Dict[str, List[str]]: with open(os.path.join(os.path.dirname(__file__), "fewshot_sample.json"), "r") as f: fewshot_sample = json.load(f) @@ -25,18 +33,18 @@ def get_topic(client: OpenAI, tweet: str, note: str) -> Dict[str, List[str]]: {fewshot_sample["note"]} ``` このセットに対してのトピックは「{" ".join(fewshot_sample["topics"])}」です。 -これを踏まえて、以下のツイートとコミュニティノートに対して同じ粒度で複数のトピックを提示してください。形式はJSONで、キーをtopicsとして値にトピックを配列で格納してください。 -レスポンスの例: +これを踏まえて、以下のセットに対して同じ粒度で複数のトピック(少なくとも3つ)を提示してください。形式はJSONで、キーをtopicsとして値にトピックを配列で格納してください。また、ツイートに用いられている言語も推定し、キーをlanguageとしてiso 639-1に準拠した言語コードを格納してください。topicsとlanguageを格納するオブジェクトはnote idをキーとした値に格納してください +レスポンスの例 (1700958646329はnote id): ``` -{ - "topics": ["医療", "福祉", ...] -} +{response_sample} ``` """, }, { "role": "user", - "content": f"""ツイート: + "content": f""" +note id: {note_id} +ツイート: ``` {tweet} ``` @@ -56,19 +64,22 @@ def get_topic(client: OpenAI, tweet: str, note: str) -> Dict[str, List[str]]: if __name__ == "__main__": parser = ArgumentParser() - # parser.add_argument("input_file") + parser.add_argument("input_file") # parser.add_argument("output_file") args = parser.parse_args() load_dotenv() client = OpenAI() - # with open(args.input_file, "r", encoding="utf-8") as f: - # tweets = json.load(f) - print( - get_topic( - client, - """Tweet content goes here. -""", - """Community note goes here -""", + with open(args.input_file, "r", encoding="utf-8") as f: + notes = json.load(f) + # create output json file with topics. make the file an array of objects which is each response + with open("data/output.json", "w", encoding="utf-8") as f: + json.dump( + [ + get_topic(client, note["noteId"], note["tweetBody"], note["noteBody"]) + for note in notes + ], + f, + ensure_ascii=False, + indent=2, ) - ) + diff --git a/scripts/data_sample.json b/scripts/data_sample.json new file mode 100644 index 0000000..dd28d54 --- /dev/null +++ b/scripts/data_sample.json @@ -0,0 +1,44 @@ +[ + { + "noteId": 1727464245254574464, + "tweetId": 1724436362508558382, + "noteBody": "This app logs all of your messages in plain text to multiple services and stores data completely unencrypted. The authentication token was sent over unencrypted HTTP as well, making it vulnerable to interception. https://arstechnica.com/gadgets/2023/11/nothings-imessage-app-was-a-security-catastrophe-taken-down-in-24-hours/", + "tweetBody": "Today Nothing did something truly wild, and game me an exclusive look at it. So for this video I break down what it is (an app) what it does (iMessage for Android) and what Apple might do when they see this 🕵️ https://t.co/eGHsdkxwCT" + }, + { + "noteId": 1727940249454534919, + "tweetId": 1727796341374403056, + "noteBody": "I recall the PM being verbally attacked by the PC party in the house of commons using the fake news of the terrorist attack as spread by CNN's now very discredited news story. https://youtu.be/SrXX1M_UmDs?si=jz7FZBuhNyyrLkHa", + "tweetBody": "When biased “journalists” try doing the Liberals’ dirty work, they should at least get their facts straight.\u00a0\n\nIf they can’t do that, they could at least bring some apples with them. https://t.co/zkxqbkgO1O" + }, + { + "noteId": 1727405862547833120, + "tweetId": 1727392893470691400, + "noteBody": "Western NY airports are not locked down. Buffalo and Niagara Falls remain open. https://twitter.com/BUFAirport/status/1727398373379170386", + "tweetBody": "BREAKING: A CAR BOMB WAS DRIVEN INTO A CHECKPOINT THE US CANADA BORDER. ALL WESTERN NY AIRPORTS ON LOCKDOWN https://t.co/vXdWBUCOHg" + }, + { + "noteId": 1727826231926878548, + "tweetId": 1727744194780008463, + "noteBody": "Over an hour before Poillevre spoke in Parliament, CTV posted this article with the sentence 'Sources did tell CTV News earlier in the day that Canadian government officials were initially operating under the assumption that it was terror-related.' https://toronto.ctvnews.ca/two-people-dead-in-rainbow-bridge-vehicle", + "tweetBody": "So Pierre Poilievre, in a truly scummy moment, tried to politicize the tragedy at the Rainbow Bridge. A real leader wouldn’t use a terror attack that killed two people for political points. Especially when it wasn’t even a terror attack. https://t.co/qFDkRgB9xY" + }, + { + "noteId": 1727833882559672335, + "tweetId": 1727755158959100132, + "noteBody": "The first CTV story about this was posted at 10:33 AM. They updated it throughout the day as new information came in. The final version of the story at 5:50 PM still did not mention terrorism. https://toronto.ctvnews.ca/two-people-dead-in-rainbow-bridge-vehicle", + "tweetBody": "This screenshot has been sent to me a couple times today. It’s being spread by Pierre Poilievre supporters to try and claim that CTV News initially reported the Rainbow Bridge incident as a terrorist attack. This is not true. https://t.co/YzkkwGfFWY" + }, + { + "noteId": 1727736301062095269, + "tweetId": 1727586091136618859, + "noteBody": "元ツイートは個人的な意見を表明しており誤解を招いていないため、コミュニティノートは必要ありません 異論や細かいレトリックに関する指摘はリプライで行ってください", + "tweetBody": "一度車椅子や松葉杖の生活をしてみたらわかると思うんだけど、一日に500回くらい「すいません」「ありがとうございます」と言わないと生活していけないのは心を病むので仏頂面のまま譲られて当然みたいな顔で座っていて結構です https://t.co/DsS3RMMG2C" + }, + { + "noteId": 1728277125407314151, + "tweetId": 1728212143797833773, + "noteBody": "The BBC article mentions the suspect is in his forties, but has only lived in Ireland for twenty years. Despite his Irish citizenship this age gap strongly suggests he is an immigrant. https://www.bbc.co.uk/news/world-europe-67516612", + "tweetBody": "Online rumors claimed the perpetrator of a stabbing attack was an immigrant. The BBC found that the man was an Irish citizen who had lived in the country for 20 years. Police blamed a 'lunatic faction driven by a far-right ideology' for the riot in Dublin. https://t.co/cMrCCOKNGQ" + } +] diff --git a/scripts/fewshot_sample.json b/scripts/fewshot_sample.json index 25991c8..1e8e4bf 100644 --- a/scripts/fewshot_sample.json +++ b/scripts/fewshot_sample.json @@ -1,7 +1,7 @@ { - "tweet": "test twwet", - "note": "test note", + "tweet": "For those that care — 432 hz improves mental clarity, removes emotional blockages, reduces stress and anxiety, better sleep quality, increases creativity & inspiration, and strengthens the immune system. Play it while you sleep & watch these areas improve!", + "note": "There are no placebo controlled studies which support this. There is no evidence that this frequency has different effects from any other arbitrary frequency. https://ask.audio/articles/music-theory-432-hz-tuning-separating-fact-from-fiction", "topics": [ - "test topic" + "医療", "福祉" ] } \ No newline at end of file From 43a47e8f3dd20c3d0ba84d91ebe49de0e922425e Mon Sep 17 00:00:00 2001 From: kota-yata Date: Tue, 12 Dec 2023 03:56:03 +0900 Subject: [PATCH 6/6] modify add_topic_poc.py --- .gitignore | 2 ++ scripts/add_topic_poc.py | 11 +++++----- scripts/data_sample.json | 44 ---------------------------------------- 3 files changed, 7 insertions(+), 50 deletions(-) delete mode 100644 scripts/data_sample.json diff --git a/.gitignore b/.gitignore index 4ccf5c8..8449676 100644 --- a/.gitignore +++ b/.gitignore @@ -160,3 +160,5 @@ cython_debug/ #.idea/ .vscode/settings.json + +.DS_Store diff --git a/scripts/add_topic_poc.py b/scripts/add_topic_poc.py index c5d8123..85c6473 100644 --- a/scripts/add_topic_poc.py +++ b/scripts/add_topic_poc.py @@ -9,13 +9,14 @@ response_sample = """ { "1700958646329": { - "topics": ["医療, 福祉"...], + "topics": ["医療", "福祉", "政治"], "language": "en" } } """ def get_topic(client: OpenAI, note_id: int, tweet: str, note: str) -> Dict[str, List[str]]: + print(f"note id: {note_id}") with open(os.path.join(os.path.dirname(__file__), "fewshot_sample.json"), "r") as f: fewshot_sample = json.load(f) @@ -61,18 +62,16 @@ def get_topic(client: OpenAI, note_id: int, tweet: str, note: str) -> Dict[str, return json.loads(chat_completion.choices[0].message.content) - if __name__ == "__main__": parser = ArgumentParser() parser.add_argument("input_file") - # parser.add_argument("output_file") + parser.add_argument("output_file") args = parser.parse_args() load_dotenv() client = OpenAI() - with open(args.input_file, "r", encoding="utf-8") as f: + with open(args.input_file, "r") as f: notes = json.load(f) - # create output json file with topics. make the file an array of objects which is each response - with open("data/output.json", "w", encoding="utf-8") as f: + with open(args.output_file, "w", encoding="utf-8") as f: json.dump( [ get_topic(client, note["noteId"], note["tweetBody"], note["noteBody"]) diff --git a/scripts/data_sample.json b/scripts/data_sample.json deleted file mode 100644 index dd28d54..0000000 --- a/scripts/data_sample.json +++ /dev/null @@ -1,44 +0,0 @@ -[ - { - "noteId": 1727464245254574464, - "tweetId": 1724436362508558382, - "noteBody": "This app logs all of your messages in plain text to multiple services and stores data completely unencrypted. The authentication token was sent over unencrypted HTTP as well, making it vulnerable to interception. https://arstechnica.com/gadgets/2023/11/nothings-imessage-app-was-a-security-catastrophe-taken-down-in-24-hours/", - "tweetBody": "Today Nothing did something truly wild, and game me an exclusive look at it. So for this video I break down what it is (an app) what it does (iMessage for Android) and what Apple might do when they see this 🕵️ https://t.co/eGHsdkxwCT" - }, - { - "noteId": 1727940249454534919, - "tweetId": 1727796341374403056, - "noteBody": "I recall the PM being verbally attacked by the PC party in the house of commons using the fake news of the terrorist attack as spread by CNN's now very discredited news story. https://youtu.be/SrXX1M_UmDs?si=jz7FZBuhNyyrLkHa", - "tweetBody": "When biased “journalists” try doing the Liberals’ dirty work, they should at least get their facts straight.\u00a0\n\nIf they can’t do that, they could at least bring some apples with them. https://t.co/zkxqbkgO1O" - }, - { - "noteId": 1727405862547833120, - "tweetId": 1727392893470691400, - "noteBody": "Western NY airports are not locked down. Buffalo and Niagara Falls remain open. https://twitter.com/BUFAirport/status/1727398373379170386", - "tweetBody": "BREAKING: A CAR BOMB WAS DRIVEN INTO A CHECKPOINT THE US CANADA BORDER. ALL WESTERN NY AIRPORTS ON LOCKDOWN https://t.co/vXdWBUCOHg" - }, - { - "noteId": 1727826231926878548, - "tweetId": 1727744194780008463, - "noteBody": "Over an hour before Poillevre spoke in Parliament, CTV posted this article with the sentence 'Sources did tell CTV News earlier in the day that Canadian government officials were initially operating under the assumption that it was terror-related.' https://toronto.ctvnews.ca/two-people-dead-in-rainbow-bridge-vehicle", - "tweetBody": "So Pierre Poilievre, in a truly scummy moment, tried to politicize the tragedy at the Rainbow Bridge. A real leader wouldn’t use a terror attack that killed two people for political points. Especially when it wasn’t even a terror attack. https://t.co/qFDkRgB9xY" - }, - { - "noteId": 1727833882559672335, - "tweetId": 1727755158959100132, - "noteBody": "The first CTV story about this was posted at 10:33 AM. They updated it throughout the day as new information came in. The final version of the story at 5:50 PM still did not mention terrorism. https://toronto.ctvnews.ca/two-people-dead-in-rainbow-bridge-vehicle", - "tweetBody": "This screenshot has been sent to me a couple times today. It’s being spread by Pierre Poilievre supporters to try and claim that CTV News initially reported the Rainbow Bridge incident as a terrorist attack. This is not true. https://t.co/YzkkwGfFWY" - }, - { - "noteId": 1727736301062095269, - "tweetId": 1727586091136618859, - "noteBody": "元ツイートは個人的な意見を表明しており誤解を招いていないため、コミュニティノートは必要ありません 異論や細かいレトリックに関する指摘はリプライで行ってください", - "tweetBody": "一度車椅子や松葉杖の生活をしてみたらわかると思うんだけど、一日に500回くらい「すいません」「ありがとうございます」と言わないと生活していけないのは心を病むので仏頂面のまま譲られて当然みたいな顔で座っていて結構です https://t.co/DsS3RMMG2C" - }, - { - "noteId": 1728277125407314151, - "tweetId": 1728212143797833773, - "noteBody": "The BBC article mentions the suspect is in his forties, but has only lived in Ireland for twenty years. Despite his Irish citizenship this age gap strongly suggests he is an immigrant. https://www.bbc.co.uk/news/world-europe-67516612", - "tweetBody": "Online rumors claimed the perpetrator of a stabbing attack was an immigrant. The BBC found that the man was an Irish citizen who had lived in the country for 20 years. Police blamed a 'lunatic faction driven by a far-right ideology' for the riot in Dublin. https://t.co/cMrCCOKNGQ" - } -]