From 51d4b387889833e1c587e2a0edca854c3b64fdbd Mon Sep 17 00:00:00 2001 From: kota-yata Date: Mon, 11 Dec 2023 15:32:42 +0900 Subject: [PATCH] modify prompt --- scripts/add_topic_poc.py | 47 +++++++++++++++++++++++-------------- scripts/data_sample.json | 44 ++++++++++++++++++++++++++++++++++ scripts/fewshot_sample.json | 6 ++--- 3 files changed, 76 insertions(+), 21 deletions(-) create mode 100644 scripts/data_sample.json diff --git a/scripts/add_topic_poc.py b/scripts/add_topic_poc.py index 4d60fb4..c5d8123 100644 --- a/scripts/add_topic_poc.py +++ b/scripts/add_topic_poc.py @@ -6,8 +6,16 @@ from dotenv import load_dotenv from openai import OpenAI +response_sample = """ +{ + "1700958646329": { + "topics": ["医療, 福祉"...], + "language": "en" + } +} +""" -def get_topic(client: OpenAI, tweet: str, note: str) -> Dict[str, List[str]]: +def get_topic(client: OpenAI, note_id: int, tweet: str, note: str) -> Dict[str, List[str]]: with open(os.path.join(os.path.dirname(__file__), "fewshot_sample.json"), "r") as f: fewshot_sample = json.load(f) @@ -25,18 +33,18 @@ def get_topic(client: OpenAI, tweet: str, note: str) -> Dict[str, List[str]]: {fewshot_sample["note"]} ``` このセットに対してのトピックは「{" ".join(fewshot_sample["topics"])}」です。 -これを踏まえて、以下のツイートとコミュニティノートに対して同じ粒度で複数のトピックを提示してください。形式はJSONで、キーをtopicsとして値にトピックを配列で格納してください。 -レスポンスの例: +これを踏まえて、以下のセットに対して同じ粒度で複数のトピック(少なくとも3つ)を提示してください。形式はJSONで、キーをtopicsとして値にトピックを配列で格納してください。また、ツイートに用いられている言語も推定し、キーをlanguageとしてiso 639-1に準拠した言語コードを格納してください。topicsとlanguageを格納するオブジェクトはnote idをキーとした値に格納してください +レスポンスの例 (1700958646329はnote id): ``` -{ - "topics": ["医療", "福祉", ...] -} +{response_sample} ``` """, }, { "role": "user", - "content": f"""ツイート: + "content": f""" +note id: {note_id} +ツイート: ``` {tweet} ``` @@ -56,19 +64,22 @@ def get_topic(client: OpenAI, tweet: str, note: str) -> Dict[str, List[str]]: if __name__ == "__main__": parser = ArgumentParser() - # parser.add_argument("input_file") + parser.add_argument("input_file") # parser.add_argument("output_file") args = parser.parse_args() load_dotenv() client = OpenAI() - # with open(args.input_file, "r", encoding="utf-8") as f: - # tweets = json.load(f) - print( - get_topic( - client, - """Tweet content goes here. -""", - """Community note goes here -""", + with open(args.input_file, "r", encoding="utf-8") as f: + notes = json.load(f) + # create output json file with topics. make the file an array of objects which is each response + with open("data/output.json", "w", encoding="utf-8") as f: + json.dump( + [ + get_topic(client, note["noteId"], note["tweetBody"], note["noteBody"]) + for note in notes + ], + f, + ensure_ascii=False, + indent=2, ) - ) + diff --git a/scripts/data_sample.json b/scripts/data_sample.json new file mode 100644 index 0000000..dd28d54 --- /dev/null +++ b/scripts/data_sample.json @@ -0,0 +1,44 @@ +[ + { + "noteId": 1727464245254574464, + "tweetId": 1724436362508558382, + "noteBody": "This app logs all of your messages in plain text to multiple services and stores data completely unencrypted. The authentication token was sent over unencrypted HTTP as well, making it vulnerable to interception. https://arstechnica.com/gadgets/2023/11/nothings-imessage-app-was-a-security-catastrophe-taken-down-in-24-hours/", + "tweetBody": "Today Nothing did something truly wild, and game me an exclusive look at it. So for this video I break down what it is (an app) what it does (iMessage for Android) and what Apple might do when they see this 🕵️ https://t.co/eGHsdkxwCT" + }, + { + "noteId": 1727940249454534919, + "tweetId": 1727796341374403056, + "noteBody": "I recall the PM being verbally attacked by the PC party in the house of commons using the fake news of the terrorist attack as spread by CNN's now very discredited news story. https://youtu.be/SrXX1M_UmDs?si=jz7FZBuhNyyrLkHa", + "tweetBody": "When biased “journalists” try doing the Liberals’ dirty work, they should at least get their facts straight.\u00a0\n\nIf they can’t do that, they could at least bring some apples with them. https://t.co/zkxqbkgO1O" + }, + { + "noteId": 1727405862547833120, + "tweetId": 1727392893470691400, + "noteBody": "Western NY airports are not locked down. Buffalo and Niagara Falls remain open. https://twitter.com/BUFAirport/status/1727398373379170386", + "tweetBody": "BREAKING: A CAR BOMB WAS DRIVEN INTO A CHECKPOINT THE US CANADA BORDER. ALL WESTERN NY AIRPORTS ON LOCKDOWN https://t.co/vXdWBUCOHg" + }, + { + "noteId": 1727826231926878548, + "tweetId": 1727744194780008463, + "noteBody": "Over an hour before Poillevre spoke in Parliament, CTV posted this article with the sentence 'Sources did tell CTV News earlier in the day that Canadian government officials were initially operating under the assumption that it was terror-related.' https://toronto.ctvnews.ca/two-people-dead-in-rainbow-bridge-vehicle", + "tweetBody": "So Pierre Poilievre, in a truly scummy moment, tried to politicize the tragedy at the Rainbow Bridge. A real leader wouldn’t use a terror attack that killed two people for political points. Especially when it wasn’t even a terror attack. https://t.co/qFDkRgB9xY" + }, + { + "noteId": 1727833882559672335, + "tweetId": 1727755158959100132, + "noteBody": "The first CTV story about this was posted at 10:33 AM. They updated it throughout the day as new information came in. The final version of the story at 5:50 PM still did not mention terrorism. https://toronto.ctvnews.ca/two-people-dead-in-rainbow-bridge-vehicle", + "tweetBody": "This screenshot has been sent to me a couple times today. It’s being spread by Pierre Poilievre supporters to try and claim that CTV News initially reported the Rainbow Bridge incident as a terrorist attack. This is not true. https://t.co/YzkkwGfFWY" + }, + { + "noteId": 1727736301062095269, + "tweetId": 1727586091136618859, + "noteBody": "元ツイートは個人的な意見を表明しており誤解を招いていないため、コミュニティノートは必要ありません 異論や細かいレトリックに関する指摘はリプライで行ってください", + "tweetBody": "一度車椅子や松葉杖の生活をしてみたらわかると思うんだけど、一日に500回くらい「すいません」「ありがとうございます」と言わないと生活していけないのは心を病むので仏頂面のまま譲られて当然みたいな顔で座っていて結構です https://t.co/DsS3RMMG2C" + }, + { + "noteId": 1728277125407314151, + "tweetId": 1728212143797833773, + "noteBody": "The BBC article mentions the suspect is in his forties, but has only lived in Ireland for twenty years. Despite his Irish citizenship this age gap strongly suggests he is an immigrant. https://www.bbc.co.uk/news/world-europe-67516612", + "tweetBody": "Online rumors claimed the perpetrator of a stabbing attack was an immigrant. The BBC found that the man was an Irish citizen who had lived in the country for 20 years. Police blamed a 'lunatic faction driven by a far-right ideology' for the riot in Dublin. https://t.co/cMrCCOKNGQ" + } +] diff --git a/scripts/fewshot_sample.json b/scripts/fewshot_sample.json index 25991c8..1e8e4bf 100644 --- a/scripts/fewshot_sample.json +++ b/scripts/fewshot_sample.json @@ -1,7 +1,7 @@ { - "tweet": "test twwet", - "note": "test note", + "tweet": "For those that care — 432 hz improves mental clarity, removes emotional blockages, reduces stress and anxiety, better sleep quality, increases creativity & inspiration, and strengthens the immune system. Play it while you sleep & watch these areas improve!", + "note": "There are no placebo controlled studies which support this. There is no evidence that this frequency has different effects from any other arbitrary frequency. https://ask.audio/articles/music-theory-432-hz-tuning-separating-fact-from-fiction", "topics": [ - "test topic" + "医療", "福祉" ] } \ No newline at end of file