From d542a763a335dbb337753ee727597de0d73cf42e Mon Sep 17 00:00:00 2001 From: yu23ki14 Date: Wed, 2 Oct 2024 10:38:23 +0900 Subject: [PATCH] add link csv --- etl/.env.example | 14 +- etl/src/birdxplorer_etl/extract.py | 35 +- etl/src/birdxplorer_etl/settings.py | 2 + etl/src/birdxplorer_etl/test.json | 456 --------------------------- etl/src/birdxplorer_etl/transform.py | 65 +++- 5 files changed, 103 insertions(+), 469 deletions(-) delete mode 100644 etl/src/birdxplorer_etl/test.json diff --git a/etl/.env.example b/etl/.env.example index 112d707..c8a0054 100644 --- a/etl/.env.example +++ b/etl/.env.example @@ -1,6 +1,14 @@ X_BEARER_TOKEN= -AI_MODEL= + +COMMUNITY_NOTE_DAYS_AGO=3 + +TARGET_TWITTER_POST_START_UNIX_MILLISECOND=1719851000000 +TARGET_TWITTER_POST_END_UNIX_MILLISECOND=1719891000000 + +AI_MODEL=openai OPENAPI_TOKEN= CLAUDE_TOKEN= -TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND=1720900800000 -TARGET_NOTE_ESTIMATE_TOPIC_END_UNIX_MILLISECOND=1722110400000 \ No newline at end of file +TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND=1719851000000 +TARGET_NOTE_ESTIMATE_TOPIC_END_UNIX_MILLISECOND=1719891000000 + +USE_DUMMY_DATA=False \ No newline at end of file diff --git a/etl/src/birdxplorer_etl/extract.py b/etl/src/birdxplorer_etl/extract.py index d47c9be..b29c5de 100644 --- a/etl/src/birdxplorer_etl/extract.py +++ b/etl/src/birdxplorer_etl/extract.py @@ -5,7 +5,13 @@ from prefect import get_run_logger from sqlalchemy.orm import Session from lib.x.postlookup import lookup -from birdxplorer_common.storage import RowNoteRecord, RowPostRecord, RowUserRecord, RowNoteStatusRecord +from birdxplorer_common.storage import ( + RowNoteRecord, + RowPostRecord, + RowUserRecord, + RowNoteStatusRecord, + RowPostEmbedURLRecord, +) import settings @@ -30,10 +36,12 @@ def extract_data(db: Session): break dateString = date.strftime("%Y/%m/%d") - # note_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/notes/notes-00000.tsv" - note_url = ( - "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/notes_sample.tsv" - ) + note_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/notes/notes-00000.tsv" + if settings.USE_DUMMY_DATA: + note_url = ( + "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/notes_sample.tsv" + ) + logger.info(note_url) res = requests.get(note_url) @@ -53,8 +61,10 @@ def extract_data(db: Session): rows_to_add = [] db.bulk_save_objects(rows_to_add) - # status_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/noteStatusHistory/noteStatusHistory-00000.tsv" - status_url = "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/noteStatus_sample.tsv" + status_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/noteStatusHistory/noteStatusHistory-00000.tsv" + if settings.USE_DUMMY_DATA: + status_url = "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/noteStatus_sample.tsv" + logger.info(status_url) res = requests.get(status_url) @@ -155,6 +165,17 @@ def extract_data(db: Session): lang=post["data"]["lang"], ) db.add(db_post) + + if "entities" in post["data"] and "urls" in post["data"]["entities"]: + for url in post["data"]["entities"]["urls"]: + if "unwound_url" in url: + post_url = RowPostEmbedURLRecord( + post_id=post["data"]["id"], + url=url["url"] if url["url"] else None, + expanded_url=url["expanded_url"] if url["expanded_url"] else None, + unwound_url=url["unwound_url"] if url["unwound_url"] else None, + ) + db.add(post_url) note.row_post_id = tweet_id db.commit() continue diff --git a/etl/src/birdxplorer_etl/settings.py b/etl/src/birdxplorer_etl/settings.py index d6e9f2e..c775af3 100644 --- a/etl/src/birdxplorer_etl/settings.py +++ b/etl/src/birdxplorer_etl/settings.py @@ -17,3 +17,5 @@ CLAUDE_TOKEN = os.getenv("CLAUDE_TOKEN") TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND = os.getenv("TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND") TARGET_NOTE_ESTIMATE_TOPIC_END_UNIX_MILLISECOND = os.getenv("TARGET_NOTE_ESTIMATE_TOPIC_END_UNIX_MILLISECOND") + +USE_DUMMY_DATA = os.getenv("USE_DUMMY_DATA", "False") == "True" diff --git a/etl/src/birdxplorer_etl/test.json b/etl/src/birdxplorer_etl/test.json deleted file mode 100644 index 395615b..0000000 --- a/etl/src/birdxplorer_etl/test.json +++ /dev/null @@ -1,456 +0,0 @@ -{ - "data": { - "possibly_sensitive": false, - "author_id": "871657411525963776", - "edit_history_tweet_ids": ["1811769819198267436"], - "lang": "en", - "created_at": "2024-07-12T14:29:20.000Z", - "entities": { - "annotations": [ - { - "start": 150, - "end": 164, - "probability": 0.6361, - "type": "Other", - "normalized_text": "Hyundai Ioniq 5" - }, - { - "start": 223, - "end": 229, - "probability": 0.7171, - "type": "Other", - "normalized_text": "VW Polo" - } - ], - "hashtags": [{ "start": 232, "end": 241, "tag": "mobesity" }], - "urls": [ - { - "start": 245, - "end": 268, - "url": "https://t.co/KiifOc4ZGS", - "expanded_url": "https://roaddamagecalculator.com/", - "display_url": "roaddamagecalculator.com", - "status": 200, - "title": "Road Damage", - "description": "Ever wondered how much more damage one vehicle does to the road than", - "unwound_url": "https://roaddamagecalculator.com/" - }, - { - "start": 269, - "end": 292, - "url": "https://t.co/uV0j73ujer", - "expanded_url": "https://twitter.com/_chris_brand_/status/1811769819198267436/photo/1", - "display_url": "pic.x.com/uv0j73ujer", - "media_key": "3_1811768379000107008" - } - ] - }, - "public_metrics": { - "retweet_count": 426, - "reply_count": 200, - "like_count": 1499, - "quote_count": 65, - "bookmark_count": 240, - "impression_count": 163126 - }, - "attachments": { "media_keys": ["3_1811768379000107008"] }, - "text": "The change in damage to the road surface is proportional to the difference in axle weight to the fourth power\n\nSo, a 2.4-tonne electric SUV such as a Hyundai Ioniq 5 would therefore do 16 times more damage than a 1.2-tonne VW Polo\n\n#mobesity\n\nšŸ‘‰ https://t.co/KiifOc4ZGS https://t.co/uV0j73ujer", - "id": "1811769819198267436", - "edit_controls": { - "edits_remaining": 5, - "is_edit_eligible": false, - "editable_until": "2024-07-12T15:29:20.000Z" - }, - "context_annotations": [ - { - "domain": { - "id": "30", - "name": "Entities [Entity Service]", - "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain" - }, - "entity": { - "id": "781972125179518977", - "name": "Auto Manufacturer - Auto" - } - }, - { - "domain": { - "id": "46", - "name": "Business Taxonomy", - "description": "Categories within Brand Verticals that narrow down the scope of Brands" - }, - "entity": { - "id": "1557696420500541440", - "name": "Automotive, Aircraft & Boat Business", - "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to automobile, spacecraft, aircraft and boat manufacturing" - } - }, - { - "domain": { - "id": "47", - "name": "Brand", - "description": "Brands and Companies" - }, - "entity": { "id": "10026295039", "name": "Hyundai" } - }, - { - "domain": { - "id": "47", - "name": "Brand", - "description": "Brands and Companies" - }, - "entity": { "id": "10026353537", "name": "Volkswagen" } - }, - { - "domain": { - "id": "131", - "name": "Unified Twitter Taxonomy", - "description": "A taxonomy of user interests. " - }, - "entity": { "id": "10026295039", "name": "Hyundai" } - }, - { - "domain": { - "id": "131", - "name": "Unified Twitter Taxonomy", - "description": "A taxonomy of user interests. " - }, - "entity": { "id": "10026353537", "name": "Volkswagen" } - }, - { - "domain": { - "id": "131", - "name": "Unified Twitter Taxonomy", - "description": "A taxonomy of user interests. " - }, - "entity": { "id": "1196845866138533888", "name": "Automobile Brands" } - }, - { - "domain": { - "id": "30", - "name": "Entities [Entity Service]", - "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain" - }, - "entity": { - "id": "781972125179518977", - "name": "Auto Manufacturer - Auto" - } - }, - { - "domain": { - "id": "47", - "name": "Brand", - "description": "Brands and Companies" - }, - "entity": { "id": "10026295039", "name": "Hyundai" } - }, - { - "domain": { - "id": "48", - "name": "Product", - "description": "Products created by Brands. Examples: Ford Explorer, Apple iPhone." - }, - "entity": { "id": "10044387828", "name": "Hyundai - Ioniq" } - }, - { - "domain": { - "id": "65", - "name": "Interests and Hobbies Vertical", - "description": "Top level interests and hobbies groupings, like Food or Travel" - }, - "entity": { - "id": "847528391163092993", - "name": "Automotive", - "description": "Car culture" - } - }, - { - "domain": { - "id": "66", - "name": "Interests and Hobbies Category", - "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations" - }, - "entity": { - "id": "847528576551337984", - "name": "Hybrid and electric vehicles", - "description": "Hybrid and electric vehicles" - } - }, - { - "domain": { - "id": "66", - "name": "Interests and Hobbies Category", - "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations" - }, - "entity": { - "id": "847529204530921472", - "name": "SUVs", - "description": "SUVs" - } - } - ], - "conversation_id": "1811769819198267436", - "reply_settings": "everyone" - }, - "includes": { - "media": [ - { - "height": 495, - "width": 607, - "url": "https://pbs.twimg.com/media/GSSxvdWWMAAuVk_.png", - "media_key": "3_1811768379000107008", - "type": "photo" - } - ], - "users": [ - { - "id": "871657411525963776", - "verified_type": "none", - "url": "https://t.co/sBthi7IAvB", - "description": "Scientist. Prof at Oxford University https://t.co/0JetFU9aYd @TSUOxford and @ecioxford. Sustainable and healthy transport. Active travel. Transport and climate policy.", - "most_recent_tweet_id": "1829881319222043134", - "name": "Prof. Christian Brand", - "public_metrics": { - "followers_count": 2379, - "following_count": 917, - "tweet_count": 4733, - "listed_count": 37, - "like_count": 7760 - }, - "entities": { - "url": { - "urls": [ - { - "start": 0, - "end": 23, - "url": "https://t.co/sBthi7IAvB", - "expanded_url": "http://www.tsu.ox.ac.uk/people/cbrand.html", - "display_url": "tsu.ox.ac.uk/people/cbrand.ā€¦" - } - ] - }, - "description": { - "urls": [ - { - "start": 37, - "end": 60, - "url": "https://t.co/0JetFU9aYd", - "expanded_url": "http://ox.ac.uk", - "display_url": "ox.ac.uk" - } - ], - "mentions": [ - { "start": 61, "end": 71, "username": "TSUOxford" }, - { "start": 76, "end": 86, "username": "ecioxford" } - ] - } - }, - "profile_image_url": "https://pbs.twimg.com/profile_images/1707022613413818368/YLvCT_0r_normal.jpg", - "protected": false, - "username": "_chris_brand_", - "pinned_tweet_id": "1585556640933232640", - "verified": false, - "location": "Oxford, UK", - "created_at": "2017-06-05T09:18:16.000Z" - } - ], - "tweets": [ - { - "possibly_sensitive": false, - "author_id": "871657411525963776", - "edit_history_tweet_ids": ["1811769819198267436"], - "lang": "en", - "created_at": "2024-07-12T14:29:20.000Z", - "entities": { - "annotations": [ - { - "start": 150, - "end": 164, - "probability": 0.6361, - "type": "Other", - "normalized_text": "Hyundai Ioniq 5" - }, - { - "start": 223, - "end": 229, - "probability": 0.7171, - "type": "Other", - "normalized_text": "VW Polo" - } - ], - "hashtags": [{ "start": 232, "end": 241, "tag": "mobesity" }], - "urls": [ - { - "start": 245, - "end": 268, - "url": "https://t.co/KiifOc4ZGS", - "expanded_url": "https://roaddamagecalculator.com/", - "display_url": "roaddamagecalculator.com", - "status": 200, - "title": "Road Damage", - "description": "Ever wondered how much more damage one vehicle does to the road than", - "unwound_url": "https://roaddamagecalculator.com/" - }, - { - "start": 269, - "end": 292, - "url": "https://t.co/uV0j73ujer", - "expanded_url": "https://twitter.com/_chris_brand_/status/1811769819198267436/photo/1", - "display_url": "pic.x.com/uv0j73ujer", - "media_key": "3_1811768379000107008" - } - ] - }, - "public_metrics": { - "retweet_count": 426, - "reply_count": 200, - "like_count": 1499, - "quote_count": 65, - "bookmark_count": 240, - "impression_count": 163126 - }, - "attachments": { "media_keys": ["3_1811768379000107008"] }, - "text": "The change in damage to the road surface is proportional to the difference in axle weight to the fourth power\n\nSo, a 2.4-tonne electric SUV such as a Hyundai Ioniq 5 would therefore do 16 times more damage than a 1.2-tonne VW Polo\n\n#mobesity\n\nšŸ‘‰ https://t.co/KiifOc4ZGS https://t.co/uV0j73ujer", - "id": "1811769819198267436", - "edit_controls": { - "edits_remaining": 5, - "is_edit_eligible": false, - "editable_until": "2024-07-12T15:29:20.000Z" - }, - "context_annotations": [ - { - "domain": { - "id": "30", - "name": "Entities [Entity Service]", - "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain" - }, - "entity": { - "id": "781972125179518977", - "name": "Auto Manufacturer - Auto" - } - }, - { - "domain": { - "id": "46", - "name": "Business Taxonomy", - "description": "Categories within Brand Verticals that narrow down the scope of Brands" - }, - "entity": { - "id": "1557696420500541440", - "name": "Automotive, Aircraft & Boat Business", - "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to automobile, spacecraft, aircraft and boat manufacturing" - } - }, - { - "domain": { - "id": "47", - "name": "Brand", - "description": "Brands and Companies" - }, - "entity": { "id": "10026295039", "name": "Hyundai" } - }, - { - "domain": { - "id": "47", - "name": "Brand", - "description": "Brands and Companies" - }, - "entity": { "id": "10026353537", "name": "Volkswagen" } - }, - { - "domain": { - "id": "131", - "name": "Unified Twitter Taxonomy", - "description": "A taxonomy of user interests. " - }, - "entity": { "id": "10026295039", "name": "Hyundai" } - }, - { - "domain": { - "id": "131", - "name": "Unified Twitter Taxonomy", - "description": "A taxonomy of user interests. " - }, - "entity": { "id": "10026353537", "name": "Volkswagen" } - }, - { - "domain": { - "id": "131", - "name": "Unified Twitter Taxonomy", - "description": "A taxonomy of user interests. " - }, - "entity": { - "id": "1196845866138533888", - "name": "Automobile Brands" - } - }, - { - "domain": { - "id": "30", - "name": "Entities [Entity Service]", - "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain" - }, - "entity": { - "id": "781972125179518977", - "name": "Auto Manufacturer - Auto" - } - }, - { - "domain": { - "id": "47", - "name": "Brand", - "description": "Brands and Companies" - }, - "entity": { "id": "10026295039", "name": "Hyundai" } - }, - { - "domain": { - "id": "48", - "name": "Product", - "description": "Products created by Brands. Examples: Ford Explorer, Apple iPhone." - }, - "entity": { "id": "10044387828", "name": "Hyundai - Ioniq" } - }, - { - "domain": { - "id": "65", - "name": "Interests and Hobbies Vertical", - "description": "Top level interests and hobbies groupings, like Food or Travel" - }, - "entity": { - "id": "847528391163092993", - "name": "Automotive", - "description": "Car culture" - } - }, - { - "domain": { - "id": "66", - "name": "Interests and Hobbies Category", - "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations" - }, - "entity": { - "id": "847528576551337984", - "name": "Hybrid and electric vehicles", - "description": "Hybrid and electric vehicles" - } - }, - { - "domain": { - "id": "66", - "name": "Interests and Hobbies Category", - "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations" - }, - "entity": { - "id": "847529204530921472", - "name": "SUVs", - "description": "SUVs" - } - } - ], - "conversation_id": "1811769819198267436", - "reply_settings": "everyone" - } - ] - } -} diff --git a/etl/src/birdxplorer_etl/transform.py b/etl/src/birdxplorer_etl/transform.py index 6de446b..bd977f9 100644 --- a/etl/src/birdxplorer_etl/transform.py +++ b/etl/src/birdxplorer_etl/transform.py @@ -1,6 +1,12 @@ from sqlalchemy import select, func, and_, Integer from sqlalchemy.orm import Session -from birdxplorer_common.storage import RowNoteRecord, RowPostRecord, RowUserRecord, RowNoteStatusRecord +from birdxplorer_common.storage import ( + RowNoteRecord, + RowPostRecord, + RowUserRecord, + RowNoteStatusRecord, + RowPostEmbedURLRecord, +) from birdxplorer_etl.lib.ai_model.ai_model_interface import get_ai_service from birdxplorer_etl.settings import ( TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND, @@ -9,6 +15,8 @@ import csv import os from prefect import get_run_logger +import uuid +import random def transform_data(db: Session): @@ -138,6 +146,10 @@ def transform_data(db: Session): writer.writerow(user) offset += limit + # Transform row post embed link + generate_post_link(db) + + # Transform row post embed url data and generate post_embed_url.csv csv_seed_file_path = "./seed/topic_seed.csv" output_csv_file_path = "./data/transformed/topic.csv" records = [] @@ -161,16 +173,62 @@ def transform_data(db: Session): for record in records: writer.writerow({"topic_id": record["topic_id"], "label": {k: v for k, v in record["label"].items()}}) - generate_note_topic() + generate_note_topic(db) return +def generate_post_link(db: Session): + link_csv_file_path = "./data/transformed/post_link.csv" + association_csv_file_path = "./data/transformed/post_link_association.csv" + + if os.path.exists(link_csv_file_path): + os.remove(link_csv_file_path) + with open(link_csv_file_path, "a", newline="", encoding="utf-8") as file: + fieldnames = ["link_id", "url"] + writer = csv.DictWriter(file, fieldnames=fieldnames) + writer.writeheader() + + if os.path.exists(association_csv_file_path): + os.remove(association_csv_file_path) + with open(association_csv_file_path, "a", newline="", encoding="utf-8") as file: + fieldnames = ["post_id", "link_id"] + writer = csv.DictWriter(file, fieldnames=fieldnames) + writer.writeheader() + + offset = 0 + limit = 1000 + num_of_links = db.query(func.count(RowPostEmbedURLRecord.post_id)).scalar() + + records = [] + while offset < num_of_links: + links = db.query(RowPostEmbedURLRecord).limit(limit).offset(offset) + + for link in links: + random.seed(link.unwound_url) + link_id = uuid.UUID(int=random.getrandbits(128)) + is_link_exist = next((record for record in records if record["link_id"] == link_id), None) + if is_link_exist is None: + with open(link_csv_file_path, "a", newline="", encoding="utf-8") as file: + fieldnames = ["link_id", "unwound_url"] + writer = csv.DictWriter(file, fieldnames=fieldnames) + writer.writerow({"link_id": link_id, "unwound_url": link.unwound_url}) + record = {"post_id": link.post_id, "link_id": link_id, "unwound_url": link.unwound_url} + records.append(record) + with open(association_csv_file_path, "a", newline="", encoding="utf-8") as file: + fieldnames = ["post_id", "link_id"] + writer = csv.DictWriter(file, fieldnames=fieldnames) + writer.writerow({"post_id": link.post_id, "link_id": link_id}) + offset += limit + + def generate_note_topic(db: Session): - note_csv_file_path = "./data/transformed/note.csv" output_csv_file_path = "./data/transformed/note_topic_association.csv" ai_service = get_ai_service() + if os.path.exists(output_csv_file_path): + os.remove(output_csv_file_path) + records = [] with open(output_csv_file_path, "w", newline="", encoding="utf-8", buffering=1) as file: fieldnames = ["note_id", "topic_id"] @@ -214,6 +272,7 @@ def generate_note_topic(db: Session): ) records = [] print(index) + offset += limit for record in records: writer.writerow(