From b493e3f507b0018894061c120a55a78e5a3a0d5d Mon Sep 17 00:00:00 2001 From: yu23ki14 Date: Tue, 1 Oct 2024 11:25:00 +0900 Subject: [PATCH 1/4] tmp --- api/birdxplorer_api/routers/data.py | 2 + api/tests/conftest.py | 8 + common/birdxplorer_common/models.py | 1 + common/birdxplorer_common/storage.py | 42 ++ common/tests/conftest.py | 6 + common/tests/test_storage.py | 12 + etl/src/birdxplorer_etl/extract.py | 55 ++- etl/src/birdxplorer_etl/lib/sqlite/init.py | 14 +- etl/src/birdxplorer_etl/test.json | 456 +++++++++++++++++++++ 9 files changed, 589 insertions(+), 7 deletions(-) create mode 100644 etl/src/birdxplorer_etl/test.json diff --git a/api/birdxplorer_api/routers/data.py b/api/birdxplorer_api/routers/data.py index 75ed648..36be229 100644 --- a/api/birdxplorer_api/routers/data.py +++ b/api/birdxplorer_api/routers/data.py @@ -76,6 +76,7 @@ def get_notes( created_at_to: Union[None, TwitterTimestamp] = Query(default=None), topic_ids: Union[List[TopicId], None] = Query(default=None), post_ids: Union[List[PostId], None] = Query(default=None), + current_status: Union[None, List[str]] = Query(default=None), language: Union[LanguageIdentifier, None] = Query(default=None), ) -> NoteListResponse: return NoteListResponse( @@ -86,6 +87,7 @@ def get_notes( created_at_to=created_at_to, topic_ids=topic_ids, post_ids=post_ids, + current_status=current_status, language=language, ) ) diff --git a/api/tests/conftest.py b/api/tests/conftest.py index 97a4deb..3acfbc3 100644 --- a/api/tests/conftest.py +++ b/api/tests/conftest.py @@ -93,6 +93,7 @@ def note_samples(note_factory: NoteFactory, topic_samples: List[Topic]) -> Gener topics=[topic_samples[0]], language="ja", summary="要約文1", + current_status="NEEDS_MORE_RATINGS", created_at=1152921600000, ), note_factory.build( @@ -101,6 +102,7 @@ def note_samples(note_factory: NoteFactory, topic_samples: List[Topic]) -> Gener topics=[], language="en", summary="summary2", + current_status="NEEDS_MORE_RATINGS", created_at=1152921601000, ), note_factory.build( @@ -109,6 +111,7 @@ def note_samples(note_factory: NoteFactory, topic_samples: List[Topic]) -> Gener topics=[topic_samples[1]], language="en", summary="summary3", + current_status="", created_at=1152921602000, ), note_factory.build( @@ -117,6 +120,7 @@ def note_samples(note_factory: NoteFactory, topic_samples: List[Topic]) -> Gener topics=[topic_samples[0], topic_samples[1], topic_samples[2]], language="en", summary="summary4", + current_status="CURRENTLY_RATED_HELPFUL", created_at=1152921603000, ), note_factory.build( @@ -125,6 +129,7 @@ def note_samples(note_factory: NoteFactory, topic_samples: List[Topic]) -> Gener topics=[topic_samples[0]], language="en", summary="summary5", + current_status="CURRENTLY_RATED_HELPFUL", created_at=1152921604000, ), ] @@ -234,6 +239,7 @@ def _get_notes( created_at_to: Union[None, TwitterTimestamp] = None, topic_ids: Union[List[TopicId], None] = None, post_ids: Union[List[PostId], None] = None, + current_status: Union[None, List[str]] = None, language: Union[LanguageIdentifier, None] = None, ) -> Generator[Note, None, None]: for note in note_samples: @@ -247,6 +253,8 @@ def _get_notes( continue if post_ids is not None and note.post_id not in post_ids: continue + if current_status is not None and note.current_status not in current_status: + continue if language is not None and note.language != language: continue yield note diff --git a/common/birdxplorer_common/models.py b/common/birdxplorer_common/models.py index 72052dd..d066f70 100644 --- a/common/birdxplorer_common/models.py +++ b/common/birdxplorer_common/models.py @@ -657,6 +657,7 @@ class Note(BaseModel): language: LanguageIdentifier topics: List[Topic] summary: SummaryString + current_status: str | None created_at: TwitterTimestamp diff --git a/common/birdxplorer_common/storage.py b/common/birdxplorer_common/storage.py index e318459..d9612e1 100644 --- a/common/birdxplorer_common/storage.py +++ b/common/birdxplorer_common/storage.py @@ -68,6 +68,7 @@ class NoteRecord(Base): topics: Mapped[List[NoteTopicAssociation]] = relationship() language: Mapped[LanguageIdentifier] = mapped_column(nullable=False) summary: Mapped[SummaryString] = mapped_column(nullable=False) + current_status: Mapped[String] = mapped_column(nullable=True) created_at: Mapped[TwitterTimestamp] = mapped_column(nullable=False) @@ -132,6 +133,34 @@ class RowNoteRecord(Base): row_post: Mapped["RowPostRecord"] = relationship("RowPostRecord", back_populates="row_notes") +class RowNoteStatusRecord(Base): + __tablename__ = "row_note_status" + + note_id: Mapped[NoteId] = mapped_column(ForeignKey("row_notes.note_id"), primary_key=True) + note_author_participant_id: Mapped[ParticipantId] = mapped_column(nullable=False) + created_at_millis: Mapped[TwitterTimestamp] = mapped_column(nullable=False) + timestamp_millis_of_first_non_n_m_r_status: Mapped[TwitterTimestamp] = mapped_column() + first_non_n_m_r_status: Mapped[String] = mapped_column() + timestamp_millis_of_current_status: Mapped[TwitterTimestamp] = mapped_column() + current_status: Mapped[String] = mapped_column() + timestamp_millis_of_latest_non_n_m_r_status: Mapped[TwitterTimestamp] = mapped_column() + most_recent_non_n_m_r_status: Mapped[String] = mapped_column() + timestamp_millis_of_status_lock: Mapped[TwitterTimestamp] = mapped_column() + locked_status: Mapped[String] = mapped_column() + timestamp_millis_of_retro_lock: Mapped[TwitterTimestamp] = mapped_column() + current_core_status: Mapped[String] = mapped_column() + current_expansion_status: Mapped[String] = mapped_column() + current_group_status: Mapped[String] = mapped_column() + current_decided_by: Mapped[String] = mapped_column() + current_modeling_group: Mapped[int] = mapped_column() + timestamp_millis_of_most_recent_status_change: Mapped[TwitterTimestamp] = mapped_column() + timestamp_millis_of_nmr_due_to_min_stable_crh_time: Mapped[TwitterTimestamp] = mapped_column() + current_multi_group_status: Mapped[String] = mapped_column() + current_modeling_multi_group: Mapped[int] = mapped_column() + timestamp_minute_of_final_scoring_output: Mapped[TwitterTimestamp] = mapped_column() + timestamp_millis_of_first_nmr_due_to_min_stable_crh_time: Mapped[TwitterTimestamp] = mapped_column() + + class RowPostRecord(Base): __tablename__ = "row_posts" @@ -152,6 +181,15 @@ class RowPostRecord(Base): user: Mapped["RowUserRecord"] = relationship("RowUserRecord", back_populates="row_post") +class RowPostEmbedURLRecord(Base): + __tablename__ = "row_post_embed_urls" + + post_id: Mapped[PostId] = mapped_column(ForeignKey("row_posts.post_id"), primary_key=True) + url: Mapped[String] = mapped_column(primary_key=True) + expanded_url: Mapped[String] = mapped_column(nullable=False) + unwound_url: Mapped[String] = mapped_column(nullable=False) + + class RowUserRecord(Base): __tablename__ = "row_users" @@ -224,6 +262,7 @@ def get_notes( created_at_to: Union[None, TwitterTimestamp] = None, topic_ids: Union[List[TopicId], None] = None, post_ids: Union[List[PostId], None] = None, + current_status: Union[None, List[str]] = None, language: Union[LanguageIdentifier, None] = None, ) -> Generator[NoteModel, None, None]: with Session(self.engine) as sess: @@ -248,6 +287,8 @@ def get_notes( query = query.filter(NoteRecord.post_id.in_(post_ids)) if language is not None: query = query.filter(NoteRecord.language == language) + if current_status is not None: + query = query.filter(NoteRecord.current_status.in_(current_status)) for note_record in query.all(): yield NoteModel( note_id=note_record.note_id, @@ -265,6 +306,7 @@ def get_notes( ], language=LanguageIdentifier.normalize(note_record.language), summary=note_record.summary, + current_status=note_record.current_status, created_at=note_record.created_at, ) diff --git a/common/tests/conftest.py b/common/tests/conftest.py index a8c048b..a58f37f 100644 --- a/common/tests/conftest.py +++ b/common/tests/conftest.py @@ -126,6 +126,7 @@ def note_samples(note_factory: NoteFactory, topic_samples: List[Topic]) -> Gener topics=[topic_samples[0]], language="ja", summary="要約文1", + current_status=None, created_at=1152921600000, ), note_factory.build( @@ -134,6 +135,7 @@ def note_samples(note_factory: NoteFactory, topic_samples: List[Topic]) -> Gener topics=[], language="en", summary="summary2", + current_status=None, created_at=1152921601000, ), note_factory.build( @@ -142,6 +144,7 @@ def note_samples(note_factory: NoteFactory, topic_samples: List[Topic]) -> Gener topics=[topic_samples[1]], language="en", summary="summary3", + current_status=None, created_at=1152921602000, ), note_factory.build( @@ -150,6 +153,7 @@ def note_samples(note_factory: NoteFactory, topic_samples: List[Topic]) -> Gener topics=[topic_samples[0], topic_samples[1], topic_samples[2]], language="en", summary="summary4", + current_status=None, created_at=1152921603000, ), note_factory.build( @@ -158,6 +162,7 @@ def note_samples(note_factory: NoteFactory, topic_samples: List[Topic]) -> Gener topics=[topic_samples[0]], language="en", summary="summary5", + current_status=None, created_at=1152921604000, ), note_factory.build( @@ -166,6 +171,7 @@ def note_samples(note_factory: NoteFactory, topic_samples: List[Topic]) -> Gener topics=[topic_samples[0]], language="en", summary="summary6_empty_post_id", + current_status=None, created_at=1152921604000, ), ] diff --git a/common/tests/test_storage.py b/common/tests/test_storage.py index cc3638d..3d81209 100644 --- a/common/tests/test_storage.py +++ b/common/tests/test_storage.py @@ -205,6 +205,18 @@ def test_get_notes_by_post_ids_empty( assert expected == actual +def test_get_notes_by_note_status( + engine_for_test: Engine, + note_samples: List[Note], + note_records_sample: List[NoteRecord], +) -> None: + storage = Storage(engine=engine_for_test) + current_status = ["NEEDS_MORE_RATINGS"] + expected = [note for note in note_samples if note.current_status in current_status] + actual = list(storage.get_notes(current_status=current_status)) + assert expected == actual + + def test_get_notes_by_language( engine_for_test: Engine, note_samples: List[Note], diff --git a/etl/src/birdxplorer_etl/extract.py b/etl/src/birdxplorer_etl/extract.py index 81abf65..cd4bd8b 100644 --- a/etl/src/birdxplorer_etl/extract.py +++ b/etl/src/birdxplorer_etl/extract.py @@ -5,7 +5,13 @@ from prefect import get_run_logger from sqlalchemy.orm import Session from lib.x.postlookup import lookup -from birdxplorer_common.storage import RowNoteRecord, RowPostRecord, RowUserRecord +from birdxplorer_common.storage import ( + RowNoteRecord, + RowPostRecord, + RowUserRecord, + RowPostEmbedURLRecord, + RowNoteStatusRecord, +) import settings @@ -28,9 +34,11 @@ def extract_data(db: Session): > datetime.timestamp(date) - 24 * 60 * 60 * settings.COMMUNITY_NOTE_DAYS_AGO ): break - url = f'https://ton.twimg.com/birdwatch-public-data/{date.strftime("%Y/%m/%d")}/notes/notes-00000.tsv' - logger.info(url) - res = requests.get(url) + + dateString = date.strftime("%Y/%m/%d") + note_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/notes/notes-00000.tsv" + logger.info(note_url) + res = requests.get(note_url) if res.status_code == 200: # res.contentをdbのNoteテーブル @@ -45,7 +53,26 @@ def extract_data(db: Session): rows_to_add.append(RowNoteRecord(**row)) db.bulk_save_objects(rows_to_add) - break + status_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/noteStatusHistory/noteStatusHistory-00000.tsv" + logger.info(status_url) + res = requests.get(status_url) + + if res.status_code == 200: + # res.contentをdbのNoteStatusテーブル + tsv_data = res.content.decode("utf-8").splitlines() + reader = csv.DictReader(tsv_data, delimiter="\t") + reader.fieldnames = [stringcase.snakecase(field) for field in reader.fieldnames] + + rows_to_add = [] + for row in reader: + status = db.query(RowNoteStatusRecord).filter(RowNoteStatusRecord.note_id == row["note_id"]).first() + if status is None or status.created_at_millis > int(datetime.now().time() * 1000): + db.query(RowNoteStatusRecord).filter(RowNoteStatusRecord.note_id == row["note_id"]).delete() + rows_to_add.append(RowNoteStatusRecord(**row)) + db.bulk_save_objects(rows_to_add) + + break + date = date - timedelta(days=1) db.commit() @@ -58,7 +85,9 @@ def extract_data(db: Session): .filter(RowNoteRecord.created_at_millis <= settings.TARGET_TWITTER_POST_END_UNIX_MILLISECOND) .all() ) - logger.info(len(postExtract_targetNotes)) + + logger.info(f"Num of Target Notes: {len(postExtract_targetNotes)}") + for note in postExtract_targetNotes: tweet_id = note.tweet_id @@ -71,6 +100,8 @@ def extract_data(db: Session): logger.info(tweet_id) post = lookup(tweet_id) + logger.info(post) + if post == None or "data" not in post: continue @@ -101,6 +132,18 @@ def extract_data(db: Session): ) db.add(db_user) + if "entities" in post["data"] and "urls" in post["data"]["entities"]: + for url in post["data"]["entities"]["urls"]: + if "unwound_url" not in url or url["status"] != 200: + continue + db_post_embed_url = RowPostEmbedURLRecord( + post_id=post["data"]["id"], + url=url["url"], + expanded_url=url["expanded_url"], + unwound_url=url["unwound_url"], + ) + db.add(db_post_embed_url) + media_data = ( post["includes"]["media"][0] if "includes" in post and "media" in post["includes"] and len(post["includes"]["media"]) > 0 diff --git a/etl/src/birdxplorer_etl/lib/sqlite/init.py b/etl/src/birdxplorer_etl/lib/sqlite/init.py index c167352..fc4ab66 100644 --- a/etl/src/birdxplorer_etl/lib/sqlite/init.py +++ b/etl/src/birdxplorer_etl/lib/sqlite/init.py @@ -5,7 +5,13 @@ from sqlalchemy import create_engine, inspect from sqlalchemy.orm import sessionmaker -from birdxplorer_common.storage import RowNoteRecord, RowPostRecord, RowUserRecord +from birdxplorer_common.storage import ( + RowNoteRecord, + RowPostRecord, + RowUserRecord, + RowPostEmbedURLRecord, + RowNoteStatusRecord, +) def init_db(): @@ -24,9 +30,15 @@ def init_db(): if not inspect(engine).has_table("row_posts"): logger.info("Creating table post") RowPostRecord.metadata.create_all(engine) + if not inspect(engine).has_table("row_note_status"): + logger.info("Creating table note_status") + RowNoteStatusRecord.metadata.create_all(engine) if not inspect(engine).has_table("row_users"): logger.info("Creating table user") RowUserRecord.metadata.create_all(engine) + if not inspect(engine).has_table("row_post_embed_urls"): + logger.info("Creating table post_embed_urls") + RowPostEmbedURLRecord.metadata.create_all(engine) Session = sessionmaker(bind=engine) diff --git a/etl/src/birdxplorer_etl/test.json b/etl/src/birdxplorer_etl/test.json new file mode 100644 index 0000000..395615b --- /dev/null +++ b/etl/src/birdxplorer_etl/test.json @@ -0,0 +1,456 @@ +{ + "data": { + "possibly_sensitive": false, + "author_id": "871657411525963776", + "edit_history_tweet_ids": ["1811769819198267436"], + "lang": "en", + "created_at": "2024-07-12T14:29:20.000Z", + "entities": { + "annotations": [ + { + "start": 150, + "end": 164, + "probability": 0.6361, + "type": "Other", + "normalized_text": "Hyundai Ioniq 5" + }, + { + "start": 223, + "end": 229, + "probability": 0.7171, + "type": "Other", + "normalized_text": "VW Polo" + } + ], + "hashtags": [{ "start": 232, "end": 241, "tag": "mobesity" }], + "urls": [ + { + "start": 245, + "end": 268, + "url": "https://t.co/KiifOc4ZGS", + "expanded_url": "https://roaddamagecalculator.com/", + "display_url": "roaddamagecalculator.com", + "status": 200, + "title": "Road Damage", + "description": "Ever wondered how much more damage one vehicle does to the road than", + "unwound_url": "https://roaddamagecalculator.com/" + }, + { + "start": 269, + "end": 292, + "url": "https://t.co/uV0j73ujer", + "expanded_url": "https://twitter.com/_chris_brand_/status/1811769819198267436/photo/1", + "display_url": "pic.x.com/uv0j73ujer", + "media_key": "3_1811768379000107008" + } + ] + }, + "public_metrics": { + "retweet_count": 426, + "reply_count": 200, + "like_count": 1499, + "quote_count": 65, + "bookmark_count": 240, + "impression_count": 163126 + }, + "attachments": { "media_keys": ["3_1811768379000107008"] }, + "text": "The change in damage to the road surface is proportional to the difference in axle weight to the fourth power\n\nSo, a 2.4-tonne electric SUV such as a Hyundai Ioniq 5 would therefore do 16 times more damage than a 1.2-tonne VW Polo\n\n#mobesity\n\n👉 https://t.co/KiifOc4ZGS https://t.co/uV0j73ujer", + "id": "1811769819198267436", + "edit_controls": { + "edits_remaining": 5, + "is_edit_eligible": false, + "editable_until": "2024-07-12T15:29:20.000Z" + }, + "context_annotations": [ + { + "domain": { + "id": "30", + "name": "Entities [Entity Service]", + "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain" + }, + "entity": { + "id": "781972125179518977", + "name": "Auto Manufacturer - Auto" + } + }, + { + "domain": { + "id": "46", + "name": "Business Taxonomy", + "description": "Categories within Brand Verticals that narrow down the scope of Brands" + }, + "entity": { + "id": "1557696420500541440", + "name": "Automotive, Aircraft & Boat Business", + "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to automobile, spacecraft, aircraft and boat manufacturing" + } + }, + { + "domain": { + "id": "47", + "name": "Brand", + "description": "Brands and Companies" + }, + "entity": { "id": "10026295039", "name": "Hyundai" } + }, + { + "domain": { + "id": "47", + "name": "Brand", + "description": "Brands and Companies" + }, + "entity": { "id": "10026353537", "name": "Volkswagen" } + }, + { + "domain": { + "id": "131", + "name": "Unified Twitter Taxonomy", + "description": "A taxonomy of user interests. " + }, + "entity": { "id": "10026295039", "name": "Hyundai" } + }, + { + "domain": { + "id": "131", + "name": "Unified Twitter Taxonomy", + "description": "A taxonomy of user interests. " + }, + "entity": { "id": "10026353537", "name": "Volkswagen" } + }, + { + "domain": { + "id": "131", + "name": "Unified Twitter Taxonomy", + "description": "A taxonomy of user interests. " + }, + "entity": { "id": "1196845866138533888", "name": "Automobile Brands" } + }, + { + "domain": { + "id": "30", + "name": "Entities [Entity Service]", + "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain" + }, + "entity": { + "id": "781972125179518977", + "name": "Auto Manufacturer - Auto" + } + }, + { + "domain": { + "id": "47", + "name": "Brand", + "description": "Brands and Companies" + }, + "entity": { "id": "10026295039", "name": "Hyundai" } + }, + { + "domain": { + "id": "48", + "name": "Product", + "description": "Products created by Brands. Examples: Ford Explorer, Apple iPhone." + }, + "entity": { "id": "10044387828", "name": "Hyundai - Ioniq" } + }, + { + "domain": { + "id": "65", + "name": "Interests and Hobbies Vertical", + "description": "Top level interests and hobbies groupings, like Food or Travel" + }, + "entity": { + "id": "847528391163092993", + "name": "Automotive", + "description": "Car culture" + } + }, + { + "domain": { + "id": "66", + "name": "Interests and Hobbies Category", + "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations" + }, + "entity": { + "id": "847528576551337984", + "name": "Hybrid and electric vehicles", + "description": "Hybrid and electric vehicles" + } + }, + { + "domain": { + "id": "66", + "name": "Interests and Hobbies Category", + "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations" + }, + "entity": { + "id": "847529204530921472", + "name": "SUVs", + "description": "SUVs" + } + } + ], + "conversation_id": "1811769819198267436", + "reply_settings": "everyone" + }, + "includes": { + "media": [ + { + "height": 495, + "width": 607, + "url": "https://pbs.twimg.com/media/GSSxvdWWMAAuVk_.png", + "media_key": "3_1811768379000107008", + "type": "photo" + } + ], + "users": [ + { + "id": "871657411525963776", + "verified_type": "none", + "url": "https://t.co/sBthi7IAvB", + "description": "Scientist. Prof at Oxford University https://t.co/0JetFU9aYd @TSUOxford and @ecioxford. Sustainable and healthy transport. Active travel. Transport and climate policy.", + "most_recent_tweet_id": "1829881319222043134", + "name": "Prof. Christian Brand", + "public_metrics": { + "followers_count": 2379, + "following_count": 917, + "tweet_count": 4733, + "listed_count": 37, + "like_count": 7760 + }, + "entities": { + "url": { + "urls": [ + { + "start": 0, + "end": 23, + "url": "https://t.co/sBthi7IAvB", + "expanded_url": "http://www.tsu.ox.ac.uk/people/cbrand.html", + "display_url": "tsu.ox.ac.uk/people/cbrand.…" + } + ] + }, + "description": { + "urls": [ + { + "start": 37, + "end": 60, + "url": "https://t.co/0JetFU9aYd", + "expanded_url": "http://ox.ac.uk", + "display_url": "ox.ac.uk" + } + ], + "mentions": [ + { "start": 61, "end": 71, "username": "TSUOxford" }, + { "start": 76, "end": 86, "username": "ecioxford" } + ] + } + }, + "profile_image_url": "https://pbs.twimg.com/profile_images/1707022613413818368/YLvCT_0r_normal.jpg", + "protected": false, + "username": "_chris_brand_", + "pinned_tweet_id": "1585556640933232640", + "verified": false, + "location": "Oxford, UK", + "created_at": "2017-06-05T09:18:16.000Z" + } + ], + "tweets": [ + { + "possibly_sensitive": false, + "author_id": "871657411525963776", + "edit_history_tweet_ids": ["1811769819198267436"], + "lang": "en", + "created_at": "2024-07-12T14:29:20.000Z", + "entities": { + "annotations": [ + { + "start": 150, + "end": 164, + "probability": 0.6361, + "type": "Other", + "normalized_text": "Hyundai Ioniq 5" + }, + { + "start": 223, + "end": 229, + "probability": 0.7171, + "type": "Other", + "normalized_text": "VW Polo" + } + ], + "hashtags": [{ "start": 232, "end": 241, "tag": "mobesity" }], + "urls": [ + { + "start": 245, + "end": 268, + "url": "https://t.co/KiifOc4ZGS", + "expanded_url": "https://roaddamagecalculator.com/", + "display_url": "roaddamagecalculator.com", + "status": 200, + "title": "Road Damage", + "description": "Ever wondered how much more damage one vehicle does to the road than", + "unwound_url": "https://roaddamagecalculator.com/" + }, + { + "start": 269, + "end": 292, + "url": "https://t.co/uV0j73ujer", + "expanded_url": "https://twitter.com/_chris_brand_/status/1811769819198267436/photo/1", + "display_url": "pic.x.com/uv0j73ujer", + "media_key": "3_1811768379000107008" + } + ] + }, + "public_metrics": { + "retweet_count": 426, + "reply_count": 200, + "like_count": 1499, + "quote_count": 65, + "bookmark_count": 240, + "impression_count": 163126 + }, + "attachments": { "media_keys": ["3_1811768379000107008"] }, + "text": "The change in damage to the road surface is proportional to the difference in axle weight to the fourth power\n\nSo, a 2.4-tonne electric SUV such as a Hyundai Ioniq 5 would therefore do 16 times more damage than a 1.2-tonne VW Polo\n\n#mobesity\n\n👉 https://t.co/KiifOc4ZGS https://t.co/uV0j73ujer", + "id": "1811769819198267436", + "edit_controls": { + "edits_remaining": 5, + "is_edit_eligible": false, + "editable_until": "2024-07-12T15:29:20.000Z" + }, + "context_annotations": [ + { + "domain": { + "id": "30", + "name": "Entities [Entity Service]", + "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain" + }, + "entity": { + "id": "781972125179518977", + "name": "Auto Manufacturer - Auto" + } + }, + { + "domain": { + "id": "46", + "name": "Business Taxonomy", + "description": "Categories within Brand Verticals that narrow down the scope of Brands" + }, + "entity": { + "id": "1557696420500541440", + "name": "Automotive, Aircraft & Boat Business", + "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to automobile, spacecraft, aircraft and boat manufacturing" + } + }, + { + "domain": { + "id": "47", + "name": "Brand", + "description": "Brands and Companies" + }, + "entity": { "id": "10026295039", "name": "Hyundai" } + }, + { + "domain": { + "id": "47", + "name": "Brand", + "description": "Brands and Companies" + }, + "entity": { "id": "10026353537", "name": "Volkswagen" } + }, + { + "domain": { + "id": "131", + "name": "Unified Twitter Taxonomy", + "description": "A taxonomy of user interests. " + }, + "entity": { "id": "10026295039", "name": "Hyundai" } + }, + { + "domain": { + "id": "131", + "name": "Unified Twitter Taxonomy", + "description": "A taxonomy of user interests. " + }, + "entity": { "id": "10026353537", "name": "Volkswagen" } + }, + { + "domain": { + "id": "131", + "name": "Unified Twitter Taxonomy", + "description": "A taxonomy of user interests. " + }, + "entity": { + "id": "1196845866138533888", + "name": "Automobile Brands" + } + }, + { + "domain": { + "id": "30", + "name": "Entities [Entity Service]", + "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain" + }, + "entity": { + "id": "781972125179518977", + "name": "Auto Manufacturer - Auto" + } + }, + { + "domain": { + "id": "47", + "name": "Brand", + "description": "Brands and Companies" + }, + "entity": { "id": "10026295039", "name": "Hyundai" } + }, + { + "domain": { + "id": "48", + "name": "Product", + "description": "Products created by Brands. Examples: Ford Explorer, Apple iPhone." + }, + "entity": { "id": "10044387828", "name": "Hyundai - Ioniq" } + }, + { + "domain": { + "id": "65", + "name": "Interests and Hobbies Vertical", + "description": "Top level interests and hobbies groupings, like Food or Travel" + }, + "entity": { + "id": "847528391163092993", + "name": "Automotive", + "description": "Car culture" + } + }, + { + "domain": { + "id": "66", + "name": "Interests and Hobbies Category", + "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations" + }, + "entity": { + "id": "847528576551337984", + "name": "Hybrid and electric vehicles", + "description": "Hybrid and electric vehicles" + } + }, + { + "domain": { + "id": "66", + "name": "Interests and Hobbies Category", + "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations" + }, + "entity": { + "id": "847529204530921472", + "name": "SUVs", + "description": "SUVs" + } + } + ], + "conversation_id": "1811769819198267436", + "reply_settings": "everyone" + } + ] + } +} From 24c646496e8610606a130fbf3562b055886b4a78 Mon Sep 17 00:00:00 2001 From: yu23ki14 Date: Wed, 2 Oct 2024 09:11:37 +0900 Subject: [PATCH 2/4] add status --- common/birdxplorer_common/storage.py | 40 +++++++++++----------- etl/src/birdxplorer_etl/extract.py | 50 +++++++++++----------------- etl/src/birdxplorer_etl/transform.py | 42 ++++++++++++++--------- 3 files changed, 67 insertions(+), 65 deletions(-) diff --git a/common/birdxplorer_common/storage.py b/common/birdxplorer_common/storage.py index d9612e1..d5acf94 100644 --- a/common/birdxplorer_common/storage.py +++ b/common/birdxplorer_common/storage.py @@ -139,26 +139,26 @@ class RowNoteStatusRecord(Base): note_id: Mapped[NoteId] = mapped_column(ForeignKey("row_notes.note_id"), primary_key=True) note_author_participant_id: Mapped[ParticipantId] = mapped_column(nullable=False) created_at_millis: Mapped[TwitterTimestamp] = mapped_column(nullable=False) - timestamp_millis_of_first_non_n_m_r_status: Mapped[TwitterTimestamp] = mapped_column() - first_non_n_m_r_status: Mapped[String] = mapped_column() - timestamp_millis_of_current_status: Mapped[TwitterTimestamp] = mapped_column() - current_status: Mapped[String] = mapped_column() - timestamp_millis_of_latest_non_n_m_r_status: Mapped[TwitterTimestamp] = mapped_column() - most_recent_non_n_m_r_status: Mapped[String] = mapped_column() - timestamp_millis_of_status_lock: Mapped[TwitterTimestamp] = mapped_column() - locked_status: Mapped[String] = mapped_column() - timestamp_millis_of_retro_lock: Mapped[TwitterTimestamp] = mapped_column() - current_core_status: Mapped[String] = mapped_column() - current_expansion_status: Mapped[String] = mapped_column() - current_group_status: Mapped[String] = mapped_column() - current_decided_by: Mapped[String] = mapped_column() - current_modeling_group: Mapped[int] = mapped_column() - timestamp_millis_of_most_recent_status_change: Mapped[TwitterTimestamp] = mapped_column() - timestamp_millis_of_nmr_due_to_min_stable_crh_time: Mapped[TwitterTimestamp] = mapped_column() - current_multi_group_status: Mapped[String] = mapped_column() - current_modeling_multi_group: Mapped[int] = mapped_column() - timestamp_minute_of_final_scoring_output: Mapped[TwitterTimestamp] = mapped_column() - timestamp_millis_of_first_nmr_due_to_min_stable_crh_time: Mapped[TwitterTimestamp] = mapped_column() + timestamp_millis_of_first_non_n_m_r_status: Mapped[TwitterTimestamp] = mapped_column(nullable=True) + first_non_n_m_r_status: Mapped[String] = mapped_column(nullable=True) + timestamp_millis_of_current_status: Mapped[TwitterTimestamp] = mapped_column(nullable=True) + current_status: Mapped[String] = mapped_column(nullable=True) + timestamp_millis_of_latest_non_n_m_r_status: Mapped[TwitterTimestamp] = mapped_column(nullable=True) + most_recent_non_n_m_r_status: Mapped[String] = mapped_column(nullable=True) + timestamp_millis_of_status_lock: Mapped[TwitterTimestamp] = mapped_column(nullable=True) + locked_status: Mapped[String] = mapped_column(nullable=True) + timestamp_millis_of_retro_lock: Mapped[TwitterTimestamp] = mapped_column(nullable=True) + current_core_status: Mapped[String] = mapped_column(nullable=True) + current_expansion_status: Mapped[String] = mapped_column(nullable=True) + current_group_status: Mapped[String] = mapped_column(nullable=True) + current_decided_by: Mapped[String] = mapped_column(nullable=True) + current_modeling_group: Mapped[int] = mapped_column(nullable=True) + timestamp_millis_of_most_recent_status_change: Mapped[TwitterTimestamp] = mapped_column(nullable=True) + timestamp_millis_of_nmr_due_to_min_stable_crh_time: Mapped[TwitterTimestamp] = mapped_column(nullable=True) + current_multi_group_status: Mapped[String] = mapped_column(nullable=True) + current_modeling_multi_group: Mapped[int] = mapped_column(nullable=True) + timestamp_minute_of_final_scoring_output: Mapped[TwitterTimestamp] = mapped_column(nullable=True) + timestamp_millis_of_first_nmr_due_to_min_stable_crh_time: Mapped[TwitterTimestamp] = mapped_column(nullable=True) class RowPostRecord(Base): diff --git a/etl/src/birdxplorer_etl/extract.py b/etl/src/birdxplorer_etl/extract.py index cd4bd8b..d47c9be 100644 --- a/etl/src/birdxplorer_etl/extract.py +++ b/etl/src/birdxplorer_etl/extract.py @@ -5,13 +5,7 @@ from prefect import get_run_logger from sqlalchemy.orm import Session from lib.x.postlookup import lookup -from birdxplorer_common.storage import ( - RowNoteRecord, - RowPostRecord, - RowUserRecord, - RowPostEmbedURLRecord, - RowNoteStatusRecord, -) +from birdxplorer_common.storage import RowNoteRecord, RowPostRecord, RowUserRecord, RowNoteStatusRecord import settings @@ -36,7 +30,10 @@ def extract_data(db: Session): break dateString = date.strftime("%Y/%m/%d") - note_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/notes/notes-00000.tsv" + # note_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/notes/notes-00000.tsv" + note_url = ( + "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/notes_sample.tsv" + ) logger.info(note_url) res = requests.get(note_url) @@ -47,28 +44,37 @@ def extract_data(db: Session): reader.fieldnames = [stringcase.snakecase(field) for field in reader.fieldnames] rows_to_add = [] - for row in reader: + for index, row in enumerate(reader): if db.query(RowNoteRecord).filter(RowNoteRecord.note_id == row["note_id"]).first(): continue rows_to_add.append(RowNoteRecord(**row)) + if index % 1000 == 0: + db.bulk_save_objects(rows_to_add) + rows_to_add = [] db.bulk_save_objects(rows_to_add) - status_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/noteStatusHistory/noteStatusHistory-00000.tsv" + # status_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/noteStatusHistory/noteStatusHistory-00000.tsv" + status_url = "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/noteStatus_sample.tsv" logger.info(status_url) res = requests.get(status_url) if res.status_code == 200: - # res.contentをdbのNoteStatusテーブル tsv_data = res.content.decode("utf-8").splitlines() reader = csv.DictReader(tsv_data, delimiter="\t") reader.fieldnames = [stringcase.snakecase(field) for field in reader.fieldnames] rows_to_add = [] - for row in reader: + for index, row in enumerate(reader): + for key, value in list(row.items()): + if value == "": + row[key] = None status = db.query(RowNoteStatusRecord).filter(RowNoteStatusRecord.note_id == row["note_id"]).first() - if status is None or status.created_at_millis > int(datetime.now().time() * 1000): + if status is None or status.created_at_millis > int(datetime.now().timestamp() * 1000): db.query(RowNoteStatusRecord).filter(RowNoteStatusRecord.note_id == row["note_id"]).delete() rows_to_add.append(RowNoteStatusRecord(**row)) + if index % 1000 == 0: + db.bulk_save_objects(rows_to_add) + rows_to_add = [] db.bulk_save_objects(rows_to_add) break @@ -85,9 +91,7 @@ def extract_data(db: Session): .filter(RowNoteRecord.created_at_millis <= settings.TARGET_TWITTER_POST_END_UNIX_MILLISECOND) .all() ) - - logger.info(f"Num of Target Notes: {len(postExtract_targetNotes)}") - + logger.info(len(postExtract_targetNotes)) for note in postExtract_targetNotes: tweet_id = note.tweet_id @@ -100,8 +104,6 @@ def extract_data(db: Session): logger.info(tweet_id) post = lookup(tweet_id) - logger.info(post) - if post == None or "data" not in post: continue @@ -132,18 +134,6 @@ def extract_data(db: Session): ) db.add(db_user) - if "entities" in post["data"] and "urls" in post["data"]["entities"]: - for url in post["data"]["entities"]["urls"]: - if "unwound_url" not in url or url["status"] != 200: - continue - db_post_embed_url = RowPostEmbedURLRecord( - post_id=post["data"]["id"], - url=url["url"], - expanded_url=url["expanded_url"], - unwound_url=url["unwound_url"], - ) - db.add(db_post_embed_url) - media_data = ( post["includes"]["media"][0] if "includes" in post and "media" in post["includes"] and len(post["includes"]["media"]) > 0 diff --git a/etl/src/birdxplorer_etl/transform.py b/etl/src/birdxplorer_etl/transform.py index acefbc2..6de446b 100644 --- a/etl/src/birdxplorer_etl/transform.py +++ b/etl/src/birdxplorer_etl/transform.py @@ -1,6 +1,6 @@ from sqlalchemy import select, func, and_, Integer from sqlalchemy.orm import Session -from birdxplorer_common.storage import RowNoteRecord, RowPostRecord, RowUserRecord +from birdxplorer_common.storage import RowNoteRecord, RowPostRecord, RowUserRecord, RowNoteStatusRecord from birdxplorer_etl.lib.ai_model.ai_model_interface import get_ai_service from birdxplorer_etl.settings import ( TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND, @@ -23,7 +23,7 @@ def transform_data(db: Session): os.remove("./data/transformed/note.csv") with open("./data/transformed/note.csv", "a") as file: writer = csv.writer(file) - writer.writerow(["note_id", "post_id", "summary", "created_at", "language"]) + writer.writerow(["note_id", "post_id", "summary", "current_status", "created_at", "language"]) offset = 0 limit = 1000 @@ -49,14 +49,10 @@ def transform_data(db: Session): RowNoteRecord.note_id, RowNoteRecord.row_post_id, RowNoteRecord.summary, + RowNoteStatusRecord.current_status, func.cast(RowNoteRecord.created_at_millis, Integer).label("created_at"), ) - .filter( - and_( - RowNoteRecord.created_at_millis <= TARGET_NOTE_ESTIMATE_TOPIC_END_UNIX_MILLISECOND, - RowNoteRecord.created_at_millis >= TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND, - ) - ) + .join(RowNoteStatusRecord, RowNoteRecord.note_id == RowNoteStatusRecord.note_id) .limit(limit) .offset(offset) ) @@ -170,7 +166,7 @@ def transform_data(db: Session): return -def generate_note_topic(): +def generate_note_topic(db: Session): note_csv_file_path = "./data/transformed/note.csv" output_csv_file_path = "./data/transformed/note_topic_association.csv" ai_service = get_ai_service() @@ -181,17 +177,33 @@ def generate_note_topic(): writer = csv.DictWriter(file, fieldnames=fieldnames) writer.writeheader() - with open(note_csv_file_path, newline="", encoding="utf-8") as csvfile: - reader = csv.DictReader(csvfile) - for index, row in enumerate(reader): - note_id = row["note_id"] - summary = row["summary"] + offset = 0 + limit = 1000 + + num_of_users = db.query(func.count(RowUserRecord.user_id)).scalar() + + while offset < num_of_users: + topicEstimationTargetNotes = db.execute( + select(RowNoteRecord.note_id, RowNoteRecord.row_post_id, RowNoteRecord.summary) + .filter( + and_( + RowNoteRecord.created_at_millis <= TARGET_NOTE_ESTIMATE_TOPIC_END_UNIX_MILLISECOND, + RowNoteRecord.created_at_millis >= TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND, + ) + ) + .join(RowNoteStatusRecord, RowNoteRecord.note_id == RowNoteStatusRecord.note_id) + .limit(limit) + .offset(offset) + ) + + for index, note in enumerate(topicEstimationTargetNotes): + note_id = note.note_id + summary = note.summary topics_info = ai_service.detect_topic(note_id, summary) if topics_info: for topic in topics_info.get("topics", []): record = {"note_id": note_id, "topic_id": topic} records.append(record) - if index % 100 == 0: for record in records: writer.writerow( From d542a763a335dbb337753ee727597de0d73cf42e Mon Sep 17 00:00:00 2001 From: yu23ki14 Date: Wed, 2 Oct 2024 10:38:23 +0900 Subject: [PATCH 3/4] add link csv --- etl/.env.example | 14 +- etl/src/birdxplorer_etl/extract.py | 35 +- etl/src/birdxplorer_etl/settings.py | 2 + etl/src/birdxplorer_etl/test.json | 456 --------------------------- etl/src/birdxplorer_etl/transform.py | 65 +++- 5 files changed, 103 insertions(+), 469 deletions(-) delete mode 100644 etl/src/birdxplorer_etl/test.json diff --git a/etl/.env.example b/etl/.env.example index 112d707..c8a0054 100644 --- a/etl/.env.example +++ b/etl/.env.example @@ -1,6 +1,14 @@ X_BEARER_TOKEN= -AI_MODEL= + +COMMUNITY_NOTE_DAYS_AGO=3 + +TARGET_TWITTER_POST_START_UNIX_MILLISECOND=1719851000000 +TARGET_TWITTER_POST_END_UNIX_MILLISECOND=1719891000000 + +AI_MODEL=openai OPENAPI_TOKEN= CLAUDE_TOKEN= -TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND=1720900800000 -TARGET_NOTE_ESTIMATE_TOPIC_END_UNIX_MILLISECOND=1722110400000 \ No newline at end of file +TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND=1719851000000 +TARGET_NOTE_ESTIMATE_TOPIC_END_UNIX_MILLISECOND=1719891000000 + +USE_DUMMY_DATA=False \ No newline at end of file diff --git a/etl/src/birdxplorer_etl/extract.py b/etl/src/birdxplorer_etl/extract.py index d47c9be..b29c5de 100644 --- a/etl/src/birdxplorer_etl/extract.py +++ b/etl/src/birdxplorer_etl/extract.py @@ -5,7 +5,13 @@ from prefect import get_run_logger from sqlalchemy.orm import Session from lib.x.postlookup import lookup -from birdxplorer_common.storage import RowNoteRecord, RowPostRecord, RowUserRecord, RowNoteStatusRecord +from birdxplorer_common.storage import ( + RowNoteRecord, + RowPostRecord, + RowUserRecord, + RowNoteStatusRecord, + RowPostEmbedURLRecord, +) import settings @@ -30,10 +36,12 @@ def extract_data(db: Session): break dateString = date.strftime("%Y/%m/%d") - # note_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/notes/notes-00000.tsv" - note_url = ( - "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/notes_sample.tsv" - ) + note_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/notes/notes-00000.tsv" + if settings.USE_DUMMY_DATA: + note_url = ( + "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/notes_sample.tsv" + ) + logger.info(note_url) res = requests.get(note_url) @@ -53,8 +61,10 @@ def extract_data(db: Session): rows_to_add = [] db.bulk_save_objects(rows_to_add) - # status_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/noteStatusHistory/noteStatusHistory-00000.tsv" - status_url = "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/noteStatus_sample.tsv" + status_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/noteStatusHistory/noteStatusHistory-00000.tsv" + if settings.USE_DUMMY_DATA: + status_url = "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/noteStatus_sample.tsv" + logger.info(status_url) res = requests.get(status_url) @@ -155,6 +165,17 @@ def extract_data(db: Session): lang=post["data"]["lang"], ) db.add(db_post) + + if "entities" in post["data"] and "urls" in post["data"]["entities"]: + for url in post["data"]["entities"]["urls"]: + if "unwound_url" in url: + post_url = RowPostEmbedURLRecord( + post_id=post["data"]["id"], + url=url["url"] if url["url"] else None, + expanded_url=url["expanded_url"] if url["expanded_url"] else None, + unwound_url=url["unwound_url"] if url["unwound_url"] else None, + ) + db.add(post_url) note.row_post_id = tweet_id db.commit() continue diff --git a/etl/src/birdxplorer_etl/settings.py b/etl/src/birdxplorer_etl/settings.py index d6e9f2e..c775af3 100644 --- a/etl/src/birdxplorer_etl/settings.py +++ b/etl/src/birdxplorer_etl/settings.py @@ -17,3 +17,5 @@ CLAUDE_TOKEN = os.getenv("CLAUDE_TOKEN") TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND = os.getenv("TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND") TARGET_NOTE_ESTIMATE_TOPIC_END_UNIX_MILLISECOND = os.getenv("TARGET_NOTE_ESTIMATE_TOPIC_END_UNIX_MILLISECOND") + +USE_DUMMY_DATA = os.getenv("USE_DUMMY_DATA", "False") == "True" diff --git a/etl/src/birdxplorer_etl/test.json b/etl/src/birdxplorer_etl/test.json deleted file mode 100644 index 395615b..0000000 --- a/etl/src/birdxplorer_etl/test.json +++ /dev/null @@ -1,456 +0,0 @@ -{ - "data": { - "possibly_sensitive": false, - "author_id": "871657411525963776", - "edit_history_tweet_ids": ["1811769819198267436"], - "lang": "en", - "created_at": "2024-07-12T14:29:20.000Z", - "entities": { - "annotations": [ - { - "start": 150, - "end": 164, - "probability": 0.6361, - "type": "Other", - "normalized_text": "Hyundai Ioniq 5" - }, - { - "start": 223, - "end": 229, - "probability": 0.7171, - "type": "Other", - "normalized_text": "VW Polo" - } - ], - "hashtags": [{ "start": 232, "end": 241, "tag": "mobesity" }], - "urls": [ - { - "start": 245, - "end": 268, - "url": "https://t.co/KiifOc4ZGS", - "expanded_url": "https://roaddamagecalculator.com/", - "display_url": "roaddamagecalculator.com", - "status": 200, - "title": "Road Damage", - "description": "Ever wondered how much more damage one vehicle does to the road than", - "unwound_url": "https://roaddamagecalculator.com/" - }, - { - "start": 269, - "end": 292, - "url": "https://t.co/uV0j73ujer", - "expanded_url": "https://twitter.com/_chris_brand_/status/1811769819198267436/photo/1", - "display_url": "pic.x.com/uv0j73ujer", - "media_key": "3_1811768379000107008" - } - ] - }, - "public_metrics": { - "retweet_count": 426, - "reply_count": 200, - "like_count": 1499, - "quote_count": 65, - "bookmark_count": 240, - "impression_count": 163126 - }, - "attachments": { "media_keys": ["3_1811768379000107008"] }, - "text": "The change in damage to the road surface is proportional to the difference in axle weight to the fourth power\n\nSo, a 2.4-tonne electric SUV such as a Hyundai Ioniq 5 would therefore do 16 times more damage than a 1.2-tonne VW Polo\n\n#mobesity\n\n👉 https://t.co/KiifOc4ZGS https://t.co/uV0j73ujer", - "id": "1811769819198267436", - "edit_controls": { - "edits_remaining": 5, - "is_edit_eligible": false, - "editable_until": "2024-07-12T15:29:20.000Z" - }, - "context_annotations": [ - { - "domain": { - "id": "30", - "name": "Entities [Entity Service]", - "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain" - }, - "entity": { - "id": "781972125179518977", - "name": "Auto Manufacturer - Auto" - } - }, - { - "domain": { - "id": "46", - "name": "Business Taxonomy", - "description": "Categories within Brand Verticals that narrow down the scope of Brands" - }, - "entity": { - "id": "1557696420500541440", - "name": "Automotive, Aircraft & Boat Business", - "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to automobile, spacecraft, aircraft and boat manufacturing" - } - }, - { - "domain": { - "id": "47", - "name": "Brand", - "description": "Brands and Companies" - }, - "entity": { "id": "10026295039", "name": "Hyundai" } - }, - { - "domain": { - "id": "47", - "name": "Brand", - "description": "Brands and Companies" - }, - "entity": { "id": "10026353537", "name": "Volkswagen" } - }, - { - "domain": { - "id": "131", - "name": "Unified Twitter Taxonomy", - "description": "A taxonomy of user interests. " - }, - "entity": { "id": "10026295039", "name": "Hyundai" } - }, - { - "domain": { - "id": "131", - "name": "Unified Twitter Taxonomy", - "description": "A taxonomy of user interests. " - }, - "entity": { "id": "10026353537", "name": "Volkswagen" } - }, - { - "domain": { - "id": "131", - "name": "Unified Twitter Taxonomy", - "description": "A taxonomy of user interests. " - }, - "entity": { "id": "1196845866138533888", "name": "Automobile Brands" } - }, - { - "domain": { - "id": "30", - "name": "Entities [Entity Service]", - "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain" - }, - "entity": { - "id": "781972125179518977", - "name": "Auto Manufacturer - Auto" - } - }, - { - "domain": { - "id": "47", - "name": "Brand", - "description": "Brands and Companies" - }, - "entity": { "id": "10026295039", "name": "Hyundai" } - }, - { - "domain": { - "id": "48", - "name": "Product", - "description": "Products created by Brands. Examples: Ford Explorer, Apple iPhone." - }, - "entity": { "id": "10044387828", "name": "Hyundai - Ioniq" } - }, - { - "domain": { - "id": "65", - "name": "Interests and Hobbies Vertical", - "description": "Top level interests and hobbies groupings, like Food or Travel" - }, - "entity": { - "id": "847528391163092993", - "name": "Automotive", - "description": "Car culture" - } - }, - { - "domain": { - "id": "66", - "name": "Interests and Hobbies Category", - "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations" - }, - "entity": { - "id": "847528576551337984", - "name": "Hybrid and electric vehicles", - "description": "Hybrid and electric vehicles" - } - }, - { - "domain": { - "id": "66", - "name": "Interests and Hobbies Category", - "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations" - }, - "entity": { - "id": "847529204530921472", - "name": "SUVs", - "description": "SUVs" - } - } - ], - "conversation_id": "1811769819198267436", - "reply_settings": "everyone" - }, - "includes": { - "media": [ - { - "height": 495, - "width": 607, - "url": "https://pbs.twimg.com/media/GSSxvdWWMAAuVk_.png", - "media_key": "3_1811768379000107008", - "type": "photo" - } - ], - "users": [ - { - "id": "871657411525963776", - "verified_type": "none", - "url": "https://t.co/sBthi7IAvB", - "description": "Scientist. Prof at Oxford University https://t.co/0JetFU9aYd @TSUOxford and @ecioxford. Sustainable and healthy transport. Active travel. Transport and climate policy.", - "most_recent_tweet_id": "1829881319222043134", - "name": "Prof. Christian Brand", - "public_metrics": { - "followers_count": 2379, - "following_count": 917, - "tweet_count": 4733, - "listed_count": 37, - "like_count": 7760 - }, - "entities": { - "url": { - "urls": [ - { - "start": 0, - "end": 23, - "url": "https://t.co/sBthi7IAvB", - "expanded_url": "http://www.tsu.ox.ac.uk/people/cbrand.html", - "display_url": "tsu.ox.ac.uk/people/cbrand.…" - } - ] - }, - "description": { - "urls": [ - { - "start": 37, - "end": 60, - "url": "https://t.co/0JetFU9aYd", - "expanded_url": "http://ox.ac.uk", - "display_url": "ox.ac.uk" - } - ], - "mentions": [ - { "start": 61, "end": 71, "username": "TSUOxford" }, - { "start": 76, "end": 86, "username": "ecioxford" } - ] - } - }, - "profile_image_url": "https://pbs.twimg.com/profile_images/1707022613413818368/YLvCT_0r_normal.jpg", - "protected": false, - "username": "_chris_brand_", - "pinned_tweet_id": "1585556640933232640", - "verified": false, - "location": "Oxford, UK", - "created_at": "2017-06-05T09:18:16.000Z" - } - ], - "tweets": [ - { - "possibly_sensitive": false, - "author_id": "871657411525963776", - "edit_history_tweet_ids": ["1811769819198267436"], - "lang": "en", - "created_at": "2024-07-12T14:29:20.000Z", - "entities": { - "annotations": [ - { - "start": 150, - "end": 164, - "probability": 0.6361, - "type": "Other", - "normalized_text": "Hyundai Ioniq 5" - }, - { - "start": 223, - "end": 229, - "probability": 0.7171, - "type": "Other", - "normalized_text": "VW Polo" - } - ], - "hashtags": [{ "start": 232, "end": 241, "tag": "mobesity" }], - "urls": [ - { - "start": 245, - "end": 268, - "url": "https://t.co/KiifOc4ZGS", - "expanded_url": "https://roaddamagecalculator.com/", - "display_url": "roaddamagecalculator.com", - "status": 200, - "title": "Road Damage", - "description": "Ever wondered how much more damage one vehicle does to the road than", - "unwound_url": "https://roaddamagecalculator.com/" - }, - { - "start": 269, - "end": 292, - "url": "https://t.co/uV0j73ujer", - "expanded_url": "https://twitter.com/_chris_brand_/status/1811769819198267436/photo/1", - "display_url": "pic.x.com/uv0j73ujer", - "media_key": "3_1811768379000107008" - } - ] - }, - "public_metrics": { - "retweet_count": 426, - "reply_count": 200, - "like_count": 1499, - "quote_count": 65, - "bookmark_count": 240, - "impression_count": 163126 - }, - "attachments": { "media_keys": ["3_1811768379000107008"] }, - "text": "The change in damage to the road surface is proportional to the difference in axle weight to the fourth power\n\nSo, a 2.4-tonne electric SUV such as a Hyundai Ioniq 5 would therefore do 16 times more damage than a 1.2-tonne VW Polo\n\n#mobesity\n\n👉 https://t.co/KiifOc4ZGS https://t.co/uV0j73ujer", - "id": "1811769819198267436", - "edit_controls": { - "edits_remaining": 5, - "is_edit_eligible": false, - "editable_until": "2024-07-12T15:29:20.000Z" - }, - "context_annotations": [ - { - "domain": { - "id": "30", - "name": "Entities [Entity Service]", - "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain" - }, - "entity": { - "id": "781972125179518977", - "name": "Auto Manufacturer - Auto" - } - }, - { - "domain": { - "id": "46", - "name": "Business Taxonomy", - "description": "Categories within Brand Verticals that narrow down the scope of Brands" - }, - "entity": { - "id": "1557696420500541440", - "name": "Automotive, Aircraft & Boat Business", - "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to automobile, spacecraft, aircraft and boat manufacturing" - } - }, - { - "domain": { - "id": "47", - "name": "Brand", - "description": "Brands and Companies" - }, - "entity": { "id": "10026295039", "name": "Hyundai" } - }, - { - "domain": { - "id": "47", - "name": "Brand", - "description": "Brands and Companies" - }, - "entity": { "id": "10026353537", "name": "Volkswagen" } - }, - { - "domain": { - "id": "131", - "name": "Unified Twitter Taxonomy", - "description": "A taxonomy of user interests. " - }, - "entity": { "id": "10026295039", "name": "Hyundai" } - }, - { - "domain": { - "id": "131", - "name": "Unified Twitter Taxonomy", - "description": "A taxonomy of user interests. " - }, - "entity": { "id": "10026353537", "name": "Volkswagen" } - }, - { - "domain": { - "id": "131", - "name": "Unified Twitter Taxonomy", - "description": "A taxonomy of user interests. " - }, - "entity": { - "id": "1196845866138533888", - "name": "Automobile Brands" - } - }, - { - "domain": { - "id": "30", - "name": "Entities [Entity Service]", - "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain" - }, - "entity": { - "id": "781972125179518977", - "name": "Auto Manufacturer - Auto" - } - }, - { - "domain": { - "id": "47", - "name": "Brand", - "description": "Brands and Companies" - }, - "entity": { "id": "10026295039", "name": "Hyundai" } - }, - { - "domain": { - "id": "48", - "name": "Product", - "description": "Products created by Brands. Examples: Ford Explorer, Apple iPhone." - }, - "entity": { "id": "10044387828", "name": "Hyundai - Ioniq" } - }, - { - "domain": { - "id": "65", - "name": "Interests and Hobbies Vertical", - "description": "Top level interests and hobbies groupings, like Food or Travel" - }, - "entity": { - "id": "847528391163092993", - "name": "Automotive", - "description": "Car culture" - } - }, - { - "domain": { - "id": "66", - "name": "Interests and Hobbies Category", - "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations" - }, - "entity": { - "id": "847528576551337984", - "name": "Hybrid and electric vehicles", - "description": "Hybrid and electric vehicles" - } - }, - { - "domain": { - "id": "66", - "name": "Interests and Hobbies Category", - "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations" - }, - "entity": { - "id": "847529204530921472", - "name": "SUVs", - "description": "SUVs" - } - } - ], - "conversation_id": "1811769819198267436", - "reply_settings": "everyone" - } - ] - } -} diff --git a/etl/src/birdxplorer_etl/transform.py b/etl/src/birdxplorer_etl/transform.py index 6de446b..bd977f9 100644 --- a/etl/src/birdxplorer_etl/transform.py +++ b/etl/src/birdxplorer_etl/transform.py @@ -1,6 +1,12 @@ from sqlalchemy import select, func, and_, Integer from sqlalchemy.orm import Session -from birdxplorer_common.storage import RowNoteRecord, RowPostRecord, RowUserRecord, RowNoteStatusRecord +from birdxplorer_common.storage import ( + RowNoteRecord, + RowPostRecord, + RowUserRecord, + RowNoteStatusRecord, + RowPostEmbedURLRecord, +) from birdxplorer_etl.lib.ai_model.ai_model_interface import get_ai_service from birdxplorer_etl.settings import ( TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND, @@ -9,6 +15,8 @@ import csv import os from prefect import get_run_logger +import uuid +import random def transform_data(db: Session): @@ -138,6 +146,10 @@ def transform_data(db: Session): writer.writerow(user) offset += limit + # Transform row post embed link + generate_post_link(db) + + # Transform row post embed url data and generate post_embed_url.csv csv_seed_file_path = "./seed/topic_seed.csv" output_csv_file_path = "./data/transformed/topic.csv" records = [] @@ -161,16 +173,62 @@ def transform_data(db: Session): for record in records: writer.writerow({"topic_id": record["topic_id"], "label": {k: v for k, v in record["label"].items()}}) - generate_note_topic() + generate_note_topic(db) return +def generate_post_link(db: Session): + link_csv_file_path = "./data/transformed/post_link.csv" + association_csv_file_path = "./data/transformed/post_link_association.csv" + + if os.path.exists(link_csv_file_path): + os.remove(link_csv_file_path) + with open(link_csv_file_path, "a", newline="", encoding="utf-8") as file: + fieldnames = ["link_id", "url"] + writer = csv.DictWriter(file, fieldnames=fieldnames) + writer.writeheader() + + if os.path.exists(association_csv_file_path): + os.remove(association_csv_file_path) + with open(association_csv_file_path, "a", newline="", encoding="utf-8") as file: + fieldnames = ["post_id", "link_id"] + writer = csv.DictWriter(file, fieldnames=fieldnames) + writer.writeheader() + + offset = 0 + limit = 1000 + num_of_links = db.query(func.count(RowPostEmbedURLRecord.post_id)).scalar() + + records = [] + while offset < num_of_links: + links = db.query(RowPostEmbedURLRecord).limit(limit).offset(offset) + + for link in links: + random.seed(link.unwound_url) + link_id = uuid.UUID(int=random.getrandbits(128)) + is_link_exist = next((record for record in records if record["link_id"] == link_id), None) + if is_link_exist is None: + with open(link_csv_file_path, "a", newline="", encoding="utf-8") as file: + fieldnames = ["link_id", "unwound_url"] + writer = csv.DictWriter(file, fieldnames=fieldnames) + writer.writerow({"link_id": link_id, "unwound_url": link.unwound_url}) + record = {"post_id": link.post_id, "link_id": link_id, "unwound_url": link.unwound_url} + records.append(record) + with open(association_csv_file_path, "a", newline="", encoding="utf-8") as file: + fieldnames = ["post_id", "link_id"] + writer = csv.DictWriter(file, fieldnames=fieldnames) + writer.writerow({"post_id": link.post_id, "link_id": link_id}) + offset += limit + + def generate_note_topic(db: Session): - note_csv_file_path = "./data/transformed/note.csv" output_csv_file_path = "./data/transformed/note_topic_association.csv" ai_service = get_ai_service() + if os.path.exists(output_csv_file_path): + os.remove(output_csv_file_path) + records = [] with open(output_csv_file_path, "w", newline="", encoding="utf-8", buffering=1) as file: fieldnames = ["note_id", "topic_id"] @@ -214,6 +272,7 @@ def generate_note_topic(db: Session): ) records = [] print(index) + offset += limit for record in records: writer.writerow( From a9f158059b5e9b61db82f8f8a47369a22596a714 Mon Sep 17 00:00:00 2001 From: yu23ki14 Date: Wed, 2 Oct 2024 10:50:40 +0900 Subject: [PATCH 4/4] try to fix error --- common/birdxplorer_common/models.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/common/birdxplorer_common/models.py b/common/birdxplorer_common/models.py index d066f70..bc5532d 100644 --- a/common/birdxplorer_common/models.py +++ b/common/birdxplorer_common/models.py @@ -1,14 +1,25 @@ from abc import ABC, abstractmethod from datetime import datetime, timezone from enum import Enum -from typing import Any, Dict, List, Literal, Optional, Type, TypeAlias, TypeVar, Union +from typing import ( + Any, + Dict, + List, + Literal, + Optional, + Set, + Type, + TypeAlias, + TypeVar, + Union, +) from pydantic import BaseModel as PydanticBaseModel from pydantic import ConfigDict, GetCoreSchemaHandler, HttpUrl, TypeAdapter from pydantic.alias_generators import to_camel from pydantic_core import core_schema -IncEx: TypeAlias = "set[int] | set[str] | dict[int, IncEx] | dict[str, IncEx] | None" +IncEx: TypeAlias = Union[Set[int], Set[str], Dict[int, Any], Dict[str, Any], None] StrT = TypeVar("StrT", bound="BaseString") IntT = TypeVar("IntT", bound="BaseInt") FloatT = TypeVar("FloatT", bound="BaseFloat")