From b493e3f507b0018894061c120a55a78e5a3a0d5d Mon Sep 17 00:00:00 2001
From: yu23ki14 <yuki_021423@yahoo.co.jp>
Date: Tue, 1 Oct 2024 11:25:00 +0900
Subject: [PATCH 1/4] Add note current_status filter and status/embed-URL extraction

---
 api/birdxplorer_api/routers/data.py        |   2 +
 api/tests/conftest.py                      |   8 +
 common/birdxplorer_common/models.py        |   1 +
 common/birdxplorer_common/storage.py       |  42 ++
 common/tests/conftest.py                   |   6 +
 common/tests/test_storage.py               |  12 +
 etl/src/birdxplorer_etl/extract.py         |  55 ++-
 etl/src/birdxplorer_etl/lib/sqlite/init.py |  14 +-
 etl/src/birdxplorer_etl/test.json          | 456 +++++++++++++++++++++
 9 files changed, 589 insertions(+), 7 deletions(-)
 create mode 100644 etl/src/birdxplorer_etl/test.json

diff --git a/api/birdxplorer_api/routers/data.py b/api/birdxplorer_api/routers/data.py
index 75ed648..36be229 100644
--- a/api/birdxplorer_api/routers/data.py
+++ b/api/birdxplorer_api/routers/data.py
@@ -76,6 +76,7 @@ def get_notes(
         created_at_to: Union[None, TwitterTimestamp] = Query(default=None),
         topic_ids: Union[List[TopicId], None] = Query(default=None),
         post_ids: Union[List[PostId], None] = Query(default=None),
+        current_status: Union[None, List[str]] = Query(default=None),
         language: Union[LanguageIdentifier, None] = Query(default=None),
     ) -> NoteListResponse:
         return NoteListResponse(
@@ -86,6 +87,7 @@ def get_notes(
                     created_at_to=created_at_to,
                     topic_ids=topic_ids,
                     post_ids=post_ids,
+                    current_status=current_status,
                     language=language,
                 )
             )
diff --git a/api/tests/conftest.py b/api/tests/conftest.py
index 97a4deb..3acfbc3 100644
--- a/api/tests/conftest.py
+++ b/api/tests/conftest.py
@@ -93,6 +93,7 @@ def note_samples(note_factory: NoteFactory, topic_samples: List[Topic]) -> Gener
             topics=[topic_samples[0]],
             language="ja",
             summary="要約文1",
+            current_status="NEEDS_MORE_RATINGS",
             created_at=1152921600000,
         ),
         note_factory.build(
@@ -101,6 +102,7 @@ def note_samples(note_factory: NoteFactory, topic_samples: List[Topic]) -> Gener
             topics=[],
             language="en",
             summary="summary2",
+            current_status="NEEDS_MORE_RATINGS",
             created_at=1152921601000,
         ),
         note_factory.build(
@@ -109,6 +111,7 @@ def note_samples(note_factory: NoteFactory, topic_samples: List[Topic]) -> Gener
             topics=[topic_samples[1]],
             language="en",
             summary="summary3",
+            current_status="",
             created_at=1152921602000,
         ),
         note_factory.build(
@@ -117,6 +120,7 @@ def note_samples(note_factory: NoteFactory, topic_samples: List[Topic]) -> Gener
             topics=[topic_samples[0], topic_samples[1], topic_samples[2]],
             language="en",
             summary="summary4",
+            current_status="CURRENTLY_RATED_HELPFUL",
             created_at=1152921603000,
         ),
         note_factory.build(
@@ -125,6 +129,7 @@ def note_samples(note_factory: NoteFactory, topic_samples: List[Topic]) -> Gener
             topics=[topic_samples[0]],
             language="en",
             summary="summary5",
+            current_status="CURRENTLY_RATED_HELPFUL",
             created_at=1152921604000,
         ),
     ]
@@ -234,6 +239,7 @@ def _get_notes(
         created_at_to: Union[None, TwitterTimestamp] = None,
         topic_ids: Union[List[TopicId], None] = None,
         post_ids: Union[List[PostId], None] = None,
+        current_status: Union[None, List[str]] = None,
         language: Union[LanguageIdentifier, None] = None,
     ) -> Generator[Note, None, None]:
         for note in note_samples:
@@ -247,6 +253,8 @@ def _get_notes(
                 continue
             if post_ids is not None and note.post_id not in post_ids:
                 continue
+            if current_status is not None and note.current_status not in current_status:
+                continue
             if language is not None and note.language != language:
                 continue
             yield note
diff --git a/common/birdxplorer_common/models.py b/common/birdxplorer_common/models.py
index 72052dd..d066f70 100644
--- a/common/birdxplorer_common/models.py
+++ b/common/birdxplorer_common/models.py
@@ -657,6 +657,7 @@ class Note(BaseModel):
     language: LanguageIdentifier
     topics: List[Topic]
     summary: SummaryString
+    current_status: str | None
     created_at: TwitterTimestamp
 
 
diff --git a/common/birdxplorer_common/storage.py b/common/birdxplorer_common/storage.py
index e318459..d9612e1 100644
--- a/common/birdxplorer_common/storage.py
+++ b/common/birdxplorer_common/storage.py
@@ -68,6 +68,7 @@ class NoteRecord(Base):
     topics: Mapped[List[NoteTopicAssociation]] = relationship()
     language: Mapped[LanguageIdentifier] = mapped_column(nullable=False)
     summary: Mapped[SummaryString] = mapped_column(nullable=False)
+    current_status: Mapped[String] = mapped_column(nullable=True)
     created_at: Mapped[TwitterTimestamp] = mapped_column(nullable=False)
 
 
@@ -132,6 +133,34 @@ class RowNoteRecord(Base):
     row_post: Mapped["RowPostRecord"] = relationship("RowPostRecord", back_populates="row_notes")
 
 
+class RowNoteStatusRecord(Base):
+    __tablename__ = "row_note_status"
+
+    note_id: Mapped[NoteId] = mapped_column(ForeignKey("row_notes.note_id"), primary_key=True)
+    note_author_participant_id: Mapped[ParticipantId] = mapped_column(nullable=False)
+    created_at_millis: Mapped[TwitterTimestamp] = mapped_column(nullable=False)
+    timestamp_millis_of_first_non_n_m_r_status: Mapped[TwitterTimestamp] = mapped_column()
+    first_non_n_m_r_status: Mapped[String] = mapped_column()
+    timestamp_millis_of_current_status: Mapped[TwitterTimestamp] = mapped_column()
+    current_status: Mapped[String] = mapped_column()
+    timestamp_millis_of_latest_non_n_m_r_status: Mapped[TwitterTimestamp] = mapped_column()
+    most_recent_non_n_m_r_status: Mapped[String] = mapped_column()
+    timestamp_millis_of_status_lock: Mapped[TwitterTimestamp] = mapped_column()
+    locked_status: Mapped[String] = mapped_column()
+    timestamp_millis_of_retro_lock: Mapped[TwitterTimestamp] = mapped_column()
+    current_core_status: Mapped[String] = mapped_column()
+    current_expansion_status: Mapped[String] = mapped_column()
+    current_group_status: Mapped[String] = mapped_column()
+    current_decided_by: Mapped[String] = mapped_column()
+    current_modeling_group: Mapped[int] = mapped_column()
+    timestamp_millis_of_most_recent_status_change: Mapped[TwitterTimestamp] = mapped_column()
+    timestamp_millis_of_nmr_due_to_min_stable_crh_time: Mapped[TwitterTimestamp] = mapped_column()
+    current_multi_group_status: Mapped[String] = mapped_column()
+    current_modeling_multi_group: Mapped[int] = mapped_column()
+    timestamp_minute_of_final_scoring_output: Mapped[TwitterTimestamp] = mapped_column()
+    timestamp_millis_of_first_nmr_due_to_min_stable_crh_time: Mapped[TwitterTimestamp] = mapped_column()
+
+
 class RowPostRecord(Base):
     __tablename__ = "row_posts"
 
@@ -152,6 +181,15 @@ class RowPostRecord(Base):
     user: Mapped["RowUserRecord"] = relationship("RowUserRecord", back_populates="row_post")
 
 
+class RowPostEmbedURLRecord(Base):
+    __tablename__ = "row_post_embed_urls"
+
+    post_id: Mapped[PostId] = mapped_column(ForeignKey("row_posts.post_id"), primary_key=True)
+    url: Mapped[String] = mapped_column(primary_key=True)
+    expanded_url: Mapped[String] = mapped_column(nullable=False)
+    unwound_url: Mapped[String] = mapped_column(nullable=False)
+
+
 class RowUserRecord(Base):
     __tablename__ = "row_users"
 
@@ -224,6 +262,7 @@ def get_notes(
         created_at_to: Union[None, TwitterTimestamp] = None,
         topic_ids: Union[List[TopicId], None] = None,
         post_ids: Union[List[PostId], None] = None,
+        current_status: Union[None, List[str]] = None,
         language: Union[LanguageIdentifier, None] = None,
     ) -> Generator[NoteModel, None, None]:
         with Session(self.engine) as sess:
@@ -248,6 +287,8 @@ def get_notes(
                 query = query.filter(NoteRecord.post_id.in_(post_ids))
             if language is not None:
                 query = query.filter(NoteRecord.language == language)
+            if current_status is not None:
+                query = query.filter(NoteRecord.current_status.in_(current_status))
             for note_record in query.all():
                 yield NoteModel(
                     note_id=note_record.note_id,
@@ -265,6 +306,7 @@ def get_notes(
                     ],
                     language=LanguageIdentifier.normalize(note_record.language),
                     summary=note_record.summary,
+                    current_status=note_record.current_status,
                     created_at=note_record.created_at,
                 )
 
diff --git a/common/tests/conftest.py b/common/tests/conftest.py
index a8c048b..a58f37f 100644
--- a/common/tests/conftest.py
+++ b/common/tests/conftest.py
@@ -126,6 +126,7 @@ def note_samples(note_factory: NoteFactory, topic_samples: List[Topic]) -> Gener
             topics=[topic_samples[0]],
             language="ja",
             summary="要約文1",
+            current_status=None,
             created_at=1152921600000,
         ),
         note_factory.build(
@@ -134,6 +135,7 @@ def note_samples(note_factory: NoteFactory, topic_samples: List[Topic]) -> Gener
             topics=[],
             language="en",
             summary="summary2",
+            current_status=None,
             created_at=1152921601000,
         ),
         note_factory.build(
@@ -142,6 +144,7 @@ def note_samples(note_factory: NoteFactory, topic_samples: List[Topic]) -> Gener
             topics=[topic_samples[1]],
             language="en",
             summary="summary3",
+            current_status=None,
             created_at=1152921602000,
         ),
         note_factory.build(
@@ -150,6 +153,7 @@ def note_samples(note_factory: NoteFactory, topic_samples: List[Topic]) -> Gener
             topics=[topic_samples[0], topic_samples[1], topic_samples[2]],
             language="en",
             summary="summary4",
+            current_status=None,
             created_at=1152921603000,
         ),
         note_factory.build(
@@ -158,6 +162,7 @@ def note_samples(note_factory: NoteFactory, topic_samples: List[Topic]) -> Gener
             topics=[topic_samples[0]],
             language="en",
             summary="summary5",
+            current_status=None,
             created_at=1152921604000,
         ),
         note_factory.build(
@@ -166,6 +171,7 @@ def note_samples(note_factory: NoteFactory, topic_samples: List[Topic]) -> Gener
             topics=[topic_samples[0]],
             language="en",
             summary="summary6_empty_post_id",
+            current_status=None,
             created_at=1152921604000,
         ),
     ]
diff --git a/common/tests/test_storage.py b/common/tests/test_storage.py
index cc3638d..3d81209 100644
--- a/common/tests/test_storage.py
+++ b/common/tests/test_storage.py
@@ -205,6 +205,18 @@ def test_get_notes_by_post_ids_empty(
     assert expected == actual
 
 
+def test_get_notes_by_note_status(
+    engine_for_test: Engine,
+    note_samples: List[Note],
+    note_records_sample: List[NoteRecord],
+) -> None:
+    storage = Storage(engine=engine_for_test)
+    current_status = ["NEEDS_MORE_RATINGS"]
+    expected = [note for note in note_samples if note.current_status in current_status]
+    actual = list(storage.get_notes(current_status=current_status))
+    assert expected == actual
+
+
 def test_get_notes_by_language(
     engine_for_test: Engine,
     note_samples: List[Note],
diff --git a/etl/src/birdxplorer_etl/extract.py b/etl/src/birdxplorer_etl/extract.py
index 81abf65..cd4bd8b 100644
--- a/etl/src/birdxplorer_etl/extract.py
+++ b/etl/src/birdxplorer_etl/extract.py
@@ -5,7 +5,13 @@
 from prefect import get_run_logger
 from sqlalchemy.orm import Session
 from lib.x.postlookup import lookup
-from birdxplorer_common.storage import RowNoteRecord, RowPostRecord, RowUserRecord
+from birdxplorer_common.storage import (
+    RowNoteRecord,
+    RowPostRecord,
+    RowUserRecord,
+    RowPostEmbedURLRecord,
+    RowNoteStatusRecord,
+)
 import settings
 
 
@@ -28,9 +34,11 @@ def extract_data(db: Session):
             > datetime.timestamp(date) - 24 * 60 * 60 * settings.COMMUNITY_NOTE_DAYS_AGO
         ):
             break
-        url = f'https://ton.twimg.com/birdwatch-public-data/{date.strftime("%Y/%m/%d")}/notes/notes-00000.tsv'
-        logger.info(url)
-        res = requests.get(url)
+
+        dateString = date.strftime("%Y/%m/%d")
+        note_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/notes/notes-00000.tsv"
+        logger.info(note_url)
+        res = requests.get(note_url)
 
         if res.status_code == 200:
             # res.contentをdbのNoteテーブル
@@ -45,7 +53,26 @@ def extract_data(db: Session):
                 rows_to_add.append(RowNoteRecord(**row))
             db.bulk_save_objects(rows_to_add)
 
-            break
+            status_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/noteStatusHistory/noteStatusHistory-00000.tsv"
+            logger.info(status_url)
+            res = requests.get(status_url)
+
+            if res.status_code == 200:
+                # Load the response body (TSV) into the row_note_status table
+                tsv_data = res.content.decode("utf-8").splitlines()
+                reader = csv.DictReader(tsv_data, delimiter="\t")
+                reader.fieldnames = [stringcase.snakecase(field) for field in reader.fieldnames]
+
+                rows_to_add = []
+                for row in reader:
+                    status = db.query(RowNoteStatusRecord).filter(RowNoteStatusRecord.note_id == row["note_id"]).first()
+                    if status is None or status.created_at_millis > int(datetime.now().timestamp() * 1000):
+                        db.query(RowNoteStatusRecord).filter(RowNoteStatusRecord.note_id == row["note_id"]).delete()
+                        rows_to_add.append(RowNoteStatusRecord(**row))
+                db.bulk_save_objects(rows_to_add)
+
+                break
+
         date = date - timedelta(days=1)
 
     db.commit()
@@ -58,7 +85,9 @@ def extract_data(db: Session):
         .filter(RowNoteRecord.created_at_millis <= settings.TARGET_TWITTER_POST_END_UNIX_MILLISECOND)
         .all()
     )
-    logger.info(len(postExtract_targetNotes))
+
+    logger.info(f"Num of Target Notes: {len(postExtract_targetNotes)}")
+
     for note in postExtract_targetNotes:
         tweet_id = note.tweet_id
 
@@ -71,6 +100,8 @@ def extract_data(db: Session):
         logger.info(tweet_id)
         post = lookup(tweet_id)
 
+        logger.info(post)
+
         if post == None or "data" not in post:
             continue
 
@@ -101,6 +132,18 @@ def extract_data(db: Session):
             )
             db.add(db_user)
 
+        if "entities" in post["data"] and "urls" in post["data"]["entities"]:
+            for url in post["data"]["entities"]["urls"]:
+                if "unwound_url" not in url or url.get("status") != 200:  # "status" may be absent
+                    continue
+                db_post_embed_url = RowPostEmbedURLRecord(
+                    post_id=post["data"]["id"],
+                    url=url["url"],
+                    expanded_url=url["expanded_url"],
+                    unwound_url=url["unwound_url"],
+                )
+                db.add(db_post_embed_url)
+
         media_data = (
             post["includes"]["media"][0]
             if "includes" in post and "media" in post["includes"] and len(post["includes"]["media"]) > 0
diff --git a/etl/src/birdxplorer_etl/lib/sqlite/init.py b/etl/src/birdxplorer_etl/lib/sqlite/init.py
index c167352..fc4ab66 100644
--- a/etl/src/birdxplorer_etl/lib/sqlite/init.py
+++ b/etl/src/birdxplorer_etl/lib/sqlite/init.py
@@ -5,7 +5,13 @@
 from sqlalchemy import create_engine, inspect
 from sqlalchemy.orm import sessionmaker
 
-from birdxplorer_common.storage import RowNoteRecord, RowPostRecord, RowUserRecord
+from birdxplorer_common.storage import (
+    RowNoteRecord,
+    RowPostRecord,
+    RowUserRecord,
+    RowPostEmbedURLRecord,
+    RowNoteStatusRecord,
+)
 
 
 def init_db():
@@ -24,9 +30,15 @@ def init_db():
     if not inspect(engine).has_table("row_posts"):
         logger.info("Creating table post")
         RowPostRecord.metadata.create_all(engine)
+    if not inspect(engine).has_table("row_note_status"):
+        logger.info("Creating table note_status")
+        RowNoteStatusRecord.metadata.create_all(engine)
     if not inspect(engine).has_table("row_users"):
         logger.info("Creating table user")
         RowUserRecord.metadata.create_all(engine)
+    if not inspect(engine).has_table("row_post_embed_urls"):
+        logger.info("Creating table post_embed_urls")
+        RowPostEmbedURLRecord.metadata.create_all(engine)
 
     Session = sessionmaker(bind=engine)
 
diff --git a/etl/src/birdxplorer_etl/test.json b/etl/src/birdxplorer_etl/test.json
new file mode 100644
index 0000000..395615b
--- /dev/null
+++ b/etl/src/birdxplorer_etl/test.json
@@ -0,0 +1,456 @@
+{
+  "data": {
+    "possibly_sensitive": false,
+    "author_id": "871657411525963776",
+    "edit_history_tweet_ids": ["1811769819198267436"],
+    "lang": "en",
+    "created_at": "2024-07-12T14:29:20.000Z",
+    "entities": {
+      "annotations": [
+        {
+          "start": 150,
+          "end": 164,
+          "probability": 0.6361,
+          "type": "Other",
+          "normalized_text": "Hyundai Ioniq 5"
+        },
+        {
+          "start": 223,
+          "end": 229,
+          "probability": 0.7171,
+          "type": "Other",
+          "normalized_text": "VW Polo"
+        }
+      ],
+      "hashtags": [{ "start": 232, "end": 241, "tag": "mobesity" }],
+      "urls": [
+        {
+          "start": 245,
+          "end": 268,
+          "url": "https://t.co/KiifOc4ZGS",
+          "expanded_url": "https://roaddamagecalculator.com/",
+          "display_url": "roaddamagecalculator.com",
+          "status": 200,
+          "title": "Road Damage",
+          "description": "Ever wondered how much more damage one vehicle does to the road than",
+          "unwound_url": "https://roaddamagecalculator.com/"
+        },
+        {
+          "start": 269,
+          "end": 292,
+          "url": "https://t.co/uV0j73ujer",
+          "expanded_url": "https://twitter.com/_chris_brand_/status/1811769819198267436/photo/1",
+          "display_url": "pic.x.com/uv0j73ujer",
+          "media_key": "3_1811768379000107008"
+        }
+      ]
+    },
+    "public_metrics": {
+      "retweet_count": 426,
+      "reply_count": 200,
+      "like_count": 1499,
+      "quote_count": 65,
+      "bookmark_count": 240,
+      "impression_count": 163126
+    },
+    "attachments": { "media_keys": ["3_1811768379000107008"] },
+    "text": "The change in damage to the road surface is proportional to the difference in axle weight to the fourth power\n\nSo, a 2.4-tonne electric SUV such as a Hyundai Ioniq 5 would therefore do 16 times more damage than a 1.2-tonne VW Polo\n\n#mobesity\n\n👉 https://t.co/KiifOc4ZGS https://t.co/uV0j73ujer",
+    "id": "1811769819198267436",
+    "edit_controls": {
+      "edits_remaining": 5,
+      "is_edit_eligible": false,
+      "editable_until": "2024-07-12T15:29:20.000Z"
+    },
+    "context_annotations": [
+      {
+        "domain": {
+          "id": "30",
+          "name": "Entities [Entity Service]",
+          "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain"
+        },
+        "entity": {
+          "id": "781972125179518977",
+          "name": "Auto Manufacturer - Auto"
+        }
+      },
+      {
+        "domain": {
+          "id": "46",
+          "name": "Business Taxonomy",
+          "description": "Categories within Brand Verticals that narrow down the scope of Brands"
+        },
+        "entity": {
+          "id": "1557696420500541440",
+          "name": "Automotive, Aircraft & Boat Business",
+          "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to automobile, spacecraft, aircraft and boat manufacturing"
+        }
+      },
+      {
+        "domain": {
+          "id": "47",
+          "name": "Brand",
+          "description": "Brands and Companies"
+        },
+        "entity": { "id": "10026295039", "name": "Hyundai" }
+      },
+      {
+        "domain": {
+          "id": "47",
+          "name": "Brand",
+          "description": "Brands and Companies"
+        },
+        "entity": { "id": "10026353537", "name": "Volkswagen" }
+      },
+      {
+        "domain": {
+          "id": "131",
+          "name": "Unified Twitter Taxonomy",
+          "description": "A taxonomy of user interests. "
+        },
+        "entity": { "id": "10026295039", "name": "Hyundai" }
+      },
+      {
+        "domain": {
+          "id": "131",
+          "name": "Unified Twitter Taxonomy",
+          "description": "A taxonomy of user interests. "
+        },
+        "entity": { "id": "10026353537", "name": "Volkswagen" }
+      },
+      {
+        "domain": {
+          "id": "131",
+          "name": "Unified Twitter Taxonomy",
+          "description": "A taxonomy of user interests. "
+        },
+        "entity": { "id": "1196845866138533888", "name": "Automobile Brands" }
+      },
+      {
+        "domain": {
+          "id": "30",
+          "name": "Entities [Entity Service]",
+          "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain"
+        },
+        "entity": {
+          "id": "781972125179518977",
+          "name": "Auto Manufacturer - Auto"
+        }
+      },
+      {
+        "domain": {
+          "id": "47",
+          "name": "Brand",
+          "description": "Brands and Companies"
+        },
+        "entity": { "id": "10026295039", "name": "Hyundai" }
+      },
+      {
+        "domain": {
+          "id": "48",
+          "name": "Product",
+          "description": "Products created by Brands.  Examples: Ford Explorer, Apple iPhone."
+        },
+        "entity": { "id": "10044387828", "name": "Hyundai - Ioniq" }
+      },
+      {
+        "domain": {
+          "id": "65",
+          "name": "Interests and Hobbies Vertical",
+          "description": "Top level interests and hobbies groupings, like Food or Travel"
+        },
+        "entity": {
+          "id": "847528391163092993",
+          "name": "Automotive",
+          "description": "Car culture"
+        }
+      },
+      {
+        "domain": {
+          "id": "66",
+          "name": "Interests and Hobbies Category",
+          "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"
+        },
+        "entity": {
+          "id": "847528576551337984",
+          "name": "Hybrid and electric vehicles",
+          "description": "Hybrid and electric vehicles"
+        }
+      },
+      {
+        "domain": {
+          "id": "66",
+          "name": "Interests and Hobbies Category",
+          "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"
+        },
+        "entity": {
+          "id": "847529204530921472",
+          "name": "SUVs",
+          "description": "SUVs"
+        }
+      }
+    ],
+    "conversation_id": "1811769819198267436",
+    "reply_settings": "everyone"
+  },
+  "includes": {
+    "media": [
+      {
+        "height": 495,
+        "width": 607,
+        "url": "https://pbs.twimg.com/media/GSSxvdWWMAAuVk_.png",
+        "media_key": "3_1811768379000107008",
+        "type": "photo"
+      }
+    ],
+    "users": [
+      {
+        "id": "871657411525963776",
+        "verified_type": "none",
+        "url": "https://t.co/sBthi7IAvB",
+        "description": "Scientist. Prof at Oxford University https://t.co/0JetFU9aYd @TSUOxford and @ecioxford. Sustainable and healthy transport. Active travel. Transport and climate policy.",
+        "most_recent_tweet_id": "1829881319222043134",
+        "name": "Prof. Christian Brand",
+        "public_metrics": {
+          "followers_count": 2379,
+          "following_count": 917,
+          "tweet_count": 4733,
+          "listed_count": 37,
+          "like_count": 7760
+        },
+        "entities": {
+          "url": {
+            "urls": [
+              {
+                "start": 0,
+                "end": 23,
+                "url": "https://t.co/sBthi7IAvB",
+                "expanded_url": "http://www.tsu.ox.ac.uk/people/cbrand.html",
+                "display_url": "tsu.ox.ac.uk/people/cbrand.…"
+              }
+            ]
+          },
+          "description": {
+            "urls": [
+              {
+                "start": 37,
+                "end": 60,
+                "url": "https://t.co/0JetFU9aYd",
+                "expanded_url": "http://ox.ac.uk",
+                "display_url": "ox.ac.uk"
+              }
+            ],
+            "mentions": [
+              { "start": 61, "end": 71, "username": "TSUOxford" },
+              { "start": 76, "end": 86, "username": "ecioxford" }
+            ]
+          }
+        },
+        "profile_image_url": "https://pbs.twimg.com/profile_images/1707022613413818368/YLvCT_0r_normal.jpg",
+        "protected": false,
+        "username": "_chris_brand_",
+        "pinned_tweet_id": "1585556640933232640",
+        "verified": false,
+        "location": "Oxford, UK",
+        "created_at": "2017-06-05T09:18:16.000Z"
+      }
+    ],
+    "tweets": [
+      {
+        "possibly_sensitive": false,
+        "author_id": "871657411525963776",
+        "edit_history_tweet_ids": ["1811769819198267436"],
+        "lang": "en",
+        "created_at": "2024-07-12T14:29:20.000Z",
+        "entities": {
+          "annotations": [
+            {
+              "start": 150,
+              "end": 164,
+              "probability": 0.6361,
+              "type": "Other",
+              "normalized_text": "Hyundai Ioniq 5"
+            },
+            {
+              "start": 223,
+              "end": 229,
+              "probability": 0.7171,
+              "type": "Other",
+              "normalized_text": "VW Polo"
+            }
+          ],
+          "hashtags": [{ "start": 232, "end": 241, "tag": "mobesity" }],
+          "urls": [
+            {
+              "start": 245,
+              "end": 268,
+              "url": "https://t.co/KiifOc4ZGS",
+              "expanded_url": "https://roaddamagecalculator.com/",
+              "display_url": "roaddamagecalculator.com",
+              "status": 200,
+              "title": "Road Damage",
+              "description": "Ever wondered how much more damage one vehicle does to the road than",
+              "unwound_url": "https://roaddamagecalculator.com/"
+            },
+            {
+              "start": 269,
+              "end": 292,
+              "url": "https://t.co/uV0j73ujer",
+              "expanded_url": "https://twitter.com/_chris_brand_/status/1811769819198267436/photo/1",
+              "display_url": "pic.x.com/uv0j73ujer",
+              "media_key": "3_1811768379000107008"
+            }
+          ]
+        },
+        "public_metrics": {
+          "retweet_count": 426,
+          "reply_count": 200,
+          "like_count": 1499,
+          "quote_count": 65,
+          "bookmark_count": 240,
+          "impression_count": 163126
+        },
+        "attachments": { "media_keys": ["3_1811768379000107008"] },
+        "text": "The change in damage to the road surface is proportional to the difference in axle weight to the fourth power\n\nSo, a 2.4-tonne electric SUV such as a Hyundai Ioniq 5 would therefore do 16 times more damage than a 1.2-tonne VW Polo\n\n#mobesity\n\n👉 https://t.co/KiifOc4ZGS https://t.co/uV0j73ujer",
+        "id": "1811769819198267436",
+        "edit_controls": {
+          "edits_remaining": 5,
+          "is_edit_eligible": false,
+          "editable_until": "2024-07-12T15:29:20.000Z"
+        },
+        "context_annotations": [
+          {
+            "domain": {
+              "id": "30",
+              "name": "Entities [Entity Service]",
+              "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain"
+            },
+            "entity": {
+              "id": "781972125179518977",
+              "name": "Auto Manufacturer - Auto"
+            }
+          },
+          {
+            "domain": {
+              "id": "46",
+              "name": "Business Taxonomy",
+              "description": "Categories within Brand Verticals that narrow down the scope of Brands"
+            },
+            "entity": {
+              "id": "1557696420500541440",
+              "name": "Automotive, Aircraft & Boat Business",
+              "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to automobile, spacecraft, aircraft and boat manufacturing"
+            }
+          },
+          {
+            "domain": {
+              "id": "47",
+              "name": "Brand",
+              "description": "Brands and Companies"
+            },
+            "entity": { "id": "10026295039", "name": "Hyundai" }
+          },
+          {
+            "domain": {
+              "id": "47",
+              "name": "Brand",
+              "description": "Brands and Companies"
+            },
+            "entity": { "id": "10026353537", "name": "Volkswagen" }
+          },
+          {
+            "domain": {
+              "id": "131",
+              "name": "Unified Twitter Taxonomy",
+              "description": "A taxonomy of user interests. "
+            },
+            "entity": { "id": "10026295039", "name": "Hyundai" }
+          },
+          {
+            "domain": {
+              "id": "131",
+              "name": "Unified Twitter Taxonomy",
+              "description": "A taxonomy of user interests. "
+            },
+            "entity": { "id": "10026353537", "name": "Volkswagen" }
+          },
+          {
+            "domain": {
+              "id": "131",
+              "name": "Unified Twitter Taxonomy",
+              "description": "A taxonomy of user interests. "
+            },
+            "entity": {
+              "id": "1196845866138533888",
+              "name": "Automobile Brands"
+            }
+          },
+          {
+            "domain": {
+              "id": "30",
+              "name": "Entities [Entity Service]",
+              "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain"
+            },
+            "entity": {
+              "id": "781972125179518977",
+              "name": "Auto Manufacturer - Auto"
+            }
+          },
+          {
+            "domain": {
+              "id": "47",
+              "name": "Brand",
+              "description": "Brands and Companies"
+            },
+            "entity": { "id": "10026295039", "name": "Hyundai" }
+          },
+          {
+            "domain": {
+              "id": "48",
+              "name": "Product",
+              "description": "Products created by Brands.  Examples: Ford Explorer, Apple iPhone."
+            },
+            "entity": { "id": "10044387828", "name": "Hyundai - Ioniq" }
+          },
+          {
+            "domain": {
+              "id": "65",
+              "name": "Interests and Hobbies Vertical",
+              "description": "Top level interests and hobbies groupings, like Food or Travel"
+            },
+            "entity": {
+              "id": "847528391163092993",
+              "name": "Automotive",
+              "description": "Car culture"
+            }
+          },
+          {
+            "domain": {
+              "id": "66",
+              "name": "Interests and Hobbies Category",
+              "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"
+            },
+            "entity": {
+              "id": "847528576551337984",
+              "name": "Hybrid and electric vehicles",
+              "description": "Hybrid and electric vehicles"
+            }
+          },
+          {
+            "domain": {
+              "id": "66",
+              "name": "Interests and Hobbies Category",
+              "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"
+            },
+            "entity": {
+              "id": "847529204530921472",
+              "name": "SUVs",
+              "description": "SUVs"
+            }
+          }
+        ],
+        "conversation_id": "1811769819198267436",
+        "reply_settings": "everyone"
+      }
+    ]
+  }
+}

From 24c646496e8610606a130fbf3562b055886b4a78 Mon Sep 17 00:00:00 2001
From: yu23ki14 <yuki_021423@yahoo.co.jp>
Date: Wed, 2 Oct 2024 09:11:37 +0900
Subject: [PATCH 2/4] add status

---
 common/birdxplorer_common/storage.py | 40 +++++++++++-----------
 etl/src/birdxplorer_etl/extract.py   | 50 +++++++++++-----------------
 etl/src/birdxplorer_etl/transform.py | 42 ++++++++++++++---------
 3 files changed, 67 insertions(+), 65 deletions(-)

diff --git a/common/birdxplorer_common/storage.py b/common/birdxplorer_common/storage.py
index d9612e1..d5acf94 100644
--- a/common/birdxplorer_common/storage.py
+++ b/common/birdxplorer_common/storage.py
@@ -139,26 +139,26 @@ class RowNoteStatusRecord(Base):
     note_id: Mapped[NoteId] = mapped_column(ForeignKey("row_notes.note_id"), primary_key=True)
     note_author_participant_id: Mapped[ParticipantId] = mapped_column(nullable=False)
     created_at_millis: Mapped[TwitterTimestamp] = mapped_column(nullable=False)
-    timestamp_millis_of_first_non_n_m_r_status: Mapped[TwitterTimestamp] = mapped_column()
-    first_non_n_m_r_status: Mapped[String] = mapped_column()
-    timestamp_millis_of_current_status: Mapped[TwitterTimestamp] = mapped_column()
-    current_status: Mapped[String] = mapped_column()
-    timestamp_millis_of_latest_non_n_m_r_status: Mapped[TwitterTimestamp] = mapped_column()
-    most_recent_non_n_m_r_status: Mapped[String] = mapped_column()
-    timestamp_millis_of_status_lock: Mapped[TwitterTimestamp] = mapped_column()
-    locked_status: Mapped[String] = mapped_column()
-    timestamp_millis_of_retro_lock: Mapped[TwitterTimestamp] = mapped_column()
-    current_core_status: Mapped[String] = mapped_column()
-    current_expansion_status: Mapped[String] = mapped_column()
-    current_group_status: Mapped[String] = mapped_column()
-    current_decided_by: Mapped[String] = mapped_column()
-    current_modeling_group: Mapped[int] = mapped_column()
-    timestamp_millis_of_most_recent_status_change: Mapped[TwitterTimestamp] = mapped_column()
-    timestamp_millis_of_nmr_due_to_min_stable_crh_time: Mapped[TwitterTimestamp] = mapped_column()
-    current_multi_group_status: Mapped[String] = mapped_column()
-    current_modeling_multi_group: Mapped[int] = mapped_column()
-    timestamp_minute_of_final_scoring_output: Mapped[TwitterTimestamp] = mapped_column()
-    timestamp_millis_of_first_nmr_due_to_min_stable_crh_time: Mapped[TwitterTimestamp] = mapped_column()
+    timestamp_millis_of_first_non_n_m_r_status: Mapped[TwitterTimestamp] = mapped_column(nullable=True)
+    first_non_n_m_r_status: Mapped[String] = mapped_column(nullable=True)
+    timestamp_millis_of_current_status: Mapped[TwitterTimestamp] = mapped_column(nullable=True)
+    current_status: Mapped[String] = mapped_column(nullable=True)
+    timestamp_millis_of_latest_non_n_m_r_status: Mapped[TwitterTimestamp] = mapped_column(nullable=True)
+    most_recent_non_n_m_r_status: Mapped[String] = mapped_column(nullable=True)
+    timestamp_millis_of_status_lock: Mapped[TwitterTimestamp] = mapped_column(nullable=True)
+    locked_status: Mapped[String] = mapped_column(nullable=True)
+    timestamp_millis_of_retro_lock: Mapped[TwitterTimestamp] = mapped_column(nullable=True)
+    current_core_status: Mapped[String] = mapped_column(nullable=True)
+    current_expansion_status: Mapped[String] = mapped_column(nullable=True)
+    current_group_status: Mapped[String] = mapped_column(nullable=True)
+    current_decided_by: Mapped[String] = mapped_column(nullable=True)
+    current_modeling_group: Mapped[int] = mapped_column(nullable=True)
+    timestamp_millis_of_most_recent_status_change: Mapped[TwitterTimestamp] = mapped_column(nullable=True)
+    timestamp_millis_of_nmr_due_to_min_stable_crh_time: Mapped[TwitterTimestamp] = mapped_column(nullable=True)
+    current_multi_group_status: Mapped[String] = mapped_column(nullable=True)
+    current_modeling_multi_group: Mapped[int] = mapped_column(nullable=True)
+    timestamp_minute_of_final_scoring_output: Mapped[TwitterTimestamp] = mapped_column(nullable=True)
+    timestamp_millis_of_first_nmr_due_to_min_stable_crh_time: Mapped[TwitterTimestamp] = mapped_column(nullable=True)
 
 
 class RowPostRecord(Base):
diff --git a/etl/src/birdxplorer_etl/extract.py b/etl/src/birdxplorer_etl/extract.py
index cd4bd8b..d47c9be 100644
--- a/etl/src/birdxplorer_etl/extract.py
+++ b/etl/src/birdxplorer_etl/extract.py
@@ -5,13 +5,7 @@
 from prefect import get_run_logger
 from sqlalchemy.orm import Session
 from lib.x.postlookup import lookup
-from birdxplorer_common.storage import (
-    RowNoteRecord,
-    RowPostRecord,
-    RowUserRecord,
-    RowPostEmbedURLRecord,
-    RowNoteStatusRecord,
-)
+from birdxplorer_common.storage import RowNoteRecord, RowPostRecord, RowUserRecord, RowNoteStatusRecord
 import settings
 
 
@@ -36,7 +30,10 @@ def extract_data(db: Session):
             break
 
         dateString = date.strftime("%Y/%m/%d")
-        note_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/notes/notes-00000.tsv"
+        # note_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/notes/notes-00000.tsv"
+        note_url = (
+            "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/notes_sample.tsv"
+        )
         logger.info(note_url)
         res = requests.get(note_url)
 
@@ -47,28 +44,37 @@ def extract_data(db: Session):
             reader.fieldnames = [stringcase.snakecase(field) for field in reader.fieldnames]
 
             rows_to_add = []
-            for row in reader:
+            for index, row in enumerate(reader):
                 if db.query(RowNoteRecord).filter(RowNoteRecord.note_id == row["note_id"]).first():
                     continue
                 rows_to_add.append(RowNoteRecord(**row))
+                if index % 1000 == 0:
+                    db.bulk_save_objects(rows_to_add)
+                    rows_to_add = []
             db.bulk_save_objects(rows_to_add)
 
-            status_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/noteStatusHistory/noteStatusHistory-00000.tsv"
+            # status_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/noteStatusHistory/noteStatusHistory-00000.tsv"
+            status_url = "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/noteStatus_sample.tsv"
             logger.info(status_url)
             res = requests.get(status_url)
 
             if res.status_code == 200:
-                # res.contentをdbのNoteStatusテーブル
                 tsv_data = res.content.decode("utf-8").splitlines()
                 reader = csv.DictReader(tsv_data, delimiter="\t")
                 reader.fieldnames = [stringcase.snakecase(field) for field in reader.fieldnames]
 
                 rows_to_add = []
-                for row in reader:
+                for index, row in enumerate(reader):
+                    for key, value in list(row.items()):
+                        if value == "":
+                            row[key] = None
                     status = db.query(RowNoteStatusRecord).filter(RowNoteStatusRecord.note_id == row["note_id"]).first()
-                    if status is None or status.created_at_millis > int(datetime.now().time() * 1000):
+                    if status is None or status.created_at_millis > int(datetime.now().timestamp() * 1000):
                         db.query(RowNoteStatusRecord).filter(RowNoteStatusRecord.note_id == row["note_id"]).delete()
                         rows_to_add.append(RowNoteStatusRecord(**row))
+                    if index % 1000 == 0:
+                        db.bulk_save_objects(rows_to_add)
+                        rows_to_add = []
                 db.bulk_save_objects(rows_to_add)
 
                 break
@@ -85,9 +91,7 @@ def extract_data(db: Session):
         .filter(RowNoteRecord.created_at_millis <= settings.TARGET_TWITTER_POST_END_UNIX_MILLISECOND)
         .all()
     )
-
-    logger.info(f"Num of Target Notes: {len(postExtract_targetNotes)}")
-
+    logger.info(len(postExtract_targetNotes))
     for note in postExtract_targetNotes:
         tweet_id = note.tweet_id
 
@@ -100,8 +104,6 @@ def extract_data(db: Session):
         logger.info(tweet_id)
         post = lookup(tweet_id)
 
-        logger.info(post)
-
         if post == None or "data" not in post:
             continue
 
@@ -132,18 +134,6 @@ def extract_data(db: Session):
             )
             db.add(db_user)
 
-        if "entities" in post["data"] and "urls" in post["data"]["entities"]:
-            for url in post["data"]["entities"]["urls"]:
-                if "unwound_url" not in url or url["status"] != 200:
-                    continue
-                db_post_embed_url = RowPostEmbedURLRecord(
-                    post_id=post["data"]["id"],
-                    url=url["url"],
-                    expanded_url=url["expanded_url"],
-                    unwound_url=url["unwound_url"],
-                )
-                db.add(db_post_embed_url)
-
         media_data = (
             post["includes"]["media"][0]
             if "includes" in post and "media" in post["includes"] and len(post["includes"]["media"]) > 0
diff --git a/etl/src/birdxplorer_etl/transform.py b/etl/src/birdxplorer_etl/transform.py
index acefbc2..6de446b 100644
--- a/etl/src/birdxplorer_etl/transform.py
+++ b/etl/src/birdxplorer_etl/transform.py
@@ -1,6 +1,6 @@
 from sqlalchemy import select, func, and_, Integer
 from sqlalchemy.orm import Session
-from birdxplorer_common.storage import RowNoteRecord, RowPostRecord, RowUserRecord
+from birdxplorer_common.storage import RowNoteRecord, RowPostRecord, RowUserRecord, RowNoteStatusRecord
 from birdxplorer_etl.lib.ai_model.ai_model_interface import get_ai_service
 from birdxplorer_etl.settings import (
     TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND,
@@ -23,7 +23,7 @@ def transform_data(db: Session):
         os.remove("./data/transformed/note.csv")
     with open("./data/transformed/note.csv", "a") as file:
         writer = csv.writer(file)
-        writer.writerow(["note_id", "post_id", "summary", "created_at", "language"])
+        writer.writerow(["note_id", "post_id", "summary", "current_status", "created_at", "language"])
 
     offset = 0
     limit = 1000
@@ -49,14 +49,10 @@ def transform_data(db: Session):
                     RowNoteRecord.note_id,
                     RowNoteRecord.row_post_id,
                     RowNoteRecord.summary,
+                    RowNoteStatusRecord.current_status,
                     func.cast(RowNoteRecord.created_at_millis, Integer).label("created_at"),
                 )
-                .filter(
-                    and_(
-                        RowNoteRecord.created_at_millis <= TARGET_NOTE_ESTIMATE_TOPIC_END_UNIX_MILLISECOND,
-                        RowNoteRecord.created_at_millis >= TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND,
-                    )
-                )
+                .join(RowNoteStatusRecord, RowNoteRecord.note_id == RowNoteStatusRecord.note_id)
                 .limit(limit)
                 .offset(offset)
             )
@@ -170,7 +166,7 @@ def transform_data(db: Session):
     return
 
 
-def generate_note_topic():
+def generate_note_topic(db: Session):
     note_csv_file_path = "./data/transformed/note.csv"
     output_csv_file_path = "./data/transformed/note_topic_association.csv"
     ai_service = get_ai_service()
@@ -181,17 +177,33 @@ def generate_note_topic():
         writer = csv.DictWriter(file, fieldnames=fieldnames)
         writer.writeheader()
 
-        with open(note_csv_file_path, newline="", encoding="utf-8") as csvfile:
-            reader = csv.DictReader(csvfile)
-            for index, row in enumerate(reader):
-                note_id = row["note_id"]
-                summary = row["summary"]
+        offset = 0
+        limit = 1000
+
+        num_of_users = db.query(func.count(RowUserRecord.user_id)).scalar()
+
+        while offset < num_of_users:
+            topicEstimationTargetNotes = db.execute(
+                select(RowNoteRecord.note_id, RowNoteRecord.row_post_id, RowNoteRecord.summary)
+                .filter(
+                    and_(
+                        RowNoteRecord.created_at_millis <= TARGET_NOTE_ESTIMATE_TOPIC_END_UNIX_MILLISECOND,
+                        RowNoteRecord.created_at_millis >= TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND,
+                    )
+                )
+                .join(RowNoteStatusRecord, RowNoteRecord.note_id == RowNoteStatusRecord.note_id)
+                .limit(limit)
+                .offset(offset)
+            )
+
+            for index, note in enumerate(topicEstimationTargetNotes):
+                note_id = note.note_id
+                summary = note.summary
                 topics_info = ai_service.detect_topic(note_id, summary)
                 if topics_info:
                     for topic in topics_info.get("topics", []):
                         record = {"note_id": note_id, "topic_id": topic}
                         records.append(record)
-
                 if index % 100 == 0:
                     for record in records:
                         writer.writerow(

From d542a763a335dbb337753ee727597de0d73cf42e Mon Sep 17 00:00:00 2001
From: yu23ki14 <yuki_021423@yahoo.co.jp>
Date: Wed, 2 Oct 2024 10:38:23 +0900
Subject: [PATCH 3/4] add link csv

---
 etl/.env.example                     |  14 +-
 etl/src/birdxplorer_etl/extract.py   |  35 +-
 etl/src/birdxplorer_etl/settings.py  |   2 +
 etl/src/birdxplorer_etl/test.json    | 456 ---------------------------
 etl/src/birdxplorer_etl/transform.py |  65 +++-
 5 files changed, 103 insertions(+), 469 deletions(-)
 delete mode 100644 etl/src/birdxplorer_etl/test.json

diff --git a/etl/.env.example b/etl/.env.example
index 112d707..c8a0054 100644
--- a/etl/.env.example
+++ b/etl/.env.example
@@ -1,6 +1,14 @@
 X_BEARER_TOKEN=
-AI_MODEL=
+
+COMMUNITY_NOTE_DAYS_AGO=3
+
+TARGET_TWITTER_POST_START_UNIX_MILLISECOND=1719851000000
+TARGET_TWITTER_POST_END_UNIX_MILLISECOND=1719891000000
+
+AI_MODEL=openai
 OPENAPI_TOKEN=
 CLAUDE_TOKEN=
-TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND=1720900800000
-TARGET_NOTE_ESTIMATE_TOPIC_END_UNIX_MILLISECOND=1722110400000
\ No newline at end of file
+TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND=1719851000000
+TARGET_NOTE_ESTIMATE_TOPIC_END_UNIX_MILLISECOND=1719891000000
+
+USE_DUMMY_DATA=False
\ No newline at end of file
diff --git a/etl/src/birdxplorer_etl/extract.py b/etl/src/birdxplorer_etl/extract.py
index d47c9be..b29c5de 100644
--- a/etl/src/birdxplorer_etl/extract.py
+++ b/etl/src/birdxplorer_etl/extract.py
@@ -5,7 +5,13 @@
 from prefect import get_run_logger
 from sqlalchemy.orm import Session
 from lib.x.postlookup import lookup
-from birdxplorer_common.storage import RowNoteRecord, RowPostRecord, RowUserRecord, RowNoteStatusRecord
+from birdxplorer_common.storage import (
+    RowNoteRecord,
+    RowPostRecord,
+    RowUserRecord,
+    RowNoteStatusRecord,
+    RowPostEmbedURLRecord,
+)
 import settings
 
 
@@ -30,10 +36,12 @@ def extract_data(db: Session):
             break
 
         dateString = date.strftime("%Y/%m/%d")
-        # note_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/notes/notes-00000.tsv"
-        note_url = (
-            "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/notes_sample.tsv"
-        )
+        note_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/notes/notes-00000.tsv"
+        if settings.USE_DUMMY_DATA:
+            note_url = (
+                "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/notes_sample.tsv"
+            )
+
         logger.info(note_url)
         res = requests.get(note_url)
 
@@ -53,8 +61,10 @@ def extract_data(db: Session):
                     rows_to_add = []
             db.bulk_save_objects(rows_to_add)
 
-            # status_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/noteStatusHistory/noteStatusHistory-00000.tsv"
-            status_url = "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/noteStatus_sample.tsv"
+            status_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/noteStatusHistory/noteStatusHistory-00000.tsv"
+            if settings.USE_DUMMY_DATA:
+                status_url = "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/noteStatus_sample.tsv"
+
             logger.info(status_url)
             res = requests.get(status_url)
 
@@ -155,6 +165,17 @@ def extract_data(db: Session):
             lang=post["data"]["lang"],
         )
         db.add(db_post)
+
+        if "entities" in post["data"] and "urls" in post["data"]["entities"]:
+            for url in post["data"]["entities"]["urls"]:
+                if "unwound_url" in url:
+                    post_url = RowPostEmbedURLRecord(
+                        post_id=post["data"]["id"],
+                        url=url["url"] if url["url"] else None,
+                        expanded_url=url["expanded_url"] if url["expanded_url"] else None,
+                        unwound_url=url["unwound_url"] if url["unwound_url"] else None,
+                    )
+                    db.add(post_url)
         note.row_post_id = tweet_id
         db.commit()
         continue
diff --git a/etl/src/birdxplorer_etl/settings.py b/etl/src/birdxplorer_etl/settings.py
index d6e9f2e..c775af3 100644
--- a/etl/src/birdxplorer_etl/settings.py
+++ b/etl/src/birdxplorer_etl/settings.py
@@ -17,3 +17,5 @@
 CLAUDE_TOKEN = os.getenv("CLAUDE_TOKEN")
 TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND = os.getenv("TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND")
 TARGET_NOTE_ESTIMATE_TOPIC_END_UNIX_MILLISECOND = os.getenv("TARGET_NOTE_ESTIMATE_TOPIC_END_UNIX_MILLISECOND")
+
+USE_DUMMY_DATA = os.getenv("USE_DUMMY_DATA", "False") == "True"
diff --git a/etl/src/birdxplorer_etl/test.json b/etl/src/birdxplorer_etl/test.json
deleted file mode 100644
index 395615b..0000000
--- a/etl/src/birdxplorer_etl/test.json
+++ /dev/null
@@ -1,456 +0,0 @@
-{
-  "data": {
-    "possibly_sensitive": false,
-    "author_id": "871657411525963776",
-    "edit_history_tweet_ids": ["1811769819198267436"],
-    "lang": "en",
-    "created_at": "2024-07-12T14:29:20.000Z",
-    "entities": {
-      "annotations": [
-        {
-          "start": 150,
-          "end": 164,
-          "probability": 0.6361,
-          "type": "Other",
-          "normalized_text": "Hyundai Ioniq 5"
-        },
-        {
-          "start": 223,
-          "end": 229,
-          "probability": 0.7171,
-          "type": "Other",
-          "normalized_text": "VW Polo"
-        }
-      ],
-      "hashtags": [{ "start": 232, "end": 241, "tag": "mobesity" }],
-      "urls": [
-        {
-          "start": 245,
-          "end": 268,
-          "url": "https://t.co/KiifOc4ZGS",
-          "expanded_url": "https://roaddamagecalculator.com/",
-          "display_url": "roaddamagecalculator.com",
-          "status": 200,
-          "title": "Road Damage",
-          "description": "Ever wondered how much more damage one vehicle does to the road than",
-          "unwound_url": "https://roaddamagecalculator.com/"
-        },
-        {
-          "start": 269,
-          "end": 292,
-          "url": "https://t.co/uV0j73ujer",
-          "expanded_url": "https://twitter.com/_chris_brand_/status/1811769819198267436/photo/1",
-          "display_url": "pic.x.com/uv0j73ujer",
-          "media_key": "3_1811768379000107008"
-        }
-      ]
-    },
-    "public_metrics": {
-      "retweet_count": 426,
-      "reply_count": 200,
-      "like_count": 1499,
-      "quote_count": 65,
-      "bookmark_count": 240,
-      "impression_count": 163126
-    },
-    "attachments": { "media_keys": ["3_1811768379000107008"] },
-    "text": "The change in damage to the road surface is proportional to the difference in axle weight to the fourth power\n\nSo, a 2.4-tonne electric SUV such as a Hyundai Ioniq 5 would therefore do 16 times more damage than a 1.2-tonne VW Polo\n\n#mobesity\n\n👉 https://t.co/KiifOc4ZGS https://t.co/uV0j73ujer",
-    "id": "1811769819198267436",
-    "edit_controls": {
-      "edits_remaining": 5,
-      "is_edit_eligible": false,
-      "editable_until": "2024-07-12T15:29:20.000Z"
-    },
-    "context_annotations": [
-      {
-        "domain": {
-          "id": "30",
-          "name": "Entities [Entity Service]",
-          "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain"
-        },
-        "entity": {
-          "id": "781972125179518977",
-          "name": "Auto Manufacturer - Auto"
-        }
-      },
-      {
-        "domain": {
-          "id": "46",
-          "name": "Business Taxonomy",
-          "description": "Categories within Brand Verticals that narrow down the scope of Brands"
-        },
-        "entity": {
-          "id": "1557696420500541440",
-          "name": "Automotive, Aircraft & Boat Business",
-          "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to automobile, spacecraft, aircraft and boat manufacturing"
-        }
-      },
-      {
-        "domain": {
-          "id": "47",
-          "name": "Brand",
-          "description": "Brands and Companies"
-        },
-        "entity": { "id": "10026295039", "name": "Hyundai" }
-      },
-      {
-        "domain": {
-          "id": "47",
-          "name": "Brand",
-          "description": "Brands and Companies"
-        },
-        "entity": { "id": "10026353537", "name": "Volkswagen" }
-      },
-      {
-        "domain": {
-          "id": "131",
-          "name": "Unified Twitter Taxonomy",
-          "description": "A taxonomy of user interests. "
-        },
-        "entity": { "id": "10026295039", "name": "Hyundai" }
-      },
-      {
-        "domain": {
-          "id": "131",
-          "name": "Unified Twitter Taxonomy",
-          "description": "A taxonomy of user interests. "
-        },
-        "entity": { "id": "10026353537", "name": "Volkswagen" }
-      },
-      {
-        "domain": {
-          "id": "131",
-          "name": "Unified Twitter Taxonomy",
-          "description": "A taxonomy of user interests. "
-        },
-        "entity": { "id": "1196845866138533888", "name": "Automobile Brands" }
-      },
-      {
-        "domain": {
-          "id": "30",
-          "name": "Entities [Entity Service]",
-          "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain"
-        },
-        "entity": {
-          "id": "781972125179518977",
-          "name": "Auto Manufacturer - Auto"
-        }
-      },
-      {
-        "domain": {
-          "id": "47",
-          "name": "Brand",
-          "description": "Brands and Companies"
-        },
-        "entity": { "id": "10026295039", "name": "Hyundai" }
-      },
-      {
-        "domain": {
-          "id": "48",
-          "name": "Product",
-          "description": "Products created by Brands.  Examples: Ford Explorer, Apple iPhone."
-        },
-        "entity": { "id": "10044387828", "name": "Hyundai - Ioniq" }
-      },
-      {
-        "domain": {
-          "id": "65",
-          "name": "Interests and Hobbies Vertical",
-          "description": "Top level interests and hobbies groupings, like Food or Travel"
-        },
-        "entity": {
-          "id": "847528391163092993",
-          "name": "Automotive",
-          "description": "Car culture"
-        }
-      },
-      {
-        "domain": {
-          "id": "66",
-          "name": "Interests and Hobbies Category",
-          "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"
-        },
-        "entity": {
-          "id": "847528576551337984",
-          "name": "Hybrid and electric vehicles",
-          "description": "Hybrid and electric vehicles"
-        }
-      },
-      {
-        "domain": {
-          "id": "66",
-          "name": "Interests and Hobbies Category",
-          "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"
-        },
-        "entity": {
-          "id": "847529204530921472",
-          "name": "SUVs",
-          "description": "SUVs"
-        }
-      }
-    ],
-    "conversation_id": "1811769819198267436",
-    "reply_settings": "everyone"
-  },
-  "includes": {
-    "media": [
-      {
-        "height": 495,
-        "width": 607,
-        "url": "https://pbs.twimg.com/media/GSSxvdWWMAAuVk_.png",
-        "media_key": "3_1811768379000107008",
-        "type": "photo"
-      }
-    ],
-    "users": [
-      {
-        "id": "871657411525963776",
-        "verified_type": "none",
-        "url": "https://t.co/sBthi7IAvB",
-        "description": "Scientist. Prof at Oxford University https://t.co/0JetFU9aYd @TSUOxford and @ecioxford. Sustainable and healthy transport. Active travel. Transport and climate policy.",
-        "most_recent_tweet_id": "1829881319222043134",
-        "name": "Prof. Christian Brand",
-        "public_metrics": {
-          "followers_count": 2379,
-          "following_count": 917,
-          "tweet_count": 4733,
-          "listed_count": 37,
-          "like_count": 7760
-        },
-        "entities": {
-          "url": {
-            "urls": [
-              {
-                "start": 0,
-                "end": 23,
-                "url": "https://t.co/sBthi7IAvB",
-                "expanded_url": "http://www.tsu.ox.ac.uk/people/cbrand.html",
-                "display_url": "tsu.ox.ac.uk/people/cbrand.…"
-              }
-            ]
-          },
-          "description": {
-            "urls": [
-              {
-                "start": 37,
-                "end": 60,
-                "url": "https://t.co/0JetFU9aYd",
-                "expanded_url": "http://ox.ac.uk",
-                "display_url": "ox.ac.uk"
-              }
-            ],
-            "mentions": [
-              { "start": 61, "end": 71, "username": "TSUOxford" },
-              { "start": 76, "end": 86, "username": "ecioxford" }
-            ]
-          }
-        },
-        "profile_image_url": "https://pbs.twimg.com/profile_images/1707022613413818368/YLvCT_0r_normal.jpg",
-        "protected": false,
-        "username": "_chris_brand_",
-        "pinned_tweet_id": "1585556640933232640",
-        "verified": false,
-        "location": "Oxford, UK",
-        "created_at": "2017-06-05T09:18:16.000Z"
-      }
-    ],
-    "tweets": [
-      {
-        "possibly_sensitive": false,
-        "author_id": "871657411525963776",
-        "edit_history_tweet_ids": ["1811769819198267436"],
-        "lang": "en",
-        "created_at": "2024-07-12T14:29:20.000Z",
-        "entities": {
-          "annotations": [
-            {
-              "start": 150,
-              "end": 164,
-              "probability": 0.6361,
-              "type": "Other",
-              "normalized_text": "Hyundai Ioniq 5"
-            },
-            {
-              "start": 223,
-              "end": 229,
-              "probability": 0.7171,
-              "type": "Other",
-              "normalized_text": "VW Polo"
-            }
-          ],
-          "hashtags": [{ "start": 232, "end": 241, "tag": "mobesity" }],
-          "urls": [
-            {
-              "start": 245,
-              "end": 268,
-              "url": "https://t.co/KiifOc4ZGS",
-              "expanded_url": "https://roaddamagecalculator.com/",
-              "display_url": "roaddamagecalculator.com",
-              "status": 200,
-              "title": "Road Damage",
-              "description": "Ever wondered how much more damage one vehicle does to the road than",
-              "unwound_url": "https://roaddamagecalculator.com/"
-            },
-            {
-              "start": 269,
-              "end": 292,
-              "url": "https://t.co/uV0j73ujer",
-              "expanded_url": "https://twitter.com/_chris_brand_/status/1811769819198267436/photo/1",
-              "display_url": "pic.x.com/uv0j73ujer",
-              "media_key": "3_1811768379000107008"
-            }
-          ]
-        },
-        "public_metrics": {
-          "retweet_count": 426,
-          "reply_count": 200,
-          "like_count": 1499,
-          "quote_count": 65,
-          "bookmark_count": 240,
-          "impression_count": 163126
-        },
-        "attachments": { "media_keys": ["3_1811768379000107008"] },
-        "text": "The change in damage to the road surface is proportional to the difference in axle weight to the fourth power\n\nSo, a 2.4-tonne electric SUV such as a Hyundai Ioniq 5 would therefore do 16 times more damage than a 1.2-tonne VW Polo\n\n#mobesity\n\n👉 https://t.co/KiifOc4ZGS https://t.co/uV0j73ujer",
-        "id": "1811769819198267436",
-        "edit_controls": {
-          "edits_remaining": 5,
-          "is_edit_eligible": false,
-          "editable_until": "2024-07-12T15:29:20.000Z"
-        },
-        "context_annotations": [
-          {
-            "domain": {
-              "id": "30",
-              "name": "Entities [Entity Service]",
-              "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain"
-            },
-            "entity": {
-              "id": "781972125179518977",
-              "name": "Auto Manufacturer - Auto"
-            }
-          },
-          {
-            "domain": {
-              "id": "46",
-              "name": "Business Taxonomy",
-              "description": "Categories within Brand Verticals that narrow down the scope of Brands"
-            },
-            "entity": {
-              "id": "1557696420500541440",
-              "name": "Automotive, Aircraft & Boat Business",
-              "description": "Brands, companies, advertisers and every non-person handle with the profit intent related to automobile, spacecraft, aircraft and boat manufacturing"
-            }
-          },
-          {
-            "domain": {
-              "id": "47",
-              "name": "Brand",
-              "description": "Brands and Companies"
-            },
-            "entity": { "id": "10026295039", "name": "Hyundai" }
-          },
-          {
-            "domain": {
-              "id": "47",
-              "name": "Brand",
-              "description": "Brands and Companies"
-            },
-            "entity": { "id": "10026353537", "name": "Volkswagen" }
-          },
-          {
-            "domain": {
-              "id": "131",
-              "name": "Unified Twitter Taxonomy",
-              "description": "A taxonomy of user interests. "
-            },
-            "entity": { "id": "10026295039", "name": "Hyundai" }
-          },
-          {
-            "domain": {
-              "id": "131",
-              "name": "Unified Twitter Taxonomy",
-              "description": "A taxonomy of user interests. "
-            },
-            "entity": { "id": "10026353537", "name": "Volkswagen" }
-          },
-          {
-            "domain": {
-              "id": "131",
-              "name": "Unified Twitter Taxonomy",
-              "description": "A taxonomy of user interests. "
-            },
-            "entity": {
-              "id": "1196845866138533888",
-              "name": "Automobile Brands"
-            }
-          },
-          {
-            "domain": {
-              "id": "30",
-              "name": "Entities [Entity Service]",
-              "description": "Entity Service top level domain, every item that is in Entity Service should be in this domain"
-            },
-            "entity": {
-              "id": "781972125179518977",
-              "name": "Auto Manufacturer - Auto"
-            }
-          },
-          {
-            "domain": {
-              "id": "47",
-              "name": "Brand",
-              "description": "Brands and Companies"
-            },
-            "entity": { "id": "10026295039", "name": "Hyundai" }
-          },
-          {
-            "domain": {
-              "id": "48",
-              "name": "Product",
-              "description": "Products created by Brands.  Examples: Ford Explorer, Apple iPhone."
-            },
-            "entity": { "id": "10044387828", "name": "Hyundai - Ioniq" }
-          },
-          {
-            "domain": {
-              "id": "65",
-              "name": "Interests and Hobbies Vertical",
-              "description": "Top level interests and hobbies groupings, like Food or Travel"
-            },
-            "entity": {
-              "id": "847528391163092993",
-              "name": "Automotive",
-              "description": "Car culture"
-            }
-          },
-          {
-            "domain": {
-              "id": "66",
-              "name": "Interests and Hobbies Category",
-              "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"
-            },
-            "entity": {
-              "id": "847528576551337984",
-              "name": "Hybrid and electric vehicles",
-              "description": "Hybrid and electric vehicles"
-            }
-          },
-          {
-            "domain": {
-              "id": "66",
-              "name": "Interests and Hobbies Category",
-              "description": "A grouping of interests and hobbies entities, like Novelty Food or Destinations"
-            },
-            "entity": {
-              "id": "847529204530921472",
-              "name": "SUVs",
-              "description": "SUVs"
-            }
-          }
-        ],
-        "conversation_id": "1811769819198267436",
-        "reply_settings": "everyone"
-      }
-    ]
-  }
-}
diff --git a/etl/src/birdxplorer_etl/transform.py b/etl/src/birdxplorer_etl/transform.py
index 6de446b..bd977f9 100644
--- a/etl/src/birdxplorer_etl/transform.py
+++ b/etl/src/birdxplorer_etl/transform.py
@@ -1,6 +1,12 @@
 from sqlalchemy import select, func, and_, Integer
 from sqlalchemy.orm import Session
-from birdxplorer_common.storage import RowNoteRecord, RowPostRecord, RowUserRecord, RowNoteStatusRecord
+from birdxplorer_common.storage import (
+    RowNoteRecord,
+    RowPostRecord,
+    RowUserRecord,
+    RowNoteStatusRecord,
+    RowPostEmbedURLRecord,
+)
 from birdxplorer_etl.lib.ai_model.ai_model_interface import get_ai_service
 from birdxplorer_etl.settings import (
     TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND,
@@ -9,6 +15,8 @@
 import csv
 import os
 from prefect import get_run_logger
+import uuid
+import random
 
 
 def transform_data(db: Session):
@@ -138,6 +146,10 @@ def transform_data(db: Session):
                 writer.writerow(user)
         offset += limit
 
+    # Transform row post embed link
+    generate_post_link(db)
+
+    # Transform topic seed data and generate topic.csv
     csv_seed_file_path = "./seed/topic_seed.csv"
     output_csv_file_path = "./data/transformed/topic.csv"
     records = []
@@ -161,16 +173,62 @@ def transform_data(db: Session):
         for record in records:
             writer.writerow({"topic_id": record["topic_id"], "label": {k: v for k, v in record["label"].items()}})
 
-    generate_note_topic()
+    generate_note_topic(db)
 
     return
 
 
+def generate_post_link(db: Session):
+    """Write post_link.csv and post_link_association.csv from embed URL rows.
+
+    Each distinct ``unwound_url`` gets one deterministic ``link_id`` (a UUID
+    derived from the URL) and a single row in post_link.csv; every embed URL
+    row gets a (post_id, link_id) row in post_link_association.csv. Existing
+    output files are overwritten.
+    """
+    link_csv_file_path = "./data/transformed/post_link.csv"
+    association_csv_file_path = "./data/transformed/post_link_association.csv"
+
+    offset = 0
+    limit = 1000
+    num_of_links = db.query(func.count(RowPostEmbedURLRecord.post_id)).scalar()
+
+    # A set gives O(1) duplicate checks instead of rescanning a growing list.
+    seen_link_ids = set()
+
+    # Mode "w" truncates stale output; both files stay open for the whole run
+    # instead of being reopened for every row.
+    with open(link_csv_file_path, "w", newline="", encoding="utf-8") as link_file, open(
+        association_csv_file_path, "w", newline="", encoding="utf-8"
+    ) as association_file:
+        link_writer = csv.DictWriter(link_file, fieldnames=["link_id", "url"])
+        link_writer.writeheader()
+        association_writer = csv.DictWriter(association_file, fieldnames=["post_id", "link_id"])
+        association_writer.writeheader()
+
+        # NOTE(review): LIMIT/OFFSET paging without ORDER BY assumes a stable
+        # row order between queries - consider ordering by a unique key.
+        while offset < num_of_links:
+            links = db.query(RowPostEmbedURLRecord).limit(limit).offset(offset)
+
+            for link in links:
+                # A private Random instance yields the same 128 bits as seeding
+                # the module-level generator, without clobbering global state.
+                link_id = uuid.UUID(int=random.Random(link.unwound_url).getrandbits(128))
+                if link_id not in seen_link_ids:
+                    seen_link_ids.add(link_id)
+                    link_writer.writerow({"link_id": link_id, "url": link.unwound_url})
+                association_writer.writerow({"post_id": link.post_id, "link_id": link_id})
+            offset += limit
+
+
 def generate_note_topic(db: Session):
-    note_csv_file_path = "./data/transformed/note.csv"
     output_csv_file_path = "./data/transformed/note_topic_association.csv"
     ai_service = get_ai_service()
 
+    if os.path.exists(output_csv_file_path):
+        os.remove(output_csv_file_path)
+
     records = []
     with open(output_csv_file_path, "w", newline="", encoding="utf-8", buffering=1) as file:
         fieldnames = ["note_id", "topic_id"]
@@ -214,6 +272,7 @@ def generate_note_topic(db: Session):
                         )
                     records = []
                 print(index)
+            offset += limit
 
         for record in records:
             writer.writerow(

From a9f158059b5e9b61db82f8f8a47369a22596a714 Mon Sep 17 00:00:00 2001
From: yu23ki14 <yuki_021423@yahoo.co.jp>
Date: Wed, 2 Oct 2024 10:50:40 +0900
Subject: [PATCH 4/4] try to fix error

---
 common/birdxplorer_common/models.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/common/birdxplorer_common/models.py b/common/birdxplorer_common/models.py
index d066f70..bc5532d 100644
--- a/common/birdxplorer_common/models.py
+++ b/common/birdxplorer_common/models.py
@@ -1,14 +1,25 @@
 from abc import ABC, abstractmethod
 from datetime import datetime, timezone
 from enum import Enum
-from typing import Any, Dict, List, Literal, Optional, Type, TypeAlias, TypeVar, Union
+from typing import (
+    Any,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Set,
+    Type,
+    TypeAlias,
+    TypeVar,
+    Union,
+)
 
 from pydantic import BaseModel as PydanticBaseModel
 from pydantic import ConfigDict, GetCoreSchemaHandler, HttpUrl, TypeAdapter
 from pydantic.alias_generators import to_camel
 from pydantic_core import core_schema
 
-IncEx: TypeAlias = "set[int] | set[str] | dict[int, IncEx] | dict[str, IncEx] | None"
+IncEx: TypeAlias = Union[Set[int], Set[str], Dict[int, Any], Dict[str, Any], None]
 StrT = TypeVar("StrT", bound="BaseString")
 IntT = TypeVar("IntT", bound="BaseInt")
 FloatT = TypeVar("FloatT", bound="BaseFloat")