add link csv

codeforjapan · Oct 2, 2024 · d542a76
1 parent 24c6464
commit d542a76
Show file tree

Hide file tree

Showing 5 changed files with 103 additions and 469 deletions.
diff --git a/etl/.env.example b/etl/.env.example
@@ -1,6 +1,14 @@
 X_BEARER_TOKEN=
-AI_MODEL=
+
+COMMUNITY_NOTE_DAYS_AGO=3
+
+TARGET_TWITTER_POST_START_UNIX_MILLISECOND=1719851000000
+TARGET_TWITTER_POST_END_UNIX_MILLISECOND=1719891000000
+
+AI_MODEL=openai
 OPENAPI_TOKEN=
 CLAUDE_TOKEN=
-TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND=1720900800000
-TARGET_NOTE_ESTIMATE_TOPIC_END_UNIX_MILLISECOND=1722110400000
+TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND=1719851000000
+TARGET_NOTE_ESTIMATE_TOPIC_END_UNIX_MILLISECOND=1719891000000
+
+USE_DUMMY_DATA=False
diff --git a/etl/src/birdxplorer_etl/extract.py b/etl/src/birdxplorer_etl/extract.py
@@ -5,7 +5,13 @@
 from prefect import get_run_logger
 from sqlalchemy.orm import Session
 from lib.x.postlookup import lookup
-from birdxplorer_common.storage import RowNoteRecord, RowPostRecord, RowUserRecord, RowNoteStatusRecord
+from birdxplorer_common.storage import (
+    RowNoteRecord,
+    RowPostRecord,
+    RowUserRecord,
+    RowNoteStatusRecord,
+    RowPostEmbedURLRecord,
+)
 import settings
 
 
@@ -30,10 +36,12 @@ def extract_data(db: Session):
             break
 
         dateString = date.strftime("%Y/%m/%d")
-        # note_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/notes/notes-00000.tsv"
-        note_url = (
-            "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/notes_sample.tsv"
-        )
+        note_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/notes/notes-00000.tsv"
+        if settings.USE_DUMMY_DATA:
+            note_url = (
+                "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/notes_sample.tsv"
+            )
+
         logger.info(note_url)
         res = requests.get(note_url)
 
@@ -53,8 +61,10 @@ def extract_data(db: Session):
                     rows_to_add = []
             db.bulk_save_objects(rows_to_add)
 
-            # status_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/noteStatusHistory/noteStatusHistory-00000.tsv"
-            status_url = "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/noteStatus_sample.tsv"
+            status_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/noteStatusHistory/noteStatusHistory-00000.tsv"
+            if settings.USE_DUMMY_DATA:
+                status_url = "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/noteStatus_sample.tsv"
+
             logger.info(status_url)
             res = requests.get(status_url)
 
@@ -155,6 +165,17 @@ def extract_data(db: Session):
             lang=post["data"]["lang"],
         )
         db.add(db_post)
+
+        if "entities" in post["data"] and "urls" in post["data"]["entities"]:
+            for url in post["data"]["entities"]["urls"]:
+                if "unwound_url" in url:
+                    post_url = RowPostEmbedURLRecord(
+                        post_id=post["data"]["id"],
+                        url=url["url"] if url["url"] else None,
+                        expanded_url=url["expanded_url"] if url["expanded_url"] else None,
+                        unwound_url=url["unwound_url"] if url["unwound_url"] else None,
+                    )
+                    db.add(post_url)
         note.row_post_id = tweet_id
         db.commit()
         continue

diff --git a/etl/src/birdxplorer_etl/settings.py b/etl/src/birdxplorer_etl/settings.py
@@ -17,3 +17,5 @@
 CLAUDE_TOKEN = os.getenv("CLAUDE_TOKEN")
 TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND = os.getenv("TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND")
 TARGET_NOTE_ESTIMATE_TOPIC_END_UNIX_MILLISECOND = os.getenv("TARGET_NOTE_ESTIMATE_TOPIC_END_UNIX_MILLISECOND")
+
+USE_DUMMY_DATA = os.getenv("USE_DUMMY_DATA", "False") == "True"