Skip to content

Commit

Permalink
add link csv
Browse files Browse the repository at this point in the history
  • Loading branch information
yu23ki14 committed Oct 2, 2024
1 parent 24c6464 commit d542a76
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 469 deletions.
14 changes: 11 additions & 3 deletions etl/.env.example
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
X_BEARER_TOKEN=
AI_MODEL=

COMMUNITY_NOTE_DAYS_AGO=3

TARGET_TWITTER_POST_START_UNIX_MILLISECOND=1719851000000
TARGET_TWITTER_POST_END_UNIX_MILLISECOND=1719891000000

AI_MODEL=openai
OPENAPI_TOKEN=
CLAUDE_TOKEN=
TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND=1720900800000
TARGET_NOTE_ESTIMATE_TOPIC_END_UNIX_MILLISECOND=1722110400000
TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND=1719851000000
TARGET_NOTE_ESTIMATE_TOPIC_END_UNIX_MILLISECOND=1719891000000

USE_DUMMY_DATA=False
35 changes: 28 additions & 7 deletions etl/src/birdxplorer_etl/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,13 @@
from prefect import get_run_logger
from sqlalchemy.orm import Session
from lib.x.postlookup import lookup
from birdxplorer_common.storage import RowNoteRecord, RowPostRecord, RowUserRecord, RowNoteStatusRecord
from birdxplorer_common.storage import (
RowNoteRecord,
RowPostRecord,
RowUserRecord,
RowNoteStatusRecord,
RowPostEmbedURLRecord,
)
import settings


Expand All @@ -30,10 +36,12 @@ def extract_data(db: Session):
break

dateString = date.strftime("%Y/%m/%d")
# note_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/notes/notes-00000.tsv"
note_url = (
"https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/notes_sample.tsv"
)
note_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/notes/notes-00000.tsv"
if settings.USE_DUMMY_DATA:
note_url = (
"https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/notes_sample.tsv"
)

logger.info(note_url)
res = requests.get(note_url)

Expand All @@ -53,8 +61,10 @@ def extract_data(db: Session):
rows_to_add = []
db.bulk_save_objects(rows_to_add)

# status_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/noteStatusHistory/noteStatusHistory-00000.tsv"
status_url = "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/noteStatus_sample.tsv"
status_url = f"https://ton.twimg.com/birdwatch-public-data/{dateString}/noteStatusHistory/noteStatusHistory-00000.tsv"
if settings.USE_DUMMY_DATA:
status_url = "https://raw.githubusercontent.com/codeforjapan/BirdXplorer/refs/heads/main/etl/data/noteStatus_sample.tsv"

logger.info(status_url)
res = requests.get(status_url)

Expand Down Expand Up @@ -155,6 +165,17 @@ def extract_data(db: Session):
lang=post["data"]["lang"],
)
db.add(db_post)

if "entities" in post["data"] and "urls" in post["data"]["entities"]:
for url in post["data"]["entities"]["urls"]:
if "unwound_url" in url:
post_url = RowPostEmbedURLRecord(
post_id=post["data"]["id"],
url=url["url"] if url["url"] else None,
expanded_url=url["expanded_url"] if url["expanded_url"] else None,
unwound_url=url["unwound_url"] if url["unwound_url"] else None,
)
db.add(post_url)
note.row_post_id = tweet_id
db.commit()
continue
Expand Down
2 changes: 2 additions & 0 deletions etl/src/birdxplorer_etl/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,5 @@
# Read from the environment; os.getenv returns None when CLAUDE_TOKEN is unset.
# NOTE(review): presumably the API token for the Claude backend — confirm against the consumer.
CLAUDE_TOKEN = os.getenv("CLAUDE_TOKEN")
# Bounds for note topic estimation. os.getenv yields the raw string (or None when
# unset), so these are NOT ints here; consumers must convert before comparing.
# NOTE(review): names suggest Unix epoch milliseconds — confirm against the consumer.
TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND = os.getenv("TARGET_NOTE_ESTIMATE_TOPIC_START_UNIX_MILLISECOND")
TARGET_NOTE_ESTIMATE_TOPIC_END_UNIX_MILLISECOND = os.getenv("TARGET_NOTE_ESTIMATE_TOPIC_END_UNIX_MILLISECOND")

# Boolean flag: True only when the variable is exactly the string "True"
# (case-sensitive). Unset, "true", "1", or any other value evaluates to False.
USE_DUMMY_DATA = os.getenv("USE_DUMMY_DATA", "False") == "True"
Loading

0 comments on commit d542a76

Please sign in to comment.