diff --git a/common/birdxplorer_common/models.py b/common/birdxplorer_common/models.py index 25b1e4b..0e9801f 100644 --- a/common/birdxplorer_common/models.py +++ b/common/birdxplorer_common/models.py @@ -19,7 +19,13 @@ from pydantic import BaseModel as PydanticBaseModel from pydantic import ConfigDict from pydantic import Field as PydanticField -from pydantic import GetCoreSchemaHandler, HttpUrl, TypeAdapter, model_validator, computed_field +from pydantic import ( + GetCoreSchemaHandler, + HttpUrl, + TypeAdapter, + computed_field, + model_validator, +) from pydantic.alias_generators import to_camel from pydantic.main import IncEx from pydantic_core import core_schema diff --git a/common/birdxplorer_common/storage.py b/common/birdxplorer_common/storage.py index 5229dc1..a8da554 100644 --- a/common/birdxplorer_common/storage.py +++ b/common/birdxplorer_common/storage.py @@ -209,8 +209,6 @@ class RowPostRecord(Base): post_id: Mapped[PostId] = mapped_column(primary_key=True) author_id: Mapped[UserId] = mapped_column(ForeignKey("row_users.user_id"), nullable=False) text: Mapped[SummaryString] = mapped_column(nullable=False) - media_type: Mapped[String] = mapped_column(nullable=True) - media_url: Mapped[String] = mapped_column(nullable=True) created_at: Mapped[TwitterTimestamp] = mapped_column(nullable=False) like_count: Mapped[NonNegativeInt] = mapped_column(nullable=False) repost_count: Mapped[NonNegativeInt] = mapped_column(nullable=False) @@ -235,6 +233,7 @@ class RowPostMediaRecord(Base): post_id: Mapped[PostId] = mapped_column(ForeignKey("row_posts.post_id"), nullable=False) + class RowPostEmbedURLRecord(Base): __tablename__ = "row_post_embed_urls" diff --git a/etl/src/birdxplorer_etl/extract.py b/etl/src/birdxplorer_etl/extract.py index b268b9c..8c9675a 100644 --- a/etl/src/birdxplorer_etl/extract.py +++ b/etl/src/birdxplorer_etl/extract.py @@ -148,15 +148,15 @@ def extract_data(db: Session): media_data = ( post["includes"]["media"] if "includes" in post and "media" in post["includes"] and len(post["includes"]["media"]) > 0 - else [{}] + else [] ) + print(media_data) + db_post = RowPostRecord( post_id=post["data"]["id"], author_id=post["data"]["author_id"], text=post["data"]["text"], - media_type=media_data[0].get("type", ""), - media_url=media_data[0].get("url", ""), created_at=created_at_millis, like_count=post["data"]["public_metrics"]["like_count"], repost_count=post["data"]["public_metrics"]["retweet_count"], @@ -172,7 +172,7 @@ def extract_data(db: Session): RowPostMediaRecord( media_key=m["media_key"], type=m["type"], - url=m["url"], + url=m.get("url") or (m["variants"][0]["url"] if "variants" in m and m["variants"] else ""), width=m["width"], height=m["height"], post_id=post["data"]["id"],