From 56758a4f4871fefeba75f61643ab4edb2bb4677e Mon Sep 17 00:00:00 2001
From: sushichan044
Date: Thu, 10 Oct 2024 15:12:57 +0900
Subject: [PATCH] =?UTF-8?q?feat:=20X=E3=81=8B=E3=82=89Post=E3=82=92?=
 =?UTF-8?q?=E5=8F=96=E5=BE=97=E3=81=99=E3=82=8B=E9=9A=9B=E3=81=ABMedia?=
 =?UTF-8?q?=E6=83=85=E5=A0=B1=E3=82=92=E4=BF=9D=E5=AD=98=E3=81=99=E3=82=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 common/birdxplorer_common/storage.py | 15 +++++++++++++++
 etl/src/birdxplorer_etl/extract.py   | 26 ++++++++++++++++++++++----
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/common/birdxplorer_common/storage.py b/common/birdxplorer_common/storage.py
index 6eee4af..b087e2b 100644
--- a/common/birdxplorer_common/storage.py
+++ b/common/birdxplorer_common/storage.py
@@ -223,6 +223,21 @@ class RowPostRecord(Base):
     user: Mapped["RowUserRecord"] = relationship("RowUserRecord", back_populates="row_post")
 
 
+class RowPostMediaRecord(Base):
+    """Media attachment belonging to a post in ``row_posts`` (one row per ``media_key``)."""
+
+    __tablename__ = "row_post_media"
+
+    media_key: Mapped[str] = mapped_column(primary_key=True)
+
+    url: Mapped[String] = mapped_column(nullable=False)
+    type: Mapped[MediaType] = mapped_column(nullable=False)
+    width: Mapped[NonNegativeInt] = mapped_column(nullable=False)
+    height: Mapped[NonNegativeInt] = mapped_column(nullable=False)
+
+    post_id: Mapped[PostId] = mapped_column(ForeignKey("row_posts.post_id"), nullable=False)
+
+
 class RowPostEmbedURLRecord(Base):
     __tablename__ = "row_post_embed_urls"
 
diff --git a/etl/src/birdxplorer_etl/extract.py b/etl/src/birdxplorer_etl/extract.py
index b29c5de..b268b9c 100644
--- a/etl/src/birdxplorer_etl/extract.py
+++ b/etl/src/birdxplorer_etl/extract.py
@@ -7,6 +7,7 @@ from lib.x.postlookup import lookup
 
 from birdxplorer_common.storage import (
     RowNoteRecord,
+    RowPostMediaRecord,
     RowPostRecord,
     RowUserRecord,
     RowNoteStatusRecord,
@@ -145,16 +146,19 @@ def extract_data(db: Session):
             db.add(db_user)
 
         media_data = (
-            post["includes"]["media"][0]
+            post["includes"]["media"]
             if "includes" in post and "media" in post["includes"] and len(post["includes"]["media"]) > 0
-            else {}
+            else []
         )
+        # RowPostRecord keeps single-media columns; mirror the first attachment, if any.
+        first_media = media_data[0] if media_data else {}
+
         db_post = RowPostRecord(
             post_id=post["data"]["id"],
             author_id=post["data"]["author_id"],
             text=post["data"]["text"],
-            media_type=media_data.get("type", ""),
-            media_url=media_data.get("url", ""),
+            media_type=first_media.get("type", ""),
+            media_url=first_media.get("url", ""),
             created_at=created_at_millis,
             like_count=post["data"]["public_metrics"]["like_count"],
             repost_count=post["data"]["public_metrics"]["retweet_count"],
@@ -166,6 +170,20 @@ def extract_data(db: Session):
         )
         db.add(db_post)
 
+        media_recs = [
+            RowPostMediaRecord(
+                media_key=m["media_key"],
+                type=m["type"],
+                # "url" can be absent for non-photo media; fall back so the NOT NULL column is set.
+                url=m.get("url") or m.get("preview_image_url", ""),
+                width=m["width"],
+                height=m["height"],
+                post_id=post["data"]["id"],
+            )
+            for m in media_data
+        ]
+        db.add_all(media_recs)
+
         if "entities" in post["data"] and "urls" in post["data"]["entities"]:
             for url in post["data"]["entities"]["urls"]:
                 if "unwound_url" in url: