diff --git a/api/Dockerfile.dev b/api/Dockerfile.dev new file mode 100644 index 0000000..1c439b9 --- /dev/null +++ b/api/Dockerfile.dev @@ -0,0 +1,52 @@ +ARG PYTHON_VERSION_CODE=3.10 +ARG ENVIRONMENT="dev" +# ENVIRONMENT: dev or prod, refer to project.optional-dependencies in pyproject.toml + +FROM python:${PYTHON_VERSION_CODE}-bookworm as builder +ARG PYTHON_VERSION_CODE +ARG ENVIRONMENT + +WORKDIR /app +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 + +COPY api/pyproject.toml api/README.md ./ +COPY api/birdxplorer_api/__init__.py ./birdxplorer_api/ + +RUN if [ "${ENVIRONMENT}" = "prod" ]; then \ + apt-get update && apt-get install -y --no-install-recommends \ + postgresql-client-15 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/*; \ + fi + +RUN python -m pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -e ".[${ENVIRONMENT}]" + +COPY ../common ./common +RUN if [ "${ENVIRONMENT}" = "dev" ]; then \ + pip install -e ./common; \ + fi + +FROM python:${PYTHON_VERSION_CODE}-slim-bookworm as runner +ARG PYTHON_VERSION_CODE +ARG ENVIRONMENT + +WORKDIR /app + +RUN if [ "${ENVIRONMENT}" = "prod" ]; then \ + apt-get update && apt-get install -y --no-install-recommends \ + libpq5 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/*; \ + fi + +RUN groupadd -r app && useradd -r -g app app +RUN chown -R app:app /app +USER app + +COPY --from=builder /usr/local/lib/python${PYTHON_VERSION_CODE}/site-packages /usr/local/lib/python${PYTHON_VERSION_CODE}/site-packages +COPY --chown=app:app api ./ +COPY ../common ./common + +ENTRYPOINT ["python", "-m", "uvicorn", "birdxplorer_api.main:app", "--host", "0.0.0.0"] diff --git a/api/Dockerfile.prd b/api/Dockerfile.prd new file mode 100644 index 0000000..e2bf1f3 --- /dev/null +++ b/api/Dockerfile.prd @@ -0,0 +1,46 @@ +ARG PYTHON_VERSION_CODE=3.10 +ARG ENVIRONMENT="prod" +# ENVIRONMENT: dev or prod, refer to project.optional-dependencies in pyproject.toml + +FROM python:${PYTHON_VERSION_CODE}-bookworm as builder +ARG PYTHON_VERSION_CODE +ARG ENVIRONMENT + +WORKDIR /app +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 + +COPY pyproject.toml README.md ./ +COPY birdxplorer_api/__init__.py ./birdxplorer_api/ + +RUN if [ "${ENVIRONMENT}" = "prod" ]; then \ + apt-get update && apt-get install -y --no-install-recommends \ + postgresql-client-15 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/*; \ + fi + +RUN python -m pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -e ".[${ENVIRONMENT}]" + +FROM python:${PYTHON_VERSION_CODE}-slim-bookworm as runner +ARG PYTHON_VERSION_CODE +ARG ENVIRONMENT + +WORKDIR /app + +RUN if [ "${ENVIRONMENT}" = "prod" ]; then \ + apt-get update && apt-get install -y --no-install-recommends \ + libpq5 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/*; \ + fi + +RUN groupadd -r app && useradd -r -g app app +RUN chown -R app:app /app +USER app + +COPY --from=builder /usr/local/lib/python${PYTHON_VERSION_CODE}/site-packages /usr/local/lib/python${PYTHON_VERSION_CODE}/site-packages +COPY --chown=app:app . ./ + +ENTRYPOINT ["python", "-m", "gunicorn", "birdxplorer_api.main:app", "-k", "uvicorn.workers.UvicornWorker", "-w", "1", "--bind", "0.0.0.0:10000"] diff --git a/api/birdxplorer_api/routers/data.py b/api/birdxplorer_api/routers/data.py index 2398df8..59ac30d 100644 --- a/api/birdxplorer_api/routers/data.py +++ b/api/birdxplorer_api/routers/data.py @@ -14,7 +14,6 @@ PostId, Topic, TopicId, - TweetId, TwitterTimestamp, UserEnrollment, ) @@ -73,7 +72,7 @@ def get_notes( created_at_from: Union[None, TwitterTimestamp] = Query(default=None), created_at_to: Union[None, TwitterTimestamp] = Query(default=None), topic_ids: Union[List[TopicId], None] = Query(default=None), - post_ids: Union[List[TweetId], None] = Query(default=None), + post_ids: Union[List[PostId], None] = Query(default=None), language: Union[LanguageIdentifier, None] = Query(default=None), ) -> NoteListResponse: return NoteListResponse( @@ -92,11 +91,14 @@ def get_notes( @router.get("/posts", response_model=PostListResponse) def get_posts( post_id: Union[List[PostId], None] = Query(default=None), + note_id: Union[List[NoteId], None] = Query(default=None), created_at_start: Union[None, TwitterTimestamp, str] = Query(default=None), created_at_end: Union[None, TwitterTimestamp, str] = Query(default=None), ) -> PostListResponse: if post_id is not None: return PostListResponse(data=list(storage.get_posts_by_ids(post_ids=post_id))) + if note_id is not None: + return PostListResponse(data=list(storage.get_posts_by_note_ids(note_ids=note_id))) if created_at_start is not None: if created_at_end is not None: return PostListResponse( diff --git a/api/pyproject.toml b/api/pyproject.toml index 6b76b7c..7bc5466 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -30,6 +30,7 @@ dependencies = [ "pydantic", "starlette", "python-dotenv", + "uvicorn[standard]", ] [project.urls] @@ -56,13 +57,13 @@ dev=[ "types-python-dateutil", "psycopg2-binary", "factory_boy", - "uvicorn", "polyfactory", "httpx", ] prod=[ "birdxplorer_common @ git+https://github.com/codeforjapan/BirdXplorer.git@main#subdirectory=common", "psycopg2", + "gunicorn", ] [tool.pytest.ini_options] diff --git a/api/tests/conftest.py b/api/tests/conftest.py index 4fcc8e0..c6b5bee 100644 --- a/api/tests/conftest.py +++ b/api/tests/conftest.py @@ -21,7 +21,6 @@ PostId, Topic, TopicId, - TweetId, TwitterTimestamp, UserEnrollment, XUser, @@ -227,7 +226,7 @@ def _get_notes( created_at_from: Union[None, TwitterTimestamp] = None, created_at_to: Union[None, TwitterTimestamp] = None, topic_ids: Union[List[TopicId], None] = None, - post_ids: Union[List[TweetId], None] = None, + post_ids: Union[List[PostId], None] = None, language: Union[LanguageIdentifier, None] = None, ) -> Generator[Note, None, None]: for note in note_samples: @@ -262,6 +261,15 @@ def _get_posts_by_ids(post_ids: List[PostId]) -> Generator[Post, None, None]: mock.get_posts_by_ids.side_effect = _get_posts_by_ids + def _get_posts_by_note_ids(note_ids: List[NoteId]) -> Generator[Post, None, None]: + for post in post_samples: + for note in note_samples: + if note.note_id in note_ids and post.post_id == note.post_id: + yield post + break + + mock.get_posts_by_note_ids.side_effect = _get_posts_by_note_ids + def _get_posts_by_created_at_range(start: TwitterTimestamp, end: TwitterTimestamp) -> Generator[Post, None, None]: for post in post_samples: if start <= post.created_at < end: diff --git a/api/tests/routers/test_data.py b/api/tests/routers/test_data.py index 8bf271c..ddd29ad 100644 --- a/api/tests/routers/test_data.py +++ b/api/tests/routers/test_data.py @@ -39,6 +39,13 @@ def test_posts_get_has_post_id_filter(client: TestClient, post_samples: List[Pos } +def test_posts_get_has_note_id_filter(client: TestClient, post_samples: List[Post], note_samples: List[Note]) -> None: + response = client.get(f"/api/v1/data/posts/?noteId={','.join([n.note_id for n in note_samples])}") + assert response.status_code == 200 + res_json = response.json() + assert res_json == {"data": [json.loads(post_samples[0].model_dump_json())]} + + def test_posts_get_has_created_at_filter_start_and_end(client: TestClient, post_samples: List[Post]) -> None: response = client.get("/api/v1/data/posts/?createdAtStart=2006-7-25 00:00:00&createdAtEnd=2006-7-30 23:59:59") assert response.status_code == 200 @@ -124,3 +131,13 @@ def test_notes_get_has_created_at_filter_to(client: TestClient, note_samples: Li assert response.status_code == 200 res_json = response.json() assert res_json == {"data": [json.loads(note_samples[i].model_dump_json()) for i in (0, 1, 2, 3)]} + + +def test_notes_get_has_topic_id_filter(client: TestClient, note_samples: List[Note]) -> None: + correct_notes = [note for note in note_samples if note_samples[0].topics[0] in note.topics] + response = client.get(f"/api/v1/data/notes/?topicIds={note_samples[0].topics[0].topic_id.serialize()}") + assert response.status_code == 200 + res_json = response.json() + assert res_json == { + "data": [json.loads(correct_notes[i].model_dump_json()) for i in range(correct_notes.__len__())] + } diff --git a/common/birdxplorer_common/models.py b/common/birdxplorer_common/models.py index a992edd..eaf1837 100644 --- a/common/birdxplorer_common/models.py +++ b/common/birdxplorer_common/models.py @@ -565,7 +565,7 @@ class NotesValidationDifficulty(str, Enum): empty = "" -class TweetId(UpToNineteenDigitsDecimalString): ... +class PostId(UpToNineteenDigitsDecimalString): ... class NoteData(BaseModel): @@ -576,7 +576,7 @@ class NoteData(BaseModel): note_id: NoteId note_author_participant_id: ParticipantId created_at_millis: TwitterTimestamp - tweet_id: TweetId + tweet_id: PostId believable: NotesBelievable misleading_other: BinaryBool misleading_factual_error: BinaryBool @@ -621,6 +621,15 @@ class LanguageIdentifier(str, Enum): DA = "da" RU = "ru" PL = "pl" + OTHER = "other" + + @classmethod + def normalize(cls, value: str, default: str = "other") -> str: + try: + cls(value) + return value + except ValueError: + return default class TopicLabelString(NonEmptyTrimmedString): ... @@ -642,7 +651,7 @@ class SummaryString(NonEmptyTrimmedString): ... class Note(BaseModel): note_id: NoteId - post_id: TweetId + post_id: PostId language: LanguageIdentifier topics: List[Topic] summary: SummaryString @@ -663,9 +672,6 @@ class XUser(BaseModel): following_count: NonNegativeInt -class PostId(UpToNineteenDigitsDecimalString): ... - - MediaDetails: TypeAlias = List[HttpUrl] | None diff --git a/common/birdxplorer_common/settings.py b/common/birdxplorer_common/settings.py index c561784..923ad98 100644 --- a/common/birdxplorer_common/settings.py +++ b/common/birdxplorer_common/settings.py @@ -18,7 +18,7 @@ class PostgresStorageSettings(BaseSettings): port: int = 5432 database: str = "postgres" - @computed_field # type: ignore[misc] + @computed_field # type: ignore[prop-decorator] @property def sqlalchemy_database_url(self) -> str: return PostgresDsn( diff --git a/common/birdxplorer_common/storage.py b/common/birdxplorer_common/storage.py index cd07b7a..5e6c30b 100644 --- a/common/birdxplorer_common/storage.py +++ b/common/birdxplorer_common/storage.py @@ -16,7 +16,6 @@ from .models import ( TopicId, TopicLabel, - TweetId, TwitterTimestamp, UserEnrollment, UserId, @@ -39,7 +38,7 @@ class Base(DeclarativeBase): TopicLabel: JSON, NoteId: String, ParticipantId: String, - TweetId: String, + PostId: String, LanguageIdentifier: String, TwitterTimestamp: DECIMAL, SummaryString: String, @@ -65,7 +64,7 @@ class NoteRecord(Base): __tablename__ = "notes" note_id: Mapped[NoteId] = mapped_column(primary_key=True) - post_id: Mapped[TweetId] = mapped_column(nullable=False) + post_id: Mapped[PostId] = mapped_column(nullable=False) topics: Mapped[List[NoteTopicAssociation]] = relationship() language: Mapped[LanguageIdentifier] = mapped_column(nullable=False) summary: Mapped[SummaryString] = mapped_column(nullable=False) @@ -92,7 +91,7 @@ class XUserRecord(Base): class PostRecord(Base): __tablename__ = "posts" - post_id: Mapped[TweetId] = mapped_column(primary_key=True) + post_id: Mapped[PostId] = mapped_column(primary_key=True) user_id: Mapped[UserId] = mapped_column(ForeignKey("x_users.user_id"), nullable=False) user: Mapped[XUserRecord] = relationship() text: Mapped[SummaryString] = mapped_column(nullable=False) @@ -109,7 +108,7 @@ class RowNoteRecord(Base): note_id: Mapped[NoteId] = mapped_column(primary_key=True) note_author_participant_id: Mapped[ParticipantId] = mapped_column(nullable=False) created_at_millis: Mapped[TwitterTimestamp] = mapped_column(nullable=False) - tweet_id: Mapped[TweetId] = mapped_column(nullable=False) + tweet_id: Mapped[PostId] = mapped_column(nullable=False) believable: Mapped[BinaryBool] = mapped_column(nullable=False) misleading_other: Mapped[BinaryBool] = mapped_column(nullable=False) misleading_factual_error: Mapped[BinaryBool] = mapped_column(nullable=False) @@ -129,14 +128,14 @@ class RowNoteRecord(Base): harmful: Mapped[NotesHarmful] = mapped_column(nullable=False) validation_difficulty: Mapped[SummaryString] = mapped_column(nullable=False) summary: Mapped[SummaryString] = mapped_column(nullable=False) - row_post_id: Mapped[TweetId] = mapped_column(ForeignKey("row_posts.post_id"), nullable=True) + row_post_id: Mapped[PostId] = mapped_column(ForeignKey("row_posts.post_id"), nullable=True) row_post: Mapped["RowPostRecord"] = relationship("RowPostRecord", back_populates="row_notes") class RowPostRecord(Base): __tablename__ = "row_posts" - post_id: Mapped[TweetId] = mapped_column(primary_key=True) + post_id: Mapped[PostId] = mapped_column(primary_key=True) author_id: Mapped[UserId] = mapped_column(ForeignKey("row_users.user_id"), nullable=False) text: Mapped[SummaryString] = mapped_column(nullable=False) media_type: Mapped[String] = mapped_column(nullable=True) @@ -224,7 +223,7 @@ def get_notes( created_at_from: Union[None, TwitterTimestamp] = None, created_at_to: Union[None, TwitterTimestamp] = None, topic_ids: Union[List[TopicId], None] = None, - post_ids: Union[List[TweetId], None] = None, + post_ids: Union[List[PostId], None] = None, language: Union[LanguageIdentifier, None] = None, ) -> Generator[NoteModel, None, None]: with Session(self.engine) as sess: @@ -241,7 +240,7 @@ def get_notes( subq = ( select(NoteTopicAssociation.note_id) .group_by(NoteTopicAssociation.note_id) - .having(func.array_agg(NoteTopicAssociation.topic_id) == topic_ids) + .having(func.bool_or(NoteTopicAssociation.topic_id.in_(topic_ids))) .subquery() ) query = query.join(subq, NoteRecord.note_id == subq.c.note_id) @@ -264,7 +263,7 @@ def get_notes( ) for topic in note_record.topics ], - language=note_record.language, + language=LanguageIdentifier.normalize(note_record.language), summary=note_record.summary, created_at=note_record.created_at, ) @@ -296,6 +295,16 @@ def get_posts_by_created_at_end(self, end: TwitterTimestamp) -> Generator[PostMo for post_record in sess.query(PostRecord).filter(PostRecord.created_at < end).all(): yield self._post_record_to_model(post_record) + def get_posts_by_note_ids(self, note_ids: List[NoteId]) -> Generator[PostModel, None, None]: + query = ( + select(PostRecord) + .join(NoteRecord, NoteRecord.post_id == PostRecord.post_id) + .where(NoteRecord.note_id.in_(note_ids)) + ) + with Session(self.engine) as sess: + for post_record in sess.execute(query).scalars().all(): + yield self._post_record_to_model(post_record) + def gen_storage(settings: GlobalSettings) -> Storage: engine = create_engine(settings.storage_settings.sqlalchemy_database_url) diff --git a/common/tests/test_storage.py b/common/tests/test_storage.py index 3b74975..63bf5bb 100644 --- a/common/tests/test_storage.py +++ b/common/tests/test_storage.py @@ -10,7 +10,6 @@ PostId, Topic, TopicId, - TweetId, TwitterTimestamp, ) from birdxplorer_common.storage import NoteRecord, PostRecord, Storage, TopicRecord @@ -182,10 +181,11 @@ def test_get_notes_by_topic_ids( topics = note_samples[0].topics topic_ids: List[TopicId] = [TopicId.from_int(0)] expected = sorted( - [note for note in note_samples if note.topics == topics], + [note for note in note_samples if topics[0] in note.topics], key=lambda note: note.note_id, ) actual = sorted(list(storage.get_notes(topic_ids=topic_ids)), key=lambda note: note.note_id) + assert expected == actual @@ -208,8 +208,8 @@ def test_get_notes_by_post_ids( ) -> None: storage = Storage(engine=engine_for_test) post_ids = [ - TweetId.from_str("2234567890123456781"), - TweetId.from_str("2234567890123456782"), + PostId.from_str("2234567890123456781"), + PostId.from_str("2234567890123456782"), ] expected = [note for note in note_samples if note.post_id in post_ids] actual = list(storage.get_notes(post_ids=post_ids)) @@ -222,7 +222,7 @@ def test_get_notes_by_post_ids_empty( note_records_sample: List[NoteRecord], ) -> None: storage = Storage(engine=engine_for_test) - post_ids: List[TweetId] = [] + post_ids: List[PostId] = [] expected: List[Note] = [] actual = list(storage.get_notes(post_ids=post_ids)) assert expected == actual diff --git a/compose.yml b/compose.yml index 7d36098..df2676f 100644 --- a/compose.yml +++ b/compose.yml @@ -1,4 +1,4 @@ -version: '3.1' +version: "3.1" services: db: @@ -14,7 +14,7 @@ services: timeout: 5s retries: 5 ports: - - '5432:5432' + - "5432:5432" volumes: - postgres_data:/var/lib/postgresql/data app: @@ -24,17 +24,17 @@ services: build: args: - ENVIRONMENT=dev - context: ./api - dockerfile: Dockerfile + context: ./ + dockerfile: ./api/Dockerfile.dev env_file: - .env ports: - - '8000:8000' + - "8000:8000" develop: watch: - action: rebuild - path: ./api - target: /app/api + path: ./ + target: /app migrate: depends_on: db: @@ -42,13 +42,12 @@ services: build: args: - ENVIRONMENT=dev - context: ./migrate - dockerfile: Dockerfile + context: ./ + dockerfile: ./migrate/Dockerfile.dev environment: - WAIT_HOSTS=db:5432 env_file: - .env - volumes: postgres_data: diff --git a/etl/src/birdxplorer_etl/extract.py b/etl/src/birdxplorer_etl/extract.py index 3fa9f63..81abf65 100644 --- a/etl/src/birdxplorer_etl/extract.py +++ b/etl/src/birdxplorer_etl/extract.py @@ -7,7 +7,6 @@ from lib.x.postlookup import lookup from birdxplorer_common.storage import RowNoteRecord, RowPostRecord, RowUserRecord import settings -import time def extract_data(db: Session): @@ -51,13 +50,6 @@ def extract_data(db: Session): db.commit() - # post = lookup() - # created_at = datetime.strptime(post["data"]["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ") - # created_at_millis = int(created_at.timestamp() * 1000) - # db_post = RowPostRecord(post_id=post["data"]["id"], author_id=post["data"]["author_id"], text=post["data"]["text"], created_at=created_at_millis,like_count=post["data"]["public_metrics"]["like_count"],repost_count=post["data"]["public_metrics"]["retweet_count"],bookmark_count=post["data"]["public_metrics"]["bookmark_count"],impression_count=post["data"]["public_metrics"]["impression_count"],quote_count=post["data"]["public_metrics"]["quote_count"],reply_count=post["data"]["public_metrics"]["reply_count"],lang=post["data"]["lang"]) - # db.add(db_post) - # db.commit() - # Noteに紐づくtweetデータを取得 postExtract_targetNotes = ( db.query(RowNoteRecord) @@ -70,42 +62,56 @@ def extract_data(db: Session): for note in postExtract_targetNotes: tweet_id = note.tweet_id - is_tweetExist = db.query(RowPostRecord).filter(RowPostRecord.post_id == tweet_id).first() + is_tweetExist = db.query(RowPostRecord).filter(RowPostRecord.post_id == str(tweet_id)).first() if is_tweetExist is not None: + logger.info(f"tweet_id {tweet_id} is already exist") note.row_post_id = tweet_id continue logger.info(tweet_id) post = lookup(tweet_id) + + if post == None or "data" not in post: + continue + created_at = datetime.strptime(post["data"]["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ") created_at_millis = int(created_at.timestamp() * 1000) is_userExist = db.query(RowUserRecord).filter(RowUserRecord.user_id == post["data"]["author_id"]).first() logger.info(is_userExist) if is_userExist is None: + user_data = ( + post["includes"]["users"][0] + if "includes" in post and "users" in post["includes"] and len(post["includes"]["users"]) > 0 + else {} + ) db_user = RowUserRecord( user_id=post["data"]["author_id"], - name=post["includes"]["users"][0]["name"], - user_name=post["includes"]["users"][0]["username"], - description=post["includes"]["users"][0]["description"], - profile_image_url=post["includes"]["users"][0]["profile_image_url"], - followers_count=post["includes"]["users"][0]["public_metrics"]["followers_count"], - following_count=post["includes"]["users"][0]["public_metrics"]["following_count"], - tweet_count=post["includes"]["users"][0]["public_metrics"]["tweet_count"], - verified=post["includes"]["users"][0]["verified"], - verified_type=post["includes"]["users"][0]["verified_type"], - location=post["includes"]["users"][0]["location"], - url=post["includes"]["users"][0]["url"], + name=user_data.get("name"), + user_name=user_data.get("username"), + description=user_data.get("description"), + profile_image_url=user_data.get("profile_image_url"), + followers_count=user_data.get("public_metrics", {}).get("followers_count"), + following_count=user_data.get("public_metrics", {}).get("following_count"), + tweet_count=user_data.get("public_metrics", {}).get("tweet_count"), + verified=user_data.get("verified", False), + verified_type=user_data.get("verified_type", ""), + location=user_data.get("location", ""), + url=user_data.get("url", ""), ) db.add(db_user) - media_url = post["includes"]["media"][0]["url"] + media_data = ( + post["includes"]["media"][0] + if "includes" in post and "media" in post["includes"] and len(post["includes"]["media"]) > 0 + else {} + ) db_post = RowPostRecord( post_id=post["data"]["id"], author_id=post["data"]["author_id"], text=post["data"]["text"], - media_type=post["includes"]["media"][0]["type"], - media_url=media_url, + media_type=media_data.get("type", ""), + media_url=media_data.get("url", ""), created_at=created_at_millis, like_count=post["data"]["public_metrics"]["like_count"], repost_count=post["data"]["public_metrics"]["retweet_count"], @@ -117,9 +123,8 @@ def extract_data(db: Session): ) db.add(db_post) note.row_post_id = tweet_id - time.sleep(1) + db.commit() continue - db.commit() # select note from db, get relation tweet and user data note = db.query(RowNoteRecord).filter(RowNoteRecord.tweet_id == "1797617478950170784").first() diff --git a/etl/src/birdxplorer_etl/lib/x/postlookup.py b/etl/src/birdxplorer_etl/lib/x/postlookup.py index f949492..1410ceb 100644 --- a/etl/src/birdxplorer_etl/lib/x/postlookup.py +++ b/etl/src/birdxplorer_etl/lib/x/postlookup.py @@ -1,18 +1,21 @@ import requests -import json import settings from prefect import get_run_logger +import time + def create_url(id): - logger = get_run_logger() - expansions = 'expansions=attachments.poll_ids,attachments.media_keys,author_id,edit_history_tweet_ids,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id' + expansions = "expansions=attachments.poll_ids,attachments.media_keys,author_id,edit_history_tweet_ids,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id" tweet_fields = "tweet.fields=attachments,author_id,context_annotations,conversation_id,created_at,edit_controls,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,reply_settings,source,text,withheld" - media_fields = 'media.fields=duration_ms,height,media_key,preview_image_url,type,url,width,public_metrics,alt_text,variants' - place_fields = 'place.fields=contained_within,country,country_code,full_name,geo,id,name,place_type' - user_fields = 'user.fields=created_at,description,entities,id,location,most_recent_tweet_id,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,verified_type,withheld' - - url = "https://api.twitter.com/2/tweets/{}?{}&{}&{}&{}&{}".format(id, tweet_fields, expansions,media_fields,place_fields,user_fields) - logger.info(url) + media_fields = ( + "media.fields=duration_ms,height,media_key,preview_image_url,type,url,width,public_metrics,alt_text,variants" + ) + place_fields = "place.fields=contained_within,country,country_code,full_name,geo,id,name,place_type" + user_fields = "user.fields=created_at,description,entities,id,location,most_recent_tweet_id,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,verified_type,withheld" + + url = "https://api.twitter.com/2/tweets/{}?{}&{}&{}&{}&{}".format( + id, tweet_fields, expansions, media_fields, place_fields, user_fields + ) return url @@ -27,21 +30,31 @@ def bearer_oauth(r): def connect_to_endpoint(url): + logger = get_run_logger() response = requests.request("GET", url, auth=bearer_oauth) - print(response.status_code) - if response.status_code != 200: - raise Exception( - "Request returned an error: {} {}".format( - response.status_code, response.text - ) - ) + if response.status_code == 429: + limit = response.headers["x-rate-limit-reset"] + logger.info("Waiting for rate limit reset...") + time.sleep(int(limit) - int(time.time()) + 1) + data = connect_to_endpoint(url) + return data + elif response.status_code != 200: + raise Exception("Request returned an error: {} {}".format(response.status_code, response.text)) return response.json() +def check_existence(id): + url = "https://publish.twitter.com/oembed?url=https://x.com/CommunityNotes/status/{}&partner=&hide_thread=false".format( + id + ) + status = requests.get(url).status_code + return status == 200 + + def lookup(id): + isExist = check_existence(id) + if not isExist: + return None url = create_url(id) json_response = connect_to_endpoint(url) - print(json.dumps(json_response, indent=4, sort_keys=True)) return json_response - -# https://oauth-playground.glitch.me/?id=findTweetsById¶ms=%28%27query%21%28%27C%27*B%29%7Ebody%21%28%29%7Epath%21%28%29*B%7EFAG%27%7EuserADfile_image_url%2CiNcreated_at%2CconnectK_statuHurlMublic_JtricHuserDtecteNentitieHdescriptK%27%7ECG%2Creferenced_Fs.id-keys-source_F%27%7EOAtype%2Curl%27%29*%7EidL146E37035677698IE43339741184I-%2CattachJnts.O_A.fieldLBE46120540165%27CexpansKLDnaJMroE03237FtweetGauthor_idHs%2CI%2C146JmeKionLs%21%27M%2CpNd%2COJdia%01ONMLKJIHGFEDCBA-*_ \ No newline at end of file diff --git a/etl/src/birdxplorer_etl/settings.py b/etl/src/birdxplorer_etl/settings.py index e81325a..d6e9f2e 100644 --- a/etl/src/birdxplorer_etl/settings.py +++ b/etl/src/birdxplorer_etl/settings.py @@ -3,11 +3,13 @@ load_dotenv() -TARGET_TWITTER_POST_START_UNIX_MILLISECOND = 1717729500000 -TARGET_TWITTER_POST_END_UNIX_MILLISECOND = 1717729610000 +TARGET_TWITTER_POST_START_UNIX_MILLISECOND = int( + os.getenv("TARGET_TWITTER_POST_START_UNIX_MILLISECOND", "1717729500000") +) +TARGET_TWITTER_POST_END_UNIX_MILLISECOND = int(os.getenv("TARGET_TWITTER_POST_END_UNIX_MILLISECOND", "1717729610000")) # Extractで何日前のデータを最新と定義するか。開発中は3日前が楽。 -COMMUNITY_NOTE_DAYS_AGO = 3 +COMMUNITY_NOTE_DAYS_AGO = int(os.getenv("COMMUNITY_NOTE_DAYS_AGO", "3")) X_BEARER_TOKEN = os.getenv("X_BEARER_TOKEN") AI_MODEL = os.getenv("AI_MODEL") diff --git a/migrate/Dockerfile.dev b/migrate/Dockerfile.dev new file mode 100644 index 0000000..3057101 --- /dev/null +++ b/migrate/Dockerfile.dev @@ -0,0 +1,52 @@ +ARG PYTHON_VERSION_CODE=3.10 +ARG ENVIRONMENT="dev" +# ENVIRONMENT: dev or prod, refer to project.optional-dependencies in pyproject.toml + +FROM python:${PYTHON_VERSION_CODE}-bookworm as builder +ARG PYTHON_VERSION_CODE +ARG ENVIRONMENT + +WORKDIR /app +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 + +COPY migrate/pyproject.toml migrate/README.md ./ +COPY migrate/birdxplorer_migration/__init__.py ./birdxplorer_migration/ + +RUN if [ "${ENVIRONMENT}" = "prod" ]; then \ + apt-get update && apt-get install -y --no-install-recommends \ + postgresql-client-15 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/*; \ + fi + +RUN python -m pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -e ".[${ENVIRONMENT}]" + +COPY ../common ./common +RUN if [ "${ENVIRONMENT}" = "dev" ]; then \ + pip install -e ./common; \ + fi + +FROM python:${PYTHON_VERSION_CODE}-slim-bookworm as runner +ARG PYTHON_VERSION_CODE +ARG ENVIRONMENT + +WORKDIR /app + +RUN if [ "${ENVIRONMENT}" = "prod" ]; then \ + apt-get update && apt-get install -y --no-install-recommends \ + libpq5 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/*; \ + fi + +RUN groupadd -r app && useradd -r -g app app +RUN chown -R app:app /app +USER app + +COPY --from=builder /usr/local/lib/python${PYTHON_VERSION_CODE}/site-packages /usr/local/lib/python${PYTHON_VERSION_CODE}/site-packages +COPY --chown=app:app migrate ./ +COPY ../common ./common + +ENTRYPOINT ["python", "birdxplorer_migration/scripts/migrate_all.py", "birdxplorer_migration/data/appv1/"]