From 1569e00beec3ec767e7ff4ef1c74522737be0d1e Mon Sep 17 00:00:00 2001 From: yu23ki14 Date: Tue, 9 Jul 2024 11:49:14 +0900 Subject: [PATCH 01/19] deploy to render --- api/Dockerfile.prd | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 api/Dockerfile.prd diff --git a/api/Dockerfile.prd b/api/Dockerfile.prd new file mode 100644 index 0000000..278c05b --- /dev/null +++ b/api/Dockerfile.prd @@ -0,0 +1,46 @@ +ARG PYTHON_VERSION_CODE=3.10 +ARG ENVIRONMENT="prod" +# ENVIRONMENT: dev or prod, refer to project.optional-dependencies in pyproject.toml + +FROM python:${PYTHON_VERSION_CODE}-bookworm as builder +ARG PYTHON_VERSION_CODE +ARG ENVIRONMENT + +WORKDIR /app +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 + +COPY pyproject.toml README.md ./ +COPY birdxplorer_api/__init__.py ./birdxplorer_api/ + +RUN if [ "${ENVIRONMENT}" = "prod" ]; then \ + apt-get update && apt-get install -y --no-install-recommends \ + postgresql-client-15 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/*; \ + fi + +RUN python -m pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -e ".[${ENVIRONMENT}]" + +FROM python:${PYTHON_VERSION_CODE}-slim-bookworm as runner +ARG PYTHON_VERSION_CODE +ARG ENVIRONMENT + +WORKDIR /app + +RUN if [ "${ENVIRONMENT}" = "prod" ]; then \ + apt-get update && apt-get install -y --no-install-recommends \ + libpq5 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/*; \ + fi + +RUN groupadd -r app && useradd -r -g app app +RUN chown -R app:app /app +USER app + +COPY --from=builder /usr/local/lib/python${PYTHON_VERSION_CODE}/site-packages /usr/local/lib/python${PYTHON_VERSION_CODE}/site-packages +COPY --chown=app:app . ./ + +ENTRYPOINT ["python", "-m", "uvicorn", "birdxplorer_api.main:app", "--host", "0.0.0.0", "--port", "10000"] From f0fc111367882a1a645c800bc8ff6cfb387b2855 Mon Sep 17 00:00:00 2001 From: yu23ki14 Date: Tue, 30 Jul 2024 18:46:20 +0900 Subject: [PATCH 02/19] modify docker file for dev env --- api/Dockerfile.dev | 52 ++++++++++++++++++++++++++++++++++++++++++ compose.yml | 19 ++++++++------- migrate/Dockerfile.dev | 52 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+), 10 deletions(-) create mode 100644 api/Dockerfile.dev create mode 100644 migrate/Dockerfile.dev diff --git a/api/Dockerfile.dev b/api/Dockerfile.dev new file mode 100644 index 0000000..1c439b9 --- /dev/null +++ b/api/Dockerfile.dev @@ -0,0 +1,52 @@ +ARG PYTHON_VERSION_CODE=3.10 +ARG ENVIRONMENT="dev" +# ENVIRONMENT: dev or prod, refer to project.optional-dependencies in pyproject.toml + +FROM python:${PYTHON_VERSION_CODE}-bookworm as builder +ARG PYTHON_VERSION_CODE +ARG ENVIRONMENT + +WORKDIR /app +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 + +COPY api/pyproject.toml api/README.md ./ +COPY api/birdxplorer_api/__init__.py ./birdxplorer_api/ + +RUN if [ "${ENVIRONMENT}" = "prod" ]; then \ + apt-get update && apt-get install -y --no-install-recommends \ + postgresql-client-15 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/*; \ + fi + +RUN python -m pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -e ".[${ENVIRONMENT}]" + +COPY ../common ./common +RUN if [ "${ENVIRONMENT}" = "dev" ]; then \ + pip install -e ./common; \ + fi + +FROM python:${PYTHON_VERSION_CODE}-slim-bookworm as runner +ARG PYTHON_VERSION_CODE +ARG ENVIRONMENT + +WORKDIR /app + +RUN if [ "${ENVIRONMENT}" = "prod" ]; then \ + apt-get update && apt-get install -y --no-install-recommends \ + libpq5 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/*; \ + fi + +RUN groupadd -r app && useradd -r -g app app +RUN chown -R app:app /app +USER app + +COPY --from=builder /usr/local/lib/python${PYTHON_VERSION_CODE}/site-packages /usr/local/lib/python${PYTHON_VERSION_CODE}/site-packages +COPY --chown=app:app api ./ +COPY ../common ./common + +ENTRYPOINT ["python", "-m", "uvicorn", "birdxplorer_api.main:app", "--host", "0.0.0.0"] diff --git a/compose.yml b/compose.yml index 7d36098..df2676f 100644 --- a/compose.yml +++ b/compose.yml @@ -1,4 +1,4 @@ -version: '3.1' +version: "3.1" services: db: @@ -14,7 +14,7 @@ services: timeout: 5s retries: 5 ports: - - '5432:5432' + - "5432:5432" volumes: - postgres_data:/var/lib/postgresql/data app: @@ -24,17 +24,17 @@ services: build: args: - ENVIRONMENT=dev - context: ./api - dockerfile: Dockerfile + context: ./ + dockerfile: ./api/Dockerfile.dev env_file: - .env ports: - - '8000:8000' + - "8000:8000" develop: watch: - action: rebuild - path: ./api - target: /app/api + path: ./ + target: /app migrate: depends_on: db: @@ -42,13 +42,12 @@ services: build: args: - ENVIRONMENT=dev - context: ./migrate - dockerfile: Dockerfile + context: ./ + dockerfile: ./migrate/Dockerfile.dev environment: - WAIT_HOSTS=db:5432 env_file: - .env - volumes: postgres_data: diff --git a/migrate/Dockerfile.dev b/migrate/Dockerfile.dev new file mode 100644 index 0000000..3057101 --- /dev/null +++ b/migrate/Dockerfile.dev @@ -0,0 +1,52 @@ +ARG PYTHON_VERSION_CODE=3.10 +ARG ENVIRONMENT="dev" +# ENVIRONMENT: dev or prod, refer to project.optional-dependencies in pyproject.toml + +FROM python:${PYTHON_VERSION_CODE}-bookworm as builder +ARG PYTHON_VERSION_CODE +ARG ENVIRONMENT + +WORKDIR /app +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 + +COPY migrate/pyproject.toml migrate/README.md ./ +COPY migrate/birdxplorer_migration/__init__.py ./birdxplorer_migration/ + +RUN if [ "${ENVIRONMENT}" = "prod" ]; then \ + apt-get update && apt-get install -y --no-install-recommends \ + postgresql-client-15 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/*; \ + fi + +RUN python -m pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -e ".[${ENVIRONMENT}]" + +COPY ../common ./common +RUN if [ "${ENVIRONMENT}" = "dev" ]; then \ + pip install -e ./common; \ + fi + +FROM python:${PYTHON_VERSION_CODE}-slim-bookworm as runner +ARG PYTHON_VERSION_CODE +ARG ENVIRONMENT + +WORKDIR /app + +RUN if [ "${ENVIRONMENT}" = "prod" ]; then \ + apt-get update && apt-get install -y --no-install-recommends \ + libpq5 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/*; \ + fi + +RUN groupadd -r app && useradd -r -g app app +RUN chown -R app:app /app +USER app + +COPY --from=builder /usr/local/lib/python${PYTHON_VERSION_CODE}/site-packages /usr/local/lib/python${PYTHON_VERSION_CODE}/site-packages +COPY --chown=app:app migrate ./ +COPY ../common ./common + +ENTRYPOINT ["python", "birdxplorer_migration/scripts/migrate_all.py", "birdxplorer_migration/data/appv1/"] From fd5c1e912e24eeb9756a60ede292dc0d0171a80d Mon Sep 17 00:00:00 2001 From: yu23ki14 Date: Mon, 5 Aug 2024 11:10:24 +0900 Subject: [PATCH 03/19] get env for etl setting --- etl/src/birdxplorer_etl/settings.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/etl/src/birdxplorer_etl/settings.py b/etl/src/birdxplorer_etl/settings.py index 30a4e6e..7a12c70 100644 --- a/etl/src/birdxplorer_etl/settings.py +++ b/etl/src/birdxplorer_etl/settings.py @@ -3,10 +3,12 @@ load_dotenv() -TARGET_TWITTER_POST_START_UNIX_MILLISECOND = 1717729500000 -TARGET_TWITTER_POST_END_UNIX_MILLISECOND = 1717729610000 +TARGET_TWITTER_POST_START_UNIX_MILLISECOND = int( + os.getenv("TARGET_TWITTER_POST_START_UNIX_MILLISECOND", "1717729500000") +) +TARGET_TWITTER_POST_END_UNIX_MILLISECOND = int(os.getenv("TARGET_TWITTER_POST_END_UNIX_MILLISECOND", "1717729610000")) # Extractで何日前のデータを最新と定義するか。開発中は3日前が楽。 -COMMUNITY_NOTE_DAYS_AGO = 3 +COMMUNITY_NOTE_DAYS_AGO = int(os.getenv("COMMUNITY_NOTE_DAYS_AGO", "3")) X_BEARER_TOKEN = os.getenv("X_BEARER_TOKEN") From dd8fa2fbbd675574488d1dbe5da46e856958846b Mon Sep 17 00:00:00 2001 From: sushi-chaaaan Date: Mon, 5 Aug 2024 15:23:00 +0900 Subject: [PATCH 04/19] add support for retrieving posts by note IDs --- api/birdxplorer_api/routers/data.py | 3 +++ common/birdxplorer_common/storage.py | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/api/birdxplorer_api/routers/data.py b/api/birdxplorer_api/routers/data.py index 2398df8..7bdd935 100644 --- a/api/birdxplorer_api/routers/data.py +++ b/api/birdxplorer_api/routers/data.py @@ -92,11 +92,14 @@ def get_notes( @router.get("/posts", response_model=PostListResponse) def get_posts( post_id: Union[List[PostId], None] = Query(default=None), + note_id: Union[List[NoteId], None] = Query(default=None), created_at_start: Union[None, TwitterTimestamp, str] = Query(default=None), created_at_end: Union[None, TwitterTimestamp, str] = Query(default=None), ) -> PostListResponse: if post_id is not None: return PostListResponse(data=list(storage.get_posts_by_ids(post_ids=post_id))) + if note_id is not None: + return PostListResponse(data=list(storage.get_posts_by_note_ids(note_ids=note_id))) if created_at_start is not None: if created_at_end is not None: return PostListResponse( diff --git a/common/birdxplorer_common/storage.py b/common/birdxplorer_common/storage.py index cd07b7a..ad77eca 100644 --- a/common/birdxplorer_common/storage.py +++ b/common/birdxplorer_common/storage.py @@ -296,6 +296,16 @@ def get_posts_by_created_at_end(self, end: TwitterTimestamp) -> Generator[PostMo for post_record in sess.query(PostRecord).filter(PostRecord.created_at < end).all(): yield self._post_record_to_model(post_record) + def get_posts_by_note_ids(self, note_ids: List[NoteId]) -> Generator[PostModel, None, None]: + query = ( + select(PostRecord) + .join(NoteRecord, NoteRecord.post_id == PostRecord.post_id) + .where(NoteRecord.note_id.in_(note_ids)) + ) + with Session(self.engine) as sess: + for post_record in sess.execute(query).scalars().all(): + yield self._post_record_to_model(post_record) + def gen_storage(settings: GlobalSettings) -> Storage: engine = create_engine(settings.storage_settings.sqlalchemy_database_url) From 776d0d1b27609245480fa812a3ae5506af83a7f0 Mon Sep 17 00:00:00 2001 From: sushi-chaaaan Date: Mon, 5 Aug 2024 15:16:24 +0900 Subject: [PATCH 05/19] add test --- api/tests/conftest.py | 9 +++++++++ api/tests/routers/test_data.py | 7 +++++++ 2 files changed, 16 insertions(+) diff --git a/api/tests/conftest.py b/api/tests/conftest.py index 4fcc8e0..7e8d128 100644 --- a/api/tests/conftest.py +++ b/api/tests/conftest.py @@ -262,6 +262,15 @@ def _get_posts_by_ids(post_ids: List[PostId]) -> Generator[Post, None, None]: mock.get_posts_by_ids.side_effect = _get_posts_by_ids + def _get_posts_by_note_ids(note_ids: List[NoteId]) -> Generator[Post, None, None]: + for post in post_samples: + for note in note_samples: + if note.note_id in note_ids and post.post_id == note.post_id: + yield post + break + + mock.get_posts_by_note_ids.side_effect = _get_posts_by_note_ids + def _get_posts_by_created_at_range(start: TwitterTimestamp, end: TwitterTimestamp) -> Generator[Post, None, None]: for post in post_samples: if start <= post.created_at < end: diff --git a/api/tests/routers/test_data.py b/api/tests/routers/test_data.py index 8bf271c..59fef49 100644 --- a/api/tests/routers/test_data.py +++ b/api/tests/routers/test_data.py @@ -39,6 +39,13 @@ def test_posts_get_has_post_id_filter(client: TestClient, post_samples: List[Pos } +def test_posts_get_has_note_id_filter(client: TestClient, post_samples: List[Post], note_samples: List[Note]) -> None: + response = client.get(f"/api/v1/data/posts/?noteId={','.join([n.note_id for n in note_samples])}") + assert response.status_code == 200 + res_json = response.json() + assert res_json == {"data": [json.loads(post_samples[0].model_dump_json())]} + + def test_posts_get_has_created_at_filter_start_and_end(client: TestClient, post_samples: List[Post]) -> None: response = client.get("/api/v1/data/posts/?createdAtStart=2006-7-25 00:00:00&createdAtEnd=2006-7-30 23:59:59") assert response.status_code == 200 From 4cce8c8a22d0030bb2cf3be1f8bbef350aa1384d Mon Sep 17 00:00:00 2001 From: sushi-chaaaan Date: Mon, 5 Aug 2024 15:58:49 +0900 Subject: [PATCH 06/19] fix: mypy ignore statement --- common/birdxplorer_common/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/birdxplorer_common/settings.py b/common/birdxplorer_common/settings.py index c561784..923ad98 100644 --- a/common/birdxplorer_common/settings.py +++ b/common/birdxplorer_common/settings.py @@ -18,7 +18,7 @@ class PostgresStorageSettings(BaseSettings): port: int = 5432 database: str = "postgres" - @computed_field # type: ignore[misc] + @computed_field # type: ignore[prop-decorator] @property def sqlalchemy_database_url(self) -> str: return PostgresDsn( From 584510b592be222ddd3cafb9c0b600184d8d7b0b Mon Sep 17 00:00:00 2001 From: sushi-chaaaan Date: Mon, 5 Aug 2024 16:20:53 +0900 Subject: [PATCH 07/19] =?UTF-8?q?refactor:=20TweetId=20=E3=81=AE=E4=BD=BF?= =?UTF-8?q?=E7=94=A8=E7=AE=87=E6=89=80=E3=82=92=20PostId=20=E3=81=AB?= =?UTF-8?q?=E7=BD=AE=E3=81=8D=E6=8F=9B=E3=81=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- api/birdxplorer_api/routers/data.py | 3 +-- api/tests/conftest.py | 3 +-- common/birdxplorer_common/models.py | 10 +++++----- common/birdxplorer_common/storage.py | 13 +++++++------ common/tests/test_storage.py | 7 +++---- 5 files changed, 17 insertions(+), 19 deletions(-) diff --git a/api/birdxplorer_api/routers/data.py b/api/birdxplorer_api/routers/data.py index 2398df8..352ff32 100644 --- a/api/birdxplorer_api/routers/data.py +++ b/api/birdxplorer_api/routers/data.py @@ -14,7 +14,6 @@ PostId, Topic, TopicId, - TweetId, TwitterTimestamp, UserEnrollment, ) @@ -73,7 +72,7 @@ def get_notes( created_at_from: Union[None, TwitterTimestamp] = Query(default=None), created_at_to: Union[None, TwitterTimestamp] = Query(default=None), topic_ids: Union[List[TopicId], None] = Query(default=None), - post_ids: Union[List[TweetId], None] = Query(default=None), + post_ids: Union[List[PostId], None] = Query(default=None), language: Union[LanguageIdentifier, None] = Query(default=None), ) -> NoteListResponse: return NoteListResponse( diff --git a/api/tests/conftest.py b/api/tests/conftest.py index 4fcc8e0..ac586a7 100644 --- a/api/tests/conftest.py +++ b/api/tests/conftest.py @@ -21,7 +21,6 @@ PostId, Topic, TopicId, - TweetId, TwitterTimestamp, UserEnrollment, XUser, @@ -227,7 +226,7 @@ def _get_notes( created_at_from: Union[None, TwitterTimestamp] = None, created_at_to: Union[None, TwitterTimestamp] = None, topic_ids: Union[List[TopicId], None] = None, - post_ids: Union[List[TweetId], None] = None, + post_ids: Union[List[PostId], None] = None, language: Union[LanguageIdentifier, None] = None, ) -> Generator[Note, None, None]: for note in note_samples: diff --git a/common/birdxplorer_common/models.py b/common/birdxplorer_common/models.py index 5d19baa..e6bec14 100644 --- a/common/birdxplorer_common/models.py +++ b/common/birdxplorer_common/models.py @@ -568,6 +568,9 @@ class NotesValidationDifficulty(str, Enum): class TweetId(UpToNineteenDigitsDecimalString): ... +class PostId(UpToNineteenDigitsDecimalString): ... + + class NoteData(BaseModel): """ This is for validating the original data from notes.csv. @@ -576,7 +579,7 @@ class NoteData(BaseModel): note_id: NoteId note_author_participant_id: ParticipantId created_at_millis: TwitterTimestamp - tweet_id: TweetId + tweet_id: PostId believable: NotesBelievable misleading_other: BinaryBool misleading_factual_error: BinaryBool @@ -629,7 +632,7 @@ class SummaryString(NonEmptyTrimmedString): ... class Note(BaseModel): note_id: NoteId - post_id: TweetId + post_id: PostId language: LanguageIdentifier topics: List[Topic] summary: SummaryString @@ -650,9 +653,6 @@ class XUser(BaseModel): following_count: NonNegativeInt -class PostId(UpToNineteenDigitsDecimalString): ... - - MediaDetails: TypeAlias = List[HttpUrl] | None diff --git a/common/birdxplorer_common/storage.py b/common/birdxplorer_common/storage.py index cd07b7a..8f4013b 100644 --- a/common/birdxplorer_common/storage.py +++ b/common/birdxplorer_common/storage.py @@ -40,6 +40,7 @@ class Base(DeclarativeBase): NoteId: String, ParticipantId: String, TweetId: String, + PostId: String, LanguageIdentifier: String, TwitterTimestamp: DECIMAL, SummaryString: String, @@ -65,7 +66,7 @@ class NoteRecord(Base): __tablename__ = "notes" note_id: Mapped[NoteId] = mapped_column(primary_key=True) - post_id: Mapped[TweetId] = mapped_column(nullable=False) + post_id: Mapped[PostId] = mapped_column(nullable=False) topics: Mapped[List[NoteTopicAssociation]] = relationship() language: Mapped[LanguageIdentifier] = mapped_column(nullable=False) summary: Mapped[SummaryString] = mapped_column(nullable=False) @@ -92,7 +93,7 @@ class XUserRecord(Base): class PostRecord(Base): __tablename__ = "posts" - post_id: Mapped[TweetId] = mapped_column(primary_key=True) + post_id: Mapped[PostId] = mapped_column(primary_key=True) user_id: Mapped[UserId] = mapped_column(ForeignKey("x_users.user_id"), nullable=False) user: Mapped[XUserRecord] = relationship() text: Mapped[SummaryString] = mapped_column(nullable=False) @@ -109,7 +110,7 @@ class RowNoteRecord(Base): note_id: Mapped[NoteId] = mapped_column(primary_key=True) note_author_participant_id: Mapped[ParticipantId] = mapped_column(nullable=False) created_at_millis: Mapped[TwitterTimestamp] = mapped_column(nullable=False) - tweet_id: Mapped[TweetId] = mapped_column(nullable=False) + tweet_id: Mapped[PostId] = mapped_column(nullable=False) believable: Mapped[BinaryBool] = mapped_column(nullable=False) misleading_other: Mapped[BinaryBool] = mapped_column(nullable=False) misleading_factual_error: Mapped[BinaryBool] = mapped_column(nullable=False) @@ -129,14 +130,14 @@ class RowNoteRecord(Base): harmful: Mapped[NotesHarmful] = mapped_column(nullable=False) validation_difficulty: Mapped[SummaryString] = mapped_column(nullable=False) summary: Mapped[SummaryString] = mapped_column(nullable=False) - row_post_id: Mapped[TweetId] = mapped_column(ForeignKey("row_posts.post_id"), nullable=True) + row_post_id: Mapped[PostId] = mapped_column(ForeignKey("row_posts.post_id"), nullable=True) row_post: Mapped["RowPostRecord"] = relationship("RowPostRecord", back_populates="row_notes") class RowPostRecord(Base): __tablename__ = "row_posts" - post_id: Mapped[TweetId] = mapped_column(primary_key=True) + post_id: Mapped[PostId] = mapped_column(primary_key=True) author_id: Mapped[UserId] = mapped_column(ForeignKey("row_users.user_id"), nullable=False) text: Mapped[SummaryString] = mapped_column(nullable=False) media_type: Mapped[String] = mapped_column(nullable=True) @@ -224,7 +225,7 @@ def get_notes( created_at_from: Union[None, TwitterTimestamp] = None, created_at_to: Union[None, TwitterTimestamp] = None, topic_ids: Union[List[TopicId], None] = None, - post_ids: Union[List[TweetId], None] = None, + post_ids: Union[List[PostId], None] = None, language: Union[LanguageIdentifier, None] = None, ) -> Generator[NoteModel, None, None]: with Session(self.engine) as sess: diff --git a/common/tests/test_storage.py b/common/tests/test_storage.py index 3b74975..ff898e9 100644 --- a/common/tests/test_storage.py +++ b/common/tests/test_storage.py @@ -10,7 +10,6 @@ PostId, Topic, TopicId, - TweetId, TwitterTimestamp, ) from birdxplorer_common.storage import NoteRecord, PostRecord, Storage, TopicRecord @@ -208,8 +207,8 @@ def test_get_notes_by_post_ids( ) -> None: storage = Storage(engine=engine_for_test) post_ids = [ - TweetId.from_str("2234567890123456781"), - TweetId.from_str("2234567890123456782"), + PostId.from_str("2234567890123456781"), + PostId.from_str("2234567890123456782"), ] expected = [note for note in note_samples if note.post_id in post_ids] actual = list(storage.get_notes(post_ids=post_ids)) @@ -222,7 +221,7 @@ def test_get_notes_by_post_ids_empty( note_records_sample: List[NoteRecord], ) -> None: storage = Storage(engine=engine_for_test) - post_ids: List[TweetId] = [] + post_ids: List[PostId] = [] expected: List[Note] = [] actual = list(storage.get_notes(post_ids=post_ids)) assert expected == actual From 9305c78934c53376fc4ea9bcf5c69b4a4a875e7e Mon Sep 17 00:00:00 2001 From: sushi-chaaaan Date: Mon, 5 Aug 2024 16:21:19 +0900 Subject: [PATCH 08/19] =?UTF-8?q?TweetId=20=E3=82=92=E5=89=8A=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- common/birdxplorer_common/models.py | 3 --- common/birdxplorer_common/storage.py | 2 -- 2 files changed, 5 deletions(-) diff --git a/common/birdxplorer_common/models.py b/common/birdxplorer_common/models.py index e6bec14..3712d35 100644 --- a/common/birdxplorer_common/models.py +++ b/common/birdxplorer_common/models.py @@ -565,9 +565,6 @@ class NotesValidationDifficulty(str, Enum): empty = "" -class TweetId(UpToNineteenDigitsDecimalString): ... - - class PostId(UpToNineteenDigitsDecimalString): ... diff --git a/common/birdxplorer_common/storage.py b/common/birdxplorer_common/storage.py index 8f4013b..0bff610 100644 --- a/common/birdxplorer_common/storage.py +++ b/common/birdxplorer_common/storage.py @@ -16,7 +16,6 @@ from .models import ( TopicId, TopicLabel, - TweetId, TwitterTimestamp, UserEnrollment, UserId, @@ -39,7 +38,6 @@ class Base(DeclarativeBase): TopicLabel: JSON, NoteId: String, ParticipantId: String, - TweetId: String, PostId: String, LanguageIdentifier: String, TwitterTimestamp: DECIMAL, From 3713e348ba52877a5711dc03caaa065f436797ac Mon Sep 17 00:00:00 2001 From: yu23ki14 Date: Tue, 6 Aug 2024 01:12:51 +0900 Subject: [PATCH 09/19] wait a minute --- etl/src/birdxplorer_etl/extract.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/src/birdxplorer_etl/extract.py b/etl/src/birdxplorer_etl/extract.py index 3fa9f63..f39fbc9 100644 --- a/etl/src/birdxplorer_etl/extract.py +++ b/etl/src/birdxplorer_etl/extract.py @@ -117,7 +117,7 @@ def extract_data(db: Session): ) db.add(db_post) note.row_post_id = tweet_id - time.sleep(1) + time.sleep(60) continue db.commit() From ba81dba08f3bd1a86db5fdefa731b982c01d5a8e Mon Sep 17 00:00:00 2001 From: yu23ki14 Date: Tue, 6 Aug 2024 01:46:41 +0900 Subject: [PATCH 10/19] media and user option --- etl/src/birdxplorer_etl/extract.py | 32 ++++++++++++++++++------------ 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/etl/src/birdxplorer_etl/extract.py b/etl/src/birdxplorer_etl/extract.py index f39fbc9..878d4f9 100644 --- a/etl/src/birdxplorer_etl/extract.py +++ b/etl/src/birdxplorer_etl/extract.py @@ -83,28 +83,34 @@ def extract_data(db: Session): is_userExist = db.query(RowUserRecord).filter(RowUserRecord.user_id == post["data"]["author_id"]).first() logger.info(is_userExist) if is_userExist is None: + user_data = ( + post["includes"]["users"][0] + if "includes" in post and "users" in post["includes"] and len(post["includes"]["users"]) > 0 + else {} + ) db_user = RowUserRecord( user_id=post["data"]["author_id"], - name=post["includes"]["users"][0]["name"], - user_name=post["includes"]["users"][0]["username"], - description=post["includes"]["users"][0]["description"], - profile_image_url=post["includes"]["users"][0]["profile_image_url"], - followers_count=post["includes"]["users"][0]["public_metrics"]["followers_count"], - following_count=post["includes"]["users"][0]["public_metrics"]["following_count"], - tweet_count=post["includes"]["users"][0]["public_metrics"]["tweet_count"], - verified=post["includes"]["users"][0]["verified"], - verified_type=post["includes"]["users"][0]["verified_type"], - location=post["includes"]["users"][0]["location"], - url=post["includes"]["users"][0]["url"], + name=user_data.get("name"), + user_name=user_data.get("username"), + description=user_data.get("description"), + profile_image_url=user_data.get("profile_image_url"), + followers_count=user_data.get("public_metrics", {}).get("followers_count"), + following_count=user_data.get("public_metrics", {}).get("following_count"), + tweet_count=user_data.get("public_metrics", {}).get("tweet_count"), + verified=user_data.get("verified"), + verified_type=user_data.get("verified_type"), + location=user_data.get("location"), + url=user_data.get("url", ""), ) db.add(db_user) - media_url = post["includes"]["media"][0]["url"] + media_url = post["includes"]["media"][0]["url"] if "media" in post["includes"] else "" + media_type = post["includes"]["media"][0]["type"] if "media" in post["includes"] else "" db_post = RowPostRecord( post_id=post["data"]["id"], author_id=post["data"]["author_id"], text=post["data"]["text"], - media_type=post["includes"]["media"][0]["type"], + media_type=media_type, media_url=media_url, created_at=created_at_millis, like_count=post["data"]["public_metrics"]["like_count"], From fc702a315266007a6cf7d6fb342a219952fb9319 Mon Sep 17 00:00:00 2001 From: yu23ki14 Date: Tue, 6 Aug 2024 09:40:44 +0900 Subject: [PATCH 11/19] rate limit --- etl/src/birdxplorer_etl/extract.py | 28 +++++++++++++-------- etl/src/birdxplorer_etl/lib/x/postlookup.py | 27 ++++++++++---------- 2 files changed, 32 insertions(+), 23 deletions(-) diff --git a/etl/src/birdxplorer_etl/extract.py b/etl/src/birdxplorer_etl/extract.py index 878d4f9..0e9b6fb 100644 --- a/etl/src/birdxplorer_etl/extract.py +++ b/etl/src/birdxplorer_etl/extract.py @@ -70,13 +70,18 @@ def extract_data(db: Session): for note in postExtract_targetNotes: tweet_id = note.tweet_id - is_tweetExist = db.query(RowPostRecord).filter(RowPostRecord.post_id == tweet_id).first() + is_tweetExist = db.query(RowPostRecord).filter(RowPostRecord.post_id == str(tweet_id)).first() if is_tweetExist is not None: + logger.info(f"tweet_id {tweet_id} is already exist") note.row_post_id = tweet_id continue logger.info(tweet_id) post = lookup(tweet_id) + + if "data" not in post: + continue + created_at = datetime.strptime(post["data"]["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ") created_at_millis = int(created_at.timestamp() * 1000) @@ -97,21 +102,24 @@ def extract_data(db: Session): followers_count=user_data.get("public_metrics", {}).get("followers_count"), following_count=user_data.get("public_metrics", {}).get("following_count"), tweet_count=user_data.get("public_metrics", {}).get("tweet_count"), - verified=user_data.get("verified"), - verified_type=user_data.get("verified_type"), - location=user_data.get("location"), + verified=user_data.get("verified", False), + verified_type=user_data.get("verified_type", ""), + location=user_data.get("location", ""), url=user_data.get("url", ""), ) db.add(db_user) - media_url = post["includes"]["media"][0]["url"] if "media" in post["includes"] else "" - media_type = post["includes"]["media"][0]["type"] if "media" in post["includes"] else "" + media_data = ( + post["includes"]["media"][0] + if "includes" in post and "media" in post["includes"] and len(post["includes"]["media"]) > 0 + else {} + ) db_post = RowPostRecord( post_id=post["data"]["id"], author_id=post["data"]["author_id"], text=post["data"]["text"], - media_type=media_type, - media_url=media_url, + media_type=media_data.get("type", ""), + media_url=media_data.get("url", ""), created_at=created_at_millis, like_count=post["data"]["public_metrics"]["like_count"], repost_count=post["data"]["public_metrics"]["retweet_count"], @@ -123,9 +131,9 @@ def extract_data(db: Session): ) db.add(db_post) note.row_post_id = tweet_id - time.sleep(60) + time.sleep(90) + db.commit() continue - db.commit() # select note from db, get relation tweet and user data note = db.query(RowNoteRecord).filter(RowNoteRecord.tweet_id == "1797617478950170784").first() diff --git a/etl/src/birdxplorer_etl/lib/x/postlookup.py b/etl/src/birdxplorer_etl/lib/x/postlookup.py index f949492..557cb9e 100644 --- a/etl/src/birdxplorer_etl/lib/x/postlookup.py +++ b/etl/src/birdxplorer_etl/lib/x/postlookup.py @@ -3,15 +3,20 @@ import settings from prefect import get_run_logger + def create_url(id): logger = get_run_logger() - expansions = 'expansions=attachments.poll_ids,attachments.media_keys,author_id,edit_history_tweet_ids,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id' + expansions = "expansions=attachments.poll_ids,attachments.media_keys,author_id,edit_history_tweet_ids,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id" tweet_fields = "tweet.fields=attachments,author_id,context_annotations,conversation_id,created_at,edit_controls,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,reply_settings,source,text,withheld" - media_fields = 'media.fields=duration_ms,height,media_key,preview_image_url,type,url,width,public_metrics,alt_text,variants' - place_fields = 'place.fields=contained_within,country,country_code,full_name,geo,id,name,place_type' - user_fields = 'user.fields=created_at,description,entities,id,location,most_recent_tweet_id,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,verified_type,withheld' - - url = "https://api.twitter.com/2/tweets/{}?{}&{}&{}&{}&{}".format(id, tweet_fields, expansions,media_fields,place_fields,user_fields) + media_fields = ( + "media.fields=duration_ms,height,media_key,preview_image_url,type,url,width,public_metrics,alt_text,variants" + ) + place_fields = "place.fields=contained_within,country,country_code,full_name,geo,id,name,place_type" + user_fields = "user.fields=created_at,description,entities,id,location,most_recent_tweet_id,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,verified_type,withheld" + + url = "https://api.twitter.com/2/tweets/{}?{}&{}&{}&{}&{}".format( + id, tweet_fields, expansions, media_fields, place_fields, user_fields + ) logger.info(url) return url @@ -30,18 +35,14 @@ def connect_to_endpoint(url): response = requests.request("GET", url, auth=bearer_oauth) print(response.status_code) if response.status_code != 200: - raise Exception( - "Request returned an error: {} {}".format( - response.status_code, response.text - ) - ) + raise Exception("Request returned an error: {} {}".format(response.status_code, response.text)) return response.json() def lookup(id): url = create_url(id) json_response = connect_to_endpoint(url) - print(json.dumps(json_response, indent=4, sort_keys=True)) return json_response -# https://oauth-playground.glitch.me/?id=findTweetsById¶ms=%28%27query%21%28%27C%27*B%29%7Ebody%21%28%29%7Epath%21%28%29*B%7EFAG%27%7EuserADfile_image_url%2CiNcreated_at%2CconnectK_statuHurlMublic_JtricHuserDtecteNentitieHdescriptK%27%7ECG%2Creferenced_Fs.id-keys-source_F%27%7EOAtype%2Curl%27%29*%7EidL146E37035677698IE43339741184I-%2CattachJnts.O_A.fieldLBE46120540165%27CexpansKLDnaJMroE03237FtweetGauthor_idHs%2CI%2C146JmeKionLs%21%27M%2CpNd%2COJdia%01ONMLKJIHGFEDCBA-*_ \ No newline at end of file + +# https://oauth-playground.glitch.me/?id=findTweetsById¶ms=%28%27query%21%28%27C%27*B%29%7Ebody%21%28%29%7Epath%21%28%29*B%7EFAG%27%7EuserADfile_image_url%2CiNcreated_at%2CconnectK_statuHurlMublic_JtricHuserDtecteNentitieHdescriptK%27%7ECG%2Creferenced_Fs.id-keys-source_F%27%7EOAtype%2Curl%27%29*%7EidL146E37035677698IE43339741184I-%2CattachJnts.O_A.fieldLBE46120540165%27CexpansKLDnaJMroE03237FtweetGauthor_idHs%2CI%2C146JmeKionLs%21%27M%2CpNd%2COJdia%01ONMLKJIHGFEDCBA-*_ From 736c2723b0825eeac1d8223817d56b6bc26d1b27 Mon Sep 17 00:00:00 2001 From: yu23ki14 Date: Wed, 7 Aug 2024 22:38:34 +0900 Subject: [PATCH 12/19] update for rate limit --- etl/src/birdxplorer_etl/extract.py | 11 +------- etl/src/birdxplorer_etl/lib/x/postlookup.py | 28 +++++++++++++++------ 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/etl/src/birdxplorer_etl/extract.py b/etl/src/birdxplorer_etl/extract.py index 0e9b6fb..81abf65 100644 --- a/etl/src/birdxplorer_etl/extract.py +++ b/etl/src/birdxplorer_etl/extract.py @@ -7,7 +7,6 @@ from lib.x.postlookup import lookup from birdxplorer_common.storage import RowNoteRecord, RowPostRecord, RowUserRecord import settings -import time def extract_data(db: Session): @@ -51,13 +50,6 @@ def extract_data(db: Session): db.commit() - # post = lookup() - # created_at = datetime.strptime(post["data"]["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ") - # created_at_millis = int(created_at.timestamp() * 1000) - # db_post = RowPostRecord(post_id=post["data"]["id"], author_id=post["data"]["author_id"], text=post["data"]["text"], created_at=created_at_millis,like_count=post["data"]["public_metrics"]["like_count"],repost_count=post["data"]["public_metrics"]["retweet_count"],bookmark_count=post["data"]["public_metrics"]["bookmark_count"],impression_count=post["data"]["public_metrics"]["impression_count"],quote_count=post["data"]["public_metrics"]["quote_count"],reply_count=post["data"]["public_metrics"]["reply_count"],lang=post["data"]["lang"]) - # db.add(db_post) - # db.commit() - # Noteに紐づくtweetデータを取得 postExtract_targetNotes = ( db.query(RowNoteRecord) @@ -79,7 +71,7 @@ def extract_data(db: Session): logger.info(tweet_id) post = lookup(tweet_id) - if "data" not in post: + if post == None or "data" not in post: continue created_at = datetime.strptime(post["data"]["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ") @@ -131,7 +123,6 @@ def extract_data(db: Session): ) db.add(db_post) note.row_post_id = tweet_id - time.sleep(90) db.commit() continue diff --git a/etl/src/birdxplorer_etl/lib/x/postlookup.py b/etl/src/birdxplorer_etl/lib/x/postlookup.py index 557cb9e..1410ceb 100644 --- a/etl/src/birdxplorer_etl/lib/x/postlookup.py +++ b/etl/src/birdxplorer_etl/lib/x/postlookup.py @@ -1,11 +1,10 @@ import requests -import json import settings from prefect import get_run_logger +import time def create_url(id): - logger = get_run_logger() expansions = "expansions=attachments.poll_ids,attachments.media_keys,author_id,edit_history_tweet_ids,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id" tweet_fields = "tweet.fields=attachments,author_id,context_annotations,conversation_id,created_at,edit_controls,entities,geo,id,in_reply_to_user_id,lang,public_metrics,possibly_sensitive,referenced_tweets,reply_settings,source,text,withheld" media_fields = ( @@ -17,7 +16,6 @@ def create_url(id): url = "https://api.twitter.com/2/tweets/{}?{}&{}&{}&{}&{}".format( id, tweet_fields, expansions, media_fields, place_fields, user_fields ) - logger.info(url) return url @@ -32,17 +30,31 @@ def bearer_oauth(r): def connect_to_endpoint(url): + logger = get_run_logger() response = requests.request("GET", url, auth=bearer_oauth) - print(response.status_code) - if response.status_code != 200: + if response.status_code == 429: + limit = response.headers["x-rate-limit-reset"] + logger.info("Waiting for rate limit reset...") + time.sleep(int(limit) - int(time.time()) + 1) + data = connect_to_endpoint(url) + return data + elif response.status_code != 200: raise Exception("Request returned an error: {} {}".format(response.status_code, response.text)) return response.json() +def check_existence(id): + url = "https://publish.twitter.com/oembed?url=https://x.com/CommunityNotes/status/{}&partner=&hide_thread=false".format( + id + ) + status = requests.get(url).status_code + return status == 200 + + def lookup(id): + isExist = check_existence(id) + if not isExist: + return None url = create_url(id) json_response = connect_to_endpoint(url) return json_response - - -# https://oauth-playground.glitch.me/?id=findTweetsById¶ms=%28%27query%21%28%27C%27*B%29%7Ebody%21%28%29%7Epath%21%28%29*B%7EFAG%27%7EuserADfile_image_url%2CiNcreated_at%2CconnectK_statuHurlMublic_JtricHuserDtecteNentitieHdescriptK%27%7ECG%2Creferenced_Fs.id-keys-source_F%27%7EOAtype%2Curl%27%29*%7EidL146E37035677698IE43339741184I-%2CattachJnts.O_A.fieldLBE46120540165%27CexpansKLDnaJMroE03237FtweetGauthor_idHs%2CI%2C146JmeKionLs%21%27M%2CpNd%2COJdia%01ONMLKJIHGFEDCBA-*_ From 2a0af34b8119b682b5cbdef59298bf83193e8742 Mon Sep 17 00:00:00 2001 From: yu23ki14 Date: Sat, 10 Aug 2024 11:16:28 +0900 Subject: [PATCH 13/19] gunicorn --- api/Dockerfile.prd | 2 +- api/pyproject.toml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/api/Dockerfile.prd b/api/Dockerfile.prd index 278c05b..b802075 100644 --- a/api/Dockerfile.prd +++ b/api/Dockerfile.prd @@ -43,4 +43,4 @@ USER app COPY --from=builder /usr/local/lib/python${PYTHON_VERSION_CODE}/site-packages /usr/local/lib/python${PYTHON_VERSION_CODE}/site-packages COPY --chown=app:app . ./ -ENTRYPOINT ["python", "-m", "uvicorn", "birdxplorer_api.main:app", "--host", "0.0.0.0", "--port", "10000"] +ENTRYPOINT ["python", "-m", "gunicorn", "birdxplorer_api.main:app", "-k", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0", "--port", "10000"] diff --git a/api/pyproject.toml b/api/pyproject.toml index 3bb2650..f600636 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -63,6 +63,7 @@ dev=[ ] prod=[ "psycopg2", + "gunicorn", ] [tool.pytest.ini_options] From 788ceb12007ea1910113b377e9aaf754fa442964 Mon Sep 17 00:00:00 2001 From: yu23ki14 Date: Sat, 10 Aug 2024 11:22:56 +0900 Subject: [PATCH 14/19] gunicorn port --- api/Dockerfile.prd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/Dockerfile.prd b/api/Dockerfile.prd index b802075..97b31bd 100644 --- a/api/Dockerfile.prd +++ b/api/Dockerfile.prd @@ -43,4 +43,4 @@ USER app COPY --from=builder /usr/local/lib/python${PYTHON_VERSION_CODE}/site-packages /usr/local/lib/python${PYTHON_VERSION_CODE}/site-packages COPY --chown=app:app . ./ -ENTRYPOINT ["python", "-m", "gunicorn", "birdxplorer_api.main:app", "-k", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0", "--port", "10000"] +ENTRYPOINT ["python", "-m", "gunicorn", "birdxplorer_api.main:app", "-k", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:10000"] From 7e7923deff9eec9f5b3179d085747eed55a402db Mon Sep 17 00:00:00 2001 From: yu23ki14 Date: Sat, 10 Aug 2024 11:30:43 +0900 Subject: [PATCH 15/19] gunicorn service --- api/Dockerfile.prd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/Dockerfile.prd b/api/Dockerfile.prd index 97b31bd..33890fb 100644 --- a/api/Dockerfile.prd +++ b/api/Dockerfile.prd @@ -43,4 +43,4 @@ USER app COPY --from=builder /usr/local/lib/python${PYTHON_VERSION_CODE}/site-packages /usr/local/lib/python${PYTHON_VERSION_CODE}/site-packages COPY --chown=app:app . ./ -ENTRYPOINT ["python", "-m", "gunicorn", "birdxplorer_api.main:app", "-k", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:10000"] +ENTRYPOINT ["python", "-m", "gunicorn", "birdxplorer_api.main:app", "--bind", "0.0.0.0:10000"] From 7fe79c426fa9787ec5100a7ff4b9237570699253 Mon Sep 17 00:00:00 2001 From: yu23ki14 Date: Sat, 10 Aug 2024 11:39:03 +0900 Subject: [PATCH 16/19] gunicorn service --- api/Dockerfile.prd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/Dockerfile.prd b/api/Dockerfile.prd index 33890fb..e2bf1f3 100644 --- a/api/Dockerfile.prd +++ b/api/Dockerfile.prd @@ -43,4 +43,4 @@ USER app COPY --from=builder /usr/local/lib/python${PYTHON_VERSION_CODE}/site-packages /usr/local/lib/python${PYTHON_VERSION_CODE}/site-packages COPY --chown=app:app . ./ -ENTRYPOINT ["python", "-m", "gunicorn", "birdxplorer_api.main:app", "--bind", "0.0.0.0:10000"] +ENTRYPOINT ["python", "-m", "gunicorn", "birdxplorer_api.main:app", "-k", "uvicorn.workers.UvicornWorker", "-w", "1", "--bind", "0.0.0.0:10000"] From 6b5e0be38a55c0f08737e8233f2df68a36a67415 Mon Sep 17 00:00:00 2001 From: yu23ki14 Date: Sat, 10 Aug 2024 11:43:14 +0900 Subject: [PATCH 17/19] uvicorn --- api/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/pyproject.toml b/api/pyproject.toml index af6f324..7bc5466 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -30,6 +30,7 @@ dependencies = [ "pydantic", "starlette", "python-dotenv", + "uvicorn[standard]", ] [project.urls] @@ -56,7 +57,6 @@ dev=[ "types-python-dateutil", "psycopg2-binary", "factory_boy", - "uvicorn", "polyfactory", "httpx", ] From e97fe4889bc0c360bbac01264a480e4c999f34c0 Mon Sep 17 00:00:00 2001 From: yu23ki14 Date: Sat, 10 Aug 2024 14:09:29 +0900 Subject: [PATCH 18/19] or search for topic --- api/tests/routers/test_data.py | 10 ++++++++++ common/birdxplorer_common/models.py | 9 +++++++++ common/birdxplorer_common/storage.py | 4 ++-- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/api/tests/routers/test_data.py b/api/tests/routers/test_data.py index 59fef49..ddd29ad 100644 --- a/api/tests/routers/test_data.py +++ b/api/tests/routers/test_data.py @@ -131,3 +131,13 @@ def test_notes_get_has_created_at_filter_to(client: TestClient, note_samples: Li assert response.status_code == 200 res_json = response.json() assert res_json == {"data": [json.loads(note_samples[i].model_dump_json()) for i in (0, 1, 2, 3)]} + + +def test_notes_get_has_topic_id_filter(client: TestClient, note_samples: List[Note]) -> None: + correct_notes = [note for note in note_samples if note_samples[0].topics[0] in note.topics] + response = client.get(f"/api/v1/data/notes/?topicIds={note_samples[0].topics[0].topic_id.serialize()}") + assert response.status_code == 200 + res_json = response.json() + assert res_json == { + "data": [json.loads(correct_notes[i].model_dump_json()) for i in range(correct_notes.__len__())] + } diff --git a/common/birdxplorer_common/models.py b/common/birdxplorer_common/models.py index 3712d35..7a1803a 100644 --- a/common/birdxplorer_common/models.py +++ b/common/birdxplorer_common/models.py @@ -608,6 +608,15 @@ class LanguageIdentifier(str, Enum): PT = "pt" DE = "de" FR = "fr" + OTHER = "other" + + @classmethod + def normalize(cls, value: str, default: str = "other") -> str: + try: + cls(value) + return value + except ValueError: + return default class TopicLabelString(NonEmptyTrimmedString): ... diff --git a/common/birdxplorer_common/storage.py b/common/birdxplorer_common/storage.py index 2b70590..5e6c30b 100644 --- a/common/birdxplorer_common/storage.py +++ b/common/birdxplorer_common/storage.py @@ -240,7 +240,7 @@ def get_notes( subq = ( select(NoteTopicAssociation.note_id) .group_by(NoteTopicAssociation.note_id) - .having(func.array_agg(NoteTopicAssociation.topic_id) == topic_ids) + .having(func.bool_or(NoteTopicAssociation.topic_id.in_(topic_ids))) .subquery() ) query = query.join(subq, NoteRecord.note_id == subq.c.note_id) @@ -263,7 +263,7 @@ def get_notes( ) for topic in note_record.topics ], - language=note_record.language, + language=LanguageIdentifier.normalize(note_record.language), summary=note_record.summary, created_at=note_record.created_at, ) From 9906132bc7ee20eaf32fef081878e76701f321ce Mon Sep 17 00:00:00 2001 From: yu23ki14 Date: Sat, 10 Aug 2024 14:22:23 +0900 Subject: [PATCH 19/19] fix test --- common/tests/test_storage.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/tests/test_storage.py b/common/tests/test_storage.py index ff898e9..63bf5bb 100644 --- a/common/tests/test_storage.py +++ b/common/tests/test_storage.py @@ -181,10 +181,11 @@ def test_get_notes_by_topic_ids( topics = note_samples[0].topics topic_ids: List[TopicId] = [TopicId.from_int(0)] expected = sorted( - [note for note in note_samples if note.topics == topics], + [note for note in note_samples if topics[0] in note.topics], key=lambda note: note.note_id, ) actual = sorted(list(storage.get_notes(topic_ids=topic_ids)), key=lambda note: note.note_id) + assert expected == actual