From fb781e3cdec0d74610ad0fef23e54abad5209193 Mon Sep 17 00:00:00 2001 From: polux0 Date: Thu, 23 May 2024 17:42:41 +0200 Subject: [PATCH 1/3] fix: turned off `auto_suggest` in extractor.py, as it was causing issues with some page_ids; --- dags/hivemind_etl_helpers/src/db/mediawiki/extractor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dags/hivemind_etl_helpers/src/db/mediawiki/extractor.py b/dags/hivemind_etl_helpers/src/db/mediawiki/extractor.py index aa9b88f8..889cef66 100644 --- a/dags/hivemind_etl_helpers/src/db/mediawiki/extractor.py +++ b/dags/hivemind_etl_helpers/src/db/mediawiki/extractor.py @@ -42,7 +42,9 @@ def extract_from_pages(self, pages: List[str]) -> List[Document]: List[Document]: A list of Document objects extracted from the specified pages. """ try: - response = self.wikimedia_reader.load_data(pages=pages) + response = self.wikimedia_reader.load_data(pages=pages, + auto_suggest=False + ) return response except Exception as e: print(f"Failed to extract from pages {pages}: {str(e)}") From b1b650671c1b2ad683e6c28d6425b0c4cc33b50a Mon Sep 17 00:00:00 2001 From: polux0 Date: Fri, 24 May 2024 10:03:27 +0200 Subject: [PATCH 2/3] feat: adapting tests accordingly; --- .../tests/unit/test_mediawiki_extractor.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dags/hivemind_etl_helpers/tests/unit/test_mediawiki_extractor.py b/dags/hivemind_etl_helpers/tests/unit/test_mediawiki_extractor.py index 6badf729..ec0054d7 100644 --- a/dags/hivemind_etl_helpers/tests/unit/test_mediawiki_extractor.py +++ b/dags/hivemind_etl_helpers/tests/unit/test_mediawiki_extractor.py @@ -31,7 +31,8 @@ def test_extract_from_valid_pages(self): test_pages = ["Python_(programming_language)", "OpenAI"] documents = self.extractor.extract(page_ids=test_pages) self.assertEqual(len(documents), len(mock_response)) - self.mock_reader.load_data.assert_called_once_with(pages=test_pages) + self.mock_reader.load_data.assert_called_once_with(pages=test_pages, + auto_suggest=False) def test_extract_no_pages(self): """ @@ -52,4 +53,5 @@ def test_handle_invalid_page_titles(self): documents = self.extractor.extract(page_ids=invalid_pages) self.assertEqual(len(documents), 0) - self.mock_reader.load_data.assert_called_with(pages=invalid_pages) + self.mock_reader.load_data.assert_called_with(pages=invalid_pages, + auto_suggest=False) From b01471f071fd0889442bcbd9f0aa0630fc5fe1f5 Mon Sep 17 00:00:00 2001 From: polux0 Date: Fri, 24 May 2024 10:13:47 +0200 Subject: [PATCH 3/3] fix: linting issues; --- .../src/db/mediawiki/extractor.py | 11 ++--------- .../tests/unit/test_mediawiki_extractor.py | 10 ++++++---- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/dags/hivemind_etl_helpers/src/db/mediawiki/extractor.py b/dags/hivemind_etl_helpers/src/db/mediawiki/extractor.py index 481c77d2..1ed8e9fe 100644 --- a/dags/hivemind_etl_helpers/src/db/mediawiki/extractor.py +++ b/dags/hivemind_etl_helpers/src/db/mediawiki/extractor.py @@ -41,12 +41,5 @@ def extract_from_pages(self, pages: List[str]) -> List[Document]: Returns: List[Document]: A list of Document objects extracted from the specified pages. """ - try: - response = self.wikimedia_reader.load_data(pages=pages, - auto_suggest=False - ) - return response - except Exception as e: - print(f"Failed to extract from pages {pages}: {str(e)}") - return [] - + response = self.wikimedia_reader.load_data(pages=pages, auto_suggest=False) + return response diff --git a/dags/hivemind_etl_helpers/tests/unit/test_mediawiki_extractor.py b/dags/hivemind_etl_helpers/tests/unit/test_mediawiki_extractor.py index ec0054d7..d39e7d1d 100644 --- a/dags/hivemind_etl_helpers/tests/unit/test_mediawiki_extractor.py +++ b/dags/hivemind_etl_helpers/tests/unit/test_mediawiki_extractor.py @@ -31,8 +31,9 @@ def test_extract_from_valid_pages(self): test_pages = ["Python_(programming_language)", "OpenAI"] documents = self.extractor.extract(page_ids=test_pages) self.assertEqual(len(documents), len(mock_response)) - self.mock_reader.load_data.assert_called_once_with(pages=test_pages, - auto_suggest=False) + self.mock_reader.load_data.assert_called_once_with( + pages=test_pages, auto_suggest=False + ) def test_extract_no_pages(self): """ @@ -53,5 +54,6 @@ def test_handle_invalid_page_titles(self): documents = self.extractor.extract(page_ids=invalid_pages) self.assertEqual(len(documents), 0) - self.mock_reader.load_data.assert_called_with(pages=invalid_pages, - auto_suggest=False) + self.mock_reader.load_data.assert_called_with( + pages=invalid_pages, auto_suggest=False + )