Skip to content

Commit

Permalink
Merge branch 'main' into fix/mediawiki-extraction-autosuggest-issues
Browse files Browse the repository at this point in the history
  • Loading branch information
polux0 authored May 23, 2024
2 parents fb781e3 + 148b40e commit bd16ced
Show file tree
Hide file tree
Showing 10 changed files with 20 additions and 171 deletions.
8 changes: 0 additions & 8 deletions dags/hivemind_etl_helpers/debugs.py

This file was deleted.

106 changes: 0 additions & 106 deletions dags/hivemind_etl_helpers/gdrive_etl.py

This file was deleted.

10 changes: 3 additions & 7 deletions dags/hivemind_etl_helpers/ingestion_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,6 @@ def run_pipeline(self, docs: list[Document]):
docstore_strategy=DocstoreStrategy.UPSERTS,
)
logging.info("Pipeline created, now inserting documents into pipeline!")
try:
nodes = pipeline.run(documents=docs, show_progress=True)
return nodes
except Exception as e:
logging.error(
f"An error occurred while running the pipeline: {e}", exc_info=True
)

nodes = pipeline.run(documents=docs, show_progress=True)
return nodes
18 changes: 7 additions & 11 deletions dags/hivemind_etl_helpers/mediawiki_etl.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,14 @@ def process_mediawiki_etl(
"""
if page_titles is None:
raise ValueError("The `page_titles` must be given!")
try:
mediawiki_extractor = MediaWikiExtractor(api_url)
documents = mediawiki_extractor.extract(
page_ids=page_titles,
)
except TypeError as exp:
logging.info(f"No documents retrieved from MediaWiki! exp: {exp}")

logging.info(f"Processing community id: {community_id}")
mediawiki_extractor = MediaWikiExtractor(api_url)
documents = mediawiki_extractor.extract(
page_ids=page_titles,
)

ingestion_pipeline = CustomIngestionPipeline(
community_id=community_id, collection_name="mediawiki"
)
try:
ingestion_pipeline.run_pipeline(docs=documents)
except Exception as e:
logging.info(f"Error while trying to run MediaWikiIngestionPipeline! exp: {e}")
ingestion_pipeline.run_pipeline(docs=documents)
15 changes: 4 additions & 11 deletions dags/hivemind_etl_helpers/notion_etl.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,19 +33,12 @@ def process_notion_etl(
raise ValueError(
"At least one of the `database_ids` or `page_ids` must be given!"
)
try:
notion_extractor = NotionExtractor(notion_token=access_token)
documents = notion_extractor.extract(
page_ids=page_ids, database_ids=database_ids
)
except TypeError as exp:
logging.info(f"No documents retrieved from notion! exp: {exp}")

logging.info(f"Processing community id: {community_id}")
notion_extractor = NotionExtractor(notion_token=access_token)
documents = notion_extractor.extract(page_ids=page_ids, database_ids=database_ids)
collection_name = "notion"
ingestion_pipeline = CustomIngestionPipeline(
community_id, collection_name=collection_name
)
try:
ingestion_pipeline.run_pipeline(docs=documents)
except Exception as e:
logging.info(f"Error while trying to run NotionIngestionPipeline! exp: {e}")
ingestion_pipeline.run_pipeline(docs=documents)
1 change: 1 addition & 0 deletions dags/hivemind_etl_helpers/src/db/mediawiki/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,4 @@ def extract_from_pages(self, pages: List[str]) -> List[Document]:
except Exception as e:
print(f"Failed to extract from pages {pages}: {str(e)}")
return []

Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from typing import Any, List, Optional

import wikipedia
from llama_index.core import Document
from llama_index.legacy.readers.base import BasePydanticReader
from llama_index.legacy.schema import Document


class MediaWikiReader(BasePydanticReader):
Expand Down
16 changes: 4 additions & 12 deletions dags/hivemind_etl_helpers/src/db/notion/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,8 @@ def extract_from_database(self, database_id: str) -> List[Document]:
Returns:
List[Document]: A list of Document objects extracted from the specified database.
"""
try:
response = self.notion_page_reader.load_data(database_id=database_id)
return response
except Exception as e:
print(f"Failed to extract from database {database_id}: {str(e)}")
return []
response = self.notion_page_reader.load_data(database_id=database_id)
return response

def extract_from_pages(self, page_ids: List[str]) -> List[Document]:
"""
Expand All @@ -78,9 +74,5 @@ def extract_from_pages(self, page_ids: List[str]) -> List[Document]:
Returns:
List[Document]: A list of Document objects extracted from the specified pages.
"""
try:
response = self.notion_page_reader.load_data(page_ids=page_ids)
return response
except Exception as e:
print(f"Failed to extract from pages {page_ids}: {str(e)}")
return []
response = self.notion_page_reader.load_data(page_ids=page_ids)
return response
12 changes: 0 additions & 12 deletions dags/hivemind_etl_helpers/tests/unit/test_mediawiki_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,3 @@ def test_handle_invalid_page_titles(self):
documents = self.extractor.extract(page_ids=invalid_pages)
self.assertEqual(len(documents), 0)
self.mock_reader.load_data.assert_called_with(pages=invalid_pages)

def test_extract_from_valid_pages_with_exception(self):
"""
Test extracting from valid pages with an exception occurring.
Expecting empty results.
"""
test_pages = ["Python_(programming_language)"]
self.mock_reader.load_data.side_effect = Exception("Mocked exception")

documents = self.extractor.extract(page_ids=test_pages)
self.assertEqual(len(documents), 0)
self.mock_reader.load_data.assert_called_once_with(pages=test_pages)
3 changes: 0 additions & 3 deletions dags/hivemind_mediawiki_etl.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from airflow import DAG
from airflow.decorators import task
from hivemind_etl_helpers.debugs import pring_qdrant_creds
from hivemind_etl_helpers.mediawiki_etl import process_mediawiki_etl
from hivemind_etl_helpers.src.utils.modules import ModulesMediaWiki

Expand All @@ -20,8 +19,6 @@ def get_mediawiki_communities() -> list[dict[str, str | list[str]]]:
"""
Getting all communities having mediawiki from database
"""
# TODO: REMOVE!!!
pring_qdrant_creds()
communities = ModulesMediaWiki().get_learning_platforms()
return communities

Expand Down

0 comments on commit bd16ced

Please sign in to comment.