Merge pull request #154 from TogetherCrew/feat/mediawiki-ingest-and-vectorize
feat: `WikipediaReader` that accepts `api_url` -> `MediaWikiReader`
Showing 11 changed files with 485 additions and 7 deletions.
@@ -0,0 +1,42 @@
import logging

from dags.hivemind_etl_helpers.ingestion_pipeline import CustomIngestionPipeline
from dags.hivemind_etl_helpers.src.db.mediawiki.extractor import MediaWikiExtractor


def process_mediawiki_etl(
    community_id: str,
    api_url: str,
    page_titles: list[str],
) -> None:
    """
    Process the given MediaWiki pages
    and save the processed data within PostgreSQL

    Parameters
    -----------
    community_id : str
        the community to save the data for
    api_url : str
        the MediaWiki API endpoint to fetch the pages from
    page_titles : list[str]
        the page titles to process

    Note: `page_titles` must be given.
    """
    if page_titles is None:
        raise ValueError("The `page_titles` must be given!")
    try:
        mediawiki_extractor = MediaWikiExtractor(api_url)
        documents = mediawiki_extractor.extract(
            page_ids=page_titles,
        )
    except TypeError as exp:
        logging.info(f"No documents retrieved from MediaWiki! exp: {exp}")
        # nothing was extracted, so there is nothing to ingest
        return

    ingestion_pipeline = CustomIngestionPipeline(
        community_id=community_id, collection_name="mediawiki"
    )
    try:
        ingestion_pipeline.run_pipeline(docs=documents)
    except Exception as e:
        logging.info(f"Error while trying to run MediaWikiIngestionPipeline! exp: {e}")
Empty file.
@@ -0,0 +1,49 @@
from typing import List, Optional

from hivemind_etl_helpers.src.db.mediawiki.mediawiki_reader import MediaWikiReader
from llama_index.core import Document


class MediaWikiExtractor:
    def __init__(self, api_url: Optional[str] = "https://en.wikipedia.org/w/api.php"):
        """
        Initialize the MediaWikiExtractor with a MediaWiki API URL.
        If no URL is provided, the English Wikipedia API is used by default.

        Args:
            api_url (Optional[str]): The MediaWiki API URL.
                Defaults to 'https://en.wikipedia.org/w/api.php'.
        """
        self.api_url = api_url
        self.wikimedia_reader = MediaWikiReader(api_url=self.api_url)

    def extract(self, page_ids: Optional[List[str]] = None) -> List[Document]:
        """
        Extract documents from MediaWiki page_ids (their titles).

        Args:
            page_ids (Optional[List[str]]): List of page titles to extract documents from.

        Returns:
            List[Document]: A list of Document objects extracted from the specified pages.
        """
        if page_ids:
            return self.extract_from_pages(page_ids)
        return []

    def extract_from_pages(self, pages: List[str]) -> List[Document]:
        """
        Extract documents from specific MediaWiki pages by their titles.

        Args:
            pages (List[str]): The list of page titles to extract documents from.

        Returns:
            List[Document]: A list of Document objects extracted from the specified pages.
        """
        try:
            response = self.wikimedia_reader.load_data(pages=pages)
            return response
        except Exception as e:
            print(f"Failed to extract from pages {pages}: {str(e)}")
            return []
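
A short usage sketch for the extractor on its own, assuming network access and that the page title exists on the target wiki:

```python
from hivemind_etl_helpers.src.db.mediawiki.extractor import MediaWikiExtractor

extractor = MediaWikiExtractor(api_url="https://en.wikipedia.org/w/api.php")
documents = extractor.extract(page_ids=["Python_(programming_language)"])
for doc in documents:
    # each Document carries the page text plus its source URL in metadata
    print(doc.id_, doc.metadata["url"])
```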
52 changes: 52 additions & 0 deletions in dags/hivemind_etl_helpers/src/db/mediawiki/mediawiki_reader.py
@@ -0,0 +1,52 @@
"""Simple reader that reads mediawiki.""" | ||
|
||
from typing import Any, List, Optional | ||
|
||
import wikipedia | ||
from llama_index.legacy.readers.base import BasePydanticReader | ||
from llama_index.legacy.schema import Document | ||
|
||
|
||
class MediaWikiReader(BasePydanticReader): | ||
"""WikiMedia reader. | ||
Reads a page. | ||
""" | ||
|
||
is_remote: bool = True | ||
|
||
def __init__(self, api_url: Optional[str] = None) -> None: | ||
"""Initialize with parameters.""" | ||
if api_url: | ||
wikipedia.set_api_url(api_url) | ||
|
||
@classmethod | ||
def class_name(cls) -> str: | ||
return "WikipediaReader" | ||
|
||
def load_data(self, pages: List[str], **load_kwargs: Any) -> List[Document]: | ||
"""Load data from the input directory. | ||
Args: | ||
pages (List[str]): List of pages to read. | ||
""" | ||
import wikipedia | ||
|
||
results = [] | ||
for page in pages: | ||
wiki_page = wikipedia.page(page, **load_kwargs) | ||
page_content = wiki_page.content | ||
page_id = wiki_page.pageid | ||
doc = Document( | ||
id_=page_id, | ||
text=page_content, | ||
metadata={ | ||
"url": wiki_page.url, | ||
}, | ||
excluded_embed_metadata_keys=["url"], | ||
excluded_llm_metadata_keys=["url"], | ||
) | ||
results.append(doc) | ||
return results |
@@ -0,0 +1,53 @@
from .modules_base import ModulesBase


class ModulesMediaWiki(ModulesBase):
    def __init__(self) -> None:
        self.platform_name = "mediawiki"
        super().__init__()

    def get_learning_platforms(
        self,
    ) -> list[dict[str, str | list[str]]]:
        """
        Get all the MediaWiki communities with their page titles.

        Returns
        ---------
        communities_data : list[dict[str, str | list[str]]]
            a list of MediaWiki community information
            example data output:
            ```
            [{
                "community_id": "6579c364f1120850414e0dc5",
                "page_titles": ["Main_Page", "Default_Page"],
                "base_url": "some_api_url",
            }]
            ```
        """
        modules = self.query(platform=self.platform_name, projection={"name": 0})
        communities_data: list[dict[str, str | list[str]]] = []

        for module in modules:
            community = module["community"]

            # each platform of the community
            for platform in module["options"]["platforms"]:
                if platform["name"] != self.platform_name:
                    continue

                page_ids = self.get_platform_metadata(
                    platform_id=platform["platform"],
                    metadata_name="pageIds",
                )
                modules_options = platform["metadata"]
                communities_data.append(
                    {
                        "community_id": str(community),
                        "page_titles": page_ids,
                        "base_url": modules_options.get("api_url"),
                    }
                )

        return communities_data
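
For context, a sketch of the module document shape that `get_learning_platforms` appears to traverse. The field names are taken from the lookups above; the exact MongoDB schema is an assumption:

```python
# hypothetical module document, shaped to match the traversal above
module = {
    "community": "6579c364f1120850414e0dc5",
    "options": {
        "platforms": [
            {
                "name": "mediawiki",
                "platform": "65f0...abc",  # id passed to get_platform_metadata
                "metadata": {"api_url": "https://wiki.example.org/w/api.php"},
            }
        ]
    },
}
```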