diff --git a/src/Scraper/application/ScraperOrchestration/Wrapper.py b/src/Scraper/application/ScraperOrchestration/Wrapper.py
new file mode 100644
index 0000000..e620aa8
--- /dev/null
+++ b/src/Scraper/application/ScraperOrchestration/Wrapper.py
@@ -0,0 +1,73 @@
+from sqlalchemy.orm.session import Session
+from typing import List
+from random import uniform
+import asyncio
+
+from SearchEngine.infrastructure.ComponentManagment.SQL_alchemy_repository import (
+    SQLAlchemyRepository,
+)
+from Scraper.domain.value_object import AbstractScraper, URL
+from Scraper.domain.entity import CategoryURL
+from Scraper.domain.aggragate import VolatileData
+from Scraper.domain.service import FactoryScraper
+from framework.domain.value_object import UUIDv5
+from entrypoints.api.endpoints.connection_util import engine
+from framework.infrastructure.db_management.db_connection import create_session
+from Scraper.infrastructure.VolatileDataManagment.SQL_alchemy_volatile_data import (
+    SQLAlchemyVolatileData,
+)
+from Scraper.application.ScraperOrchestration.category_URL_manager import (
+    CategoryURLManager,
+)
+
+
+class Wrapper:
+    _volatile_data_manager: SQLAlchemyVolatileData
+    domain: str
+    scraper: AbstractScraper
+    domain_urls: List[CategoryURL]
+    session: Session
+
+    max_sleep_seconds = 3
+
+    def __init__(self, scheme: str, domain: str):
+        self.domain = domain
+        self.session = create_session(engine)
+        self._volatile_data_manager = SQLAlchemyVolatileData(self.session)
+
+        factory_scraper = FactoryScraper()
+        url_manager = CategoryURLManager(self.session)
+        self.scraper = factory_scraper.build_scraper(f"{scheme}://{domain}")
+        self.domain_urls = url_manager.get(filters_eq={"domain": domain})
+
+    async def run_scraping(self):
+        for domain_url in self.domain_urls:
+            next_url: URL | None = domain_url.url
+
+            while next_url is not None:
+                next_url, volatile_datas = self.scraper.get_volatile_data(
+                    url=next_url.url
+                )
+
+                for url, name, cost, availability in volatile_datas:
+                    # TODO: call the search engine to classify the component
+                    # component = SearchEngine.classifie(name)
+                    component_manager = SQLAlchemyRepository(
+                        self.session
+                    )  # placeholder
+                    component = component_manager.get(filters_gt={"consumption": -1})[
+                        0
+                    ]  # placeholder
+
+                    volatile_data = VolatileData(
+                        _id=UUIDv5(url.url),
+                        component_id=component.uid,
+                        url=url,
+                        cost=cost,
+                        availability=availability,
+                    )
+
+                    self._volatile_data_manager.add(volatile_data)
+
+                sleep_seconds = uniform(1.0, self.max_sleep_seconds)
+                await asyncio.sleep(sleep_seconds)
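A `Wrapper` is scoped to one scheme/domain pair and drives the paginated scraping loop above until `get_volatile_data` returns no next page. A minimal driver sketch, assuming the DB engine/session plumbing imported above is configured; the domain string is a hypothetical example:

```python
# Sketch: driving a single Wrapper by hand (e.g. from a REPL or test).
import asyncio

from Scraper.application.ScraperOrchestration.Wrapper import Wrapper

wrapper = Wrapper("https", "www.kabum.com.br")  # hypothetical registered domain

# run_scraping is a coroutine: it walks every CategoryURL stored for the
# domain, follows pagination until next_url is None, persists each
# VolatileData row, and sleeps 1..max_sleep_seconds between pages.
asyncio.run(wrapper.run_scraping())
```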
diff --git a/src/Scraper/application/ScraperOrchestration/category_URL_manager.py b/src/Scraper/application/ScraperOrchestration/category_URL_manager.py
new file mode 100644
index 0000000..25797ce
--- /dev/null
+++ b/src/Scraper/application/ScraperOrchestration/category_URL_manager.py
@@ -0,0 +1,100 @@
+from sqlalchemy.orm.session import Session
+from sqlalchemy.exc import NoResultFound
+from typing import List
+from operator import lt, gt, eq
+
+from framework.domain.value_object import UUID
+from framework.infrastructure.db_management.db_structure import CategoryUrlInstance
+from Scraper.domain.entity import CategoryURL
+from framework.domain.value_object import URL
+from framework.domain.components import EComponentType
+from Scraper.domain.repositories import (
+    ISQLAlchemyRepository,
+    EntityUIDNotFoundException,
+)
+
+
+class CategoryURLManager(ISQLAlchemyRepository):
+    _filters_ops: dict = {"filters_eq": eq, "filters_lt": lt, "filters_gt": gt}
+
+    def __init__(self, session):
+        self._session: Session = session
+
+    def _url_to_db(self, category_url: CategoryURL):
+        url_instance = CategoryUrlInstance()
+        url = category_url.url
+
+        url_instance.uid = category_url.uid
+        url_instance.domain = url.domain
+        url_instance.path = url.path
+        url_instance.scheme = url.scheme
+        url_instance.type = category_url.category
+
+        return url_instance
+
+    def _db_to_category_url(self, url_instance: CategoryUrlInstance):
+        url_str = f"{url_instance.scheme}://{url_instance.domain}/{url_instance.path}"
+        url = URL(url_str, url_instance.scheme, url_instance.domain, url_instance.path)
+        category_url = CategoryURL(
+            _id=url_instance.uid, url=url, category=url_instance.type
+        )
+
+        return category_url
+
+    def _parse_filters(self, **kwargs) -> List:
+        ret = []
+
+        for filter_type, filters in kwargs.items():
+            if filter_type in self._filters_ops:
+                op = self._filters_ops[filter_type]
+
+                for prop, value in filters.items():
+                    ret.append(op(getattr(CategoryUrlInstance, prop), value))
+
+        return ret
+
+    def _filter_components_from_db(self, filters: List) -> List[CategoryURL]:
+        url_instances: List[CategoryUrlInstance] = (
+            self._session.query(CategoryUrlInstance).filter(*filters).all()
+        )
+
+        urls = [self._db_to_category_url(instance) for instance in url_instances]
+
+        return urls
+
+    def _add(self, category_url: CategoryURL):
+        url_instance = self._url_to_db(category_url)
+        self._session.add(url_instance)
+        self._session.commit()
+
+    def _get(self, **kwargs) -> List[CategoryURL]:
+        ret = []
+
+        filters = self._parse_filters(**kwargs)
+
+        urls = self._filter_components_from_db(filters)
+        ret.extend(urls)
+
+        return ret
+
+    def _get_by_uid(self, ref: UUID):
+        query_filter = [CategoryUrlInstance.uid == ref]
+
+        try:
+            category_url: CategoryUrlInstance = (
+                self._session.query(CategoryUrlInstance).filter(*query_filter).one()
+            )
+
+            url = self._db_to_category_url(category_url)
+
+        except NoResultFound:
+            raise EntityUIDNotFoundException(ref)
+
+        return url
+
+    def get_urls(self) -> List[tuple[str, str]]:
+        params = [CategoryUrlInstance.scheme, CategoryUrlInstance.domain]
+
+        query = self._session.query(*params).distinct(CategoryUrlInstance.domain)
+        urls = [(scheme, domain) for scheme, domain in query]
+        return urls
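`_parse_filters` implements a small filter DSL: each `filters_*` kwarg maps `CategoryUrlInstance` attribute names to values and is combined with the matching operator (`filters_eq` → `==`, `filters_lt` → `<`, `filters_gt` → `>`). A sketch of the public `get` call, with a hypothetical domain value:

```python
# Sketch of the filters mini-DSL accepted by CategoryURLManager.get().
from entrypoints.api.endpoints.connection_util import engine
from framework.infrastructure.db_management.db_connection import create_session
from Scraper.application.ScraperOrchestration.category_URL_manager import (
    CategoryURLManager,
)

manager = CategoryURLManager(create_session(engine))

# Compiles to the SQLAlchemy criterion:
#   CategoryUrlInstance.domain == "www.kabum.com.br"
category_urls = manager.get(filters_eq={"domain": "www.kabum.com.br"})
```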
diff --git a/src/Scraper/application/ScraperOrchestration/orchestrator.py b/src/Scraper/application/ScraperOrchestration/orchestrator.py
new file mode 100644
index 0000000..a305dd1
--- /dev/null
+++ b/src/Scraper/application/ScraperOrchestration/orchestrator.py
@@ -0,0 +1,39 @@
+import sys
+
+sys.path.insert(0, r"C:\Users\wesle\OneDrive\Documentos\UFPI\ESII\WiseBuilder\src")
+
+import asyncio
+from Scraper.application.ScraperOrchestration.category_URL_manager import (
+    CategoryURLManager,
+)
+from framework.infrastructure.db_management.db_connection import create_session
+from entrypoints.api.endpoints.connection_util import engine
+from Scraper.application.ScraperOrchestration.Wrapper import Wrapper
+
+_category_url_manager = CategoryURLManager(create_session(engine))
+_sleep_minutes = 60
+
+
+async def run_scrapers():
+    while True:
+        urls = _category_url_manager.get_urls()
+
+        tasks = []
+
+        for scheme, domain in urls:
+            wrapper = Wrapper(scheme, domain)
+            tasks.append(wrapper.run_scraping())
+
+        await asyncio.gather(*tasks)
+        await asyncio.sleep(_sleep_minutes * 60)
+
+
+def main():
+    orchestrator_loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(orchestrator_loop)
+    orchestrator_loop.run_until_complete(run_scrapers())
+    orchestrator_loop.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/Scraper/domain/aggragate.py b/src/Scraper/domain/aggragate.py
index 512b649..9571c2b 100644
--- a/src/Scraper/domain/aggragate.py
+++ b/src/Scraper/domain/aggragate.py
@@ -1,25 +1,34 @@
 from datetime import datetime
 from dataclasses import dataclass, field
+from typing import List
+
 from framework.domain.entity import AggregateRoot
 from framework.domain.value_object import UUID, Money, URL
-from .entity import MatchesTrackedComponent
+
+_AttrsVolatileData = ["_id", "url", "component_id", "cost", "availability", "timestamp"]
 
 
 @dataclass(kw_only=True)
 class VolatileData(AggregateRoot):
-    component_id: UUID
-    url_id: UUID
+    # url_id: UUID
     url: URL
+    component_id: UUID
     cost: Money
     availability: bool
    timestamp: datetime = field(default=datetime.utcnow())
 
+    def __hash__(self):
+        return hash(self.uid)
+
+    @classmethod
+    def get_attrs(cls) -> List[str]:
+        return _AttrsVolatileData.copy()
+
 
 def generateVolatileDataPoint(
     self,
     _id: UUID,
     component_id: UUID,
-    url_id: UUID,
     url: URL,
     cost: Money,
     availability: bool,
@@ -27,7 +36,6 @@ def generateVolatileDataPoint(
     return VolatileData(
         _id=_id,
         component_id=component_id,
-        url_id=url_id,
         url=url,
         cost=cost,
         availability=availability,
diff --git a/src/Scraper/domain/entity.py b/src/Scraper/domain/entity.py
index 0433dc3..a1cc6ac 100644
--- a/src/Scraper/domain/entity.py
+++ b/src/Scraper/domain/entity.py
@@ -1,9 +1,14 @@
+from dataclasses import dataclass
+
+from framework.domain.entity import Entity
+from framework.domain.value_object import URL
+from framework.domain.components import EComponentType
+
+
 @dataclass
-class MatchesTrackedComponent(Rule):
-    # Verifica se o componente é um dos componentes observados
-    component_name: str
-
-    def is_broken(self) -> bool:
-        # verificar se componente existe
-        # return not SearchEngine.get_id_by_name(component_name)
-        return False
+class CategoryURL(Entity):
+    url: URL
+    category: EComponentType
+
+    def __hash__(self):
+        return hash(self.uid)
diff --git a/src/Scraper/domain/repositories.py b/src/Scraper/domain/repositories.py
new file mode 100644
index 0000000..e095d4d
--- /dev/null
+++ b/src/Scraper/domain/repositories.py
@@ -0,0 +1,72 @@
+from dataclasses import dataclass, field
+from typing import Dict
+from abc import ABCMeta, abstractmethod
+
+from framework.domain.value_object import UUID
+from framework.domain.repository import AbstractRepository
+from framework.domain.exception import DomainException
+from Scraper.domain.aggragate import VolatileData
+
+
+@dataclass
+class EntityUIDNotFoundException(DomainException):
+    entity_id: UUID
+    _message: str = field(init=False)
+
+    def __post_init__(self):
+        self._message = (
+            f"{self.__class__.__name__}: "
+            f"Component with UID {self.entity_id} does not exist."
+        )
+
+
+class MockRepository(AbstractRepository):
+    def __init__(self, volatile_datas: Dict[UUID, VolatileData]):
+        self._volatile_data = volatile_datas
+
+    def _add(self, volatile_data: VolatileData):
+        self._volatile_data[volatile_data.uid] = volatile_data
+
+    def _get_by_uid(self, ref: UUID):
+        ret = self._volatile_data.get(ref, None)
+        if ret:
+            return self._volatile_data[ref]
+        raise EntityUIDNotFoundException(ref)
+
+    def _get(self, **kwargs):
+        qsize = kwargs.get("qsize", 10)
+        availability = kwargs.get("availability", None)
+
+        ret = list()
+        if availability:
+            for v in self._volatile_data.values():
+                if v.availability:
+                    ret.append(v)
+                    if len(ret) == qsize:
+                        break
+
+        return ret
+
+    def __repr__(self):
+        return str(self._volatile_data)
+
+
+class ISQLAlchemyRepository(AbstractRepository, metaclass=ABCMeta):
+    @abstractmethod
+    def __init__(self, session):
+        raise NotImplementedError
+
+    @abstractmethod
+    def _add(self, volatile_data: VolatileData):
+        raise NotImplementedError
+
+    @abstractmethod
+    def _get_by_uid(self, ref: UUID):
+        raise NotImplementedError
+
+    @abstractmethod
+    def _get(self, **kwargs):
+        raise NotImplementedError
+
+    def __repr__(self):
+        raise NotImplementedError
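`MockRepository` is the in-memory double for tests. A quick sketch of its contract, assuming `AbstractRepository` exposes public `add`/`get`/`get_by_uid` wrappers (as the call sites in `Wrapper` suggest) and that `Money` accepts a bare amount — both unverified here, and the page URL is hypothetical:

```python
from framework.domain.value_object import UUIDv4, UUIDv5, Money, URL
from Scraper.domain.aggragate import VolatileData
from Scraper.domain.repositories import MockRepository

page = URL.get_URL("https://shop.example/gpu-x")  # hypothetical listing page
volatile_data = VolatileData(
    _id=UUIDv5(page.url),        # stable uid derived from the page URL
    component_id=UUIDv4(),
    url=page,
    cost=Money(1999.90),         # assumed Money(amount) signature
    availability=True,
)

repo = MockRepository({})
repo.add(volatile_data)          # stored under volatile_data.uid
assert repo.get_by_uid(volatile_data.uid) is volatile_data
assert repo.get(availability=True, qsize=5) == [volatile_data]
```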
diff --git a/src/Scraper/domain/scrapers.py b/src/Scraper/domain/scrapers.py
index 97dffec..f5c4b5c 100644
--- a/src/Scraper/domain/scrapers.py
+++ b/src/Scraper/domain/scrapers.py
@@ -62,11 +62,14 @@ def get_volatile_data(
 
         n_next_page = n_actual_page + 1
 
+        next_url: URL | None = None
+
         if n_next_page in number_of_pages:
             next_page = url.split("?")[0] + self.query_string.format(
                 page_number=n_next_page
             )
+            next_url = URL.get_URL(next_page)
         else:
             next_page = None
 
-        return next_page, volatile_data
+        return next_url, volatile_data
diff --git a/src/Scraper/domain/service.py b/src/Scraper/domain/service.py
index 135a3fe..8cd2526 100644
--- a/src/Scraper/domain/service.py
+++ b/src/Scraper/domain/service.py
@@ -1,11 +1,14 @@
 from dataclasses import dataclass, field
 from .value_object import AbstractScraper
 from typing import Dict, Union, NoReturn
+from Scraper.domain.scrapers import KabumScraper
 
 
 @dataclass
 class FactoryScraper:
-    _scrapers: Dict[str, AbstractScraper] = field(init=False, default_factory=dict)
+    _scrapers: Dict[str, AbstractScraper] = field(
+        init=False, default_factory=lambda: {KabumScraper.raw_url: KabumScraper()}
+    )
 
     def build_scraper(self, domain: str) -> Union[AbstractScraper, NoReturn]:
         _scraper: AbstractScraper | None = self._scrapers.get(domain)
diff --git a/src/Scraper/domain/value_object.py b/src/Scraper/domain/value_object.py
index fc6bd24..9c0807d 100644
--- a/src/Scraper/domain/value_object.py
+++ b/src/Scraper/domain/value_object.py
@@ -1,5 +1,7 @@
-from typing import Tuple
+from typing import Tuple, List
 from dataclasses import dataclass
+
+from framework.domain.components import EComponentType
 from framework.domain.value_object import URL, ValueObject, Money
 from abc import ABC, abstractmethod
 
@@ -7,5 +9,7 @@
 @dataclass
 class AbstractScraper(ABC, ValueObject):
     @abstractmethod
-    def get_volatile_data(self) -> Tuple[URL, str, Money, int]:
+    def get_volatile_data(
+        self, url: str
+    ) -> Tuple[URL | None, List[Tuple[URL, str, Money, bool]]]:
         pass
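With the factory pre-registering `KabumScraper`, `build_scraper` becomes a plain registry lookup keyed on the `scheme://domain` string. A sketch; `raw_url` is assumed to match the key `Wrapper` builds with `f"{scheme}://{domain}"`, and the page URL is hypothetical:

```python
from Scraper.domain.service import FactoryScraper
from Scraper.domain.scrapers import KabumScraper

factory = FactoryScraper()
scraper = factory.build_scraper(KabumScraper.raw_url)  # registry hit

# New AbstractScraper contract: each call returns the next page's URL
# (None on the last page) plus rows of (URL, name, Money, availability).
next_url, rows = scraper.get_volatile_data(url="https://shop.example/cpus?page=1")
```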
+from sqlalchemy.orm.session import Session
+from sqlalchemy.exc import NoResultFound
+
+from Scraper.domain.aggragate import VolatileData
+from framework.domain.value_object import UUID
+from framework.infrastructure.db_management.db_mapping import map_from_to
+from framework.infrastructure.db_management.db_structure import (
+    VolatileDataInstance,
+    AttrsVolatileData,
+)
+from Scraper.domain.repositories import (
+    ISQLAlchemyRepository,
+    EntityUIDNotFoundException,
+)
+
+
+class SQLAlchemyVolatileData(ISQLAlchemyRepository):
+    def __init__(self, session):
+        self._session: Session = session
+
+    def volatile_data_to_db_object(
+        self, volatile_data: VolatileData
+    ) -> VolatileDataInstance:
+        mapped_vol_data = map_from_to(
+            volatile_data, VolatileData.get_attrs(), AttrsVolatileData
+        )
+
+        return VolatileDataInstance(**mapped_vol_data)
+
+    def db_object_to_volatile_data(
+        self, volatile_data_instance: VolatileDataInstance
+    ) -> VolatileData:
+        mapped_vol_data = map_from_to(
+            volatile_data_instance, AttrsVolatileData, VolatileData.get_attrs()
+        )
+
+        return VolatileData(**mapped_vol_data)
+
+    def _get_instance_by_uid(self, ref: UUID) -> VolatileDataInstance:
+        query_filter = [VolatileDataInstance.url_id == ref]
+
+        try:
+            vol_data_inst: VolatileDataInstance = (
+                self._session.query(VolatileDataInstance).filter(*query_filter).one()
+            )
+
+        except NoResultFound:
+            raise EntityUIDNotFoundException(ref)
+
+        return vol_data_inst
+
+    def _add(self, volatile_data: VolatileData):
+        db_volatile_data: VolatileDataInstance = self.volatile_data_to_db_object(
+            volatile_data
+        )
+
+        db_volatile_data.url = volatile_data.url.url  # TODO: handle via the mapping dictionary
+        db_volatile_data.cost = volatile_data.cost.amount
+
+        try:
+            current_volatile_data = self._get_instance_by_uid(volatile_data.uid)
+
+            if (
+                db_volatile_data.cost + 0.1 < current_volatile_data.cost
+                and db_volatile_data.availability
+            ):
+                # TODO: emit a price-drop event
+                pass
+
+            current_volatile_data.cost = db_volatile_data.cost
+
+        except EntityUIDNotFoundException:
+            self._session.add(db_volatile_data)
+
+        self._session.commit()
+
+    def _get(self, **kwargs):
+        return super()._get(**kwargs)
+
+    def _get_by_uid(self, ref: UUID):
+        volatile_data_instance = self._get_instance_by_uid(ref)
+        volatile_data = self.db_object_to_volatile_data(volatile_data_instance)
+
+        return volatile_data
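`_add` therefore behaves as an upsert keyed on `url_id`: re-scraping the same page updates the stored cost, and the guarded branch is where the planned price-drop event will be raised. A sketch reusing the hypothetical `volatile_data` from the `MockRepository` example above (`session`, `VolatileData`, and `Money` as there; the prices are made up):

```python
from Scraper.infrastructure.VolatileDataManagment.SQL_alchemy_volatile_data import (
    SQLAlchemyVolatileData,
)

repo = SQLAlchemyVolatileData(session)   # an open SQLAlchemy session
repo.add(volatile_data)                  # unseen url_id -> INSERT

cheaper = VolatileData(
    _id=volatile_data.uid,               # same page URL -> same UUIDv5 key
    component_id=volatile_data.component_id,
    url=volatile_data.url,
    cost=Money(1949.90),                 # hypothetical price, > 0.1 below the old one
    availability=True,
)
repo.add(cheaper)                        # existing url_id -> cost updated in place
```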
diff --git a/src/SearchEngine/application/unit_of_work.py b/src/SearchEngine/application/unit_of_work.py
index 175a853..3df9b8c 100644
--- a/src/SearchEngine/application/unit_of_work.py
+++ b/src/SearchEngine/application/unit_of_work.py
@@ -1,7 +1,7 @@
 from framework.application.uow import AbstractUnitOfWork
 from ..domain.repositories import MockRepository
 
-from ..infrastructure.component_managment.SQL_alchemy_repository import (
+from ..infrastructure.ComponentManagment.SQL_alchemy_repository import (
     SQLAlchemyRepository,
 )
diff --git a/src/SearchEngine/infrastructure/component_managment/SQL_alchemy_repository.py b/src/SearchEngine/infrastructure/ComponentManagment/SQL_alchemy_repository.py
similarity index 94%
rename from src/SearchEngine/infrastructure/component_managment/SQL_alchemy_repository.py
rename to src/SearchEngine/infrastructure/ComponentManagment/SQL_alchemy_repository.py
index c276f80..cca7866 100644
--- a/src/SearchEngine/infrastructure/component_managment/SQL_alchemy_repository.py
+++ b/src/SearchEngine/infrastructure/ComponentManagment/SQL_alchemy_repository.py
@@ -15,7 +15,7 @@
     ComponentInstance,
     component_inst_idx,
 )
-from SearchEngine.infrastructure.component_managment.component_mapper import *
+from SearchEngine.infrastructure.ComponentManagment.component_mapper import *
 
 
 class SQLAlchemyRepository(ISQLAlchemyRepository):
@@ -41,11 +41,14 @@ def _get_by_uid(self, ref: UUID) -> Component:
             self._session.query(ComponentInstance.type).filter(*query_filter).one()
         )
 
-        component: Component = (
+        component_inst: ComponentInstance = (
             self._session.query(component_inst_idx[ctype[0]])
             .filter(*query_filter)
             .one()
         )
+
+        component = bd_object_to_component(component_inst)
+
         except NoResultFound:
             raise EntityUIDNotFoundException(ref)
diff --git a/src/SearchEngine/infrastructure/component_managment/__init__.py b/src/SearchEngine/infrastructure/ComponentManagment/__init__.py
similarity index 100%
rename from src/SearchEngine/infrastructure/component_managment/__init__.py
rename to src/SearchEngine/infrastructure/ComponentManagment/__init__.py
diff --git a/src/SearchEngine/infrastructure/component_managment/component_mapper.py b/src/SearchEngine/infrastructure/ComponentManagment/component_mapper.py
similarity index 81%
rename from src/SearchEngine/infrastructure/component_managment/component_mapper.py
rename to src/SearchEngine/infrastructure/ComponentManagment/component_mapper.py
index 08fa114..1207725 100644
--- a/src/SearchEngine/infrastructure/component_managment/component_mapper.py
+++ b/src/SearchEngine/infrastructure/ComponentManagment/component_mapper.py
@@ -1,5 +1,6 @@
 from typing import List
 from framework.domain.components import *
+from framework.infrastructure.db_management.db_mapping import map_from_to
 from framework.infrastructure.db_management.db_structure import (
     ComponentInstance,
     component_inst_idx,
@@ -20,18 +21,10 @@ def _get_attrs_from(c_type: EComponentType):
     return comp_attrs, comp_inst_attrs
 
 
-def _map_from_to(
-    component: Component | ComponentInstance, from_attrs: List, to_attrs: List
-) -> dict:
-    mapped = {t: getattr(component, f) for t, f in zip(to_attrs, from_attrs)}
-
-    return mapped
-
-
 def component_to_bd_object(component: Component) -> ComponentInstance:
     specific_inst_cls = component_inst_idx[component.type.value]
     comp_attrs, comp_inst_attrs = _get_attrs_from(component.type)
-    mapped_comp_dict = _map_from_to(component, comp_attrs, comp_inst_attrs)
+    mapped_comp_dict = map_from_to(component, comp_attrs, comp_inst_attrs)
 
     return specific_inst_cls(**mapped_comp_dict)
 
@@ -41,7 +34,7 @@ def bd_object_to_component(component_instance: ComponentInstance) -> Component:
     comp_attrs, comp_inst_attrs = _get_attrs_from(
         EComponentType(component_instance.type)
     )
-    mapped_comp_dict = _map_from_to(component_instance, comp_inst_attrs, comp_attrs)
+    mapped_comp_dict = map_from_to(component_instance, comp_inst_attrs, comp_attrs)
 
     return specific_comp_cls(**mapped_comp_dict)
diff --git a/src/framework/domain/entity.py b/src/framework/domain/entity.py
index 67a5e29..5e456be 100644
--- a/src/framework/domain/entity.py
+++ b/src/framework/domain/entity.py
@@ -1,7 +1,7 @@
 from dataclasses import dataclass, field
 from typing import List, Union
 
-from .value_object import UUID, UUIDv4
+from .value_object import UUID, UUIDv4, ValueObject
 from .rule import BussinessAssertionExtension
 from .events import DomainEvent
 
@@ -33,3 +33,4 @@ class AggregateRoot(BussinessAssertionExtension, Entity):
 
 
 UniqueObject = Union[Entity, AggregateRoot]
+DomainObject = Union[ValueObject, UniqueObject]
diff --git a/src/framework/domain/value_object.py b/src/framework/domain/value_object.py
index fe78abf..8fd63a7 100644
--- a/src/framework/domain/value_object.py
+++ b/src/framework/domain/value_object.py
@@ -1,4 +1,5 @@
 import uuid
+from functools import partial
 from urllib.parse import urlsplit, SplitResult
 from dataclasses import dataclass
 from functools import total_ordering
@@ -6,10 +7,11 @@
 
 from .rule import Rule, BussinessAssertionExtension
 
-__all__ = ["UUID", "UUIDv4", "ValueObject", "Money", "URL"]
+__all__ = ["UUID", "UUIDv4", "UUIDv5", "ValueObject", "Money", "URL"]
 
 UUID = uuid.UUID
 UUIDv4 = uuid.uuid4
+UUIDv5 = partial(uuid.uuid5, uuid.NAMESPACE_URL)
diff --git a/src/framework/infrastructure/db_management/db_mapping.py b/src/framework/infrastructure/db_management/db_mapping.py
new file mode 100644
index 0000000..6f3e1ba
--- /dev/null
+++ b/src/framework/infrastructure/db_management/db_mapping.py
@@ -0,0 +1,7 @@
+from typing import List
+
+
+def map_from_to(original_object: object, from_attrs: List, to_attrs: List) -> dict:
+    mapped = {t: getattr(original_object, f) for t, f in zip(to_attrs, from_attrs)}
+
+    return mapped
diff --git a/src/framework/infrastructure/db_management/db_structure.py b/src/framework/infrastructure/db_management/db_structure.py
index 52688d2..2d76b93 100644
--- a/src/framework/infrastructure/db_management/db_structure.py
+++ b/src/framework/infrastructure/db_management/db_structure.py
@@ -53,11 +53,21 @@ class VolatileDataInstance(base):
     url_id = Column(BinaryUUID, primary_key=True)
     url = Column(VARCHAR(255))
     component_uid = Column(BinaryUUID, ForeignKey(ComponentInstance.uid))
-    price = Column(FLOAT(7, 2, False))
+    cost = Column(FLOAT(7, 2, False))
     availability = Column(BOOLEAN())
     timestamp = Column(DATETIME(timezone=False, fsp=0))
 
 
+AttrsVolatileData = [
+    "url_id",
+    "url",
+    "component_uid",
+    "cost",
+    "availability",
+    "timestamp",
+]
+
+
 class PriceHistoryInstance(base):
     __tablename__ = "prices_history"
     uid = Column(BinaryUUID, primary_key=True)
@@ -212,7 +222,8 @@ class ComputerInstance(base):
 
 class CategoryUrlInstance(base):
     __tablename__ = "category_url"
-    uid = Column(INTEGER(5), primary_key=True, autoincrement=False)
+    uid = Column(BinaryUUID, primary_key=True, autoincrement=False)
+    scheme = Column(VARCHAR(8))
     domain = Column(VARCHAR(100))
     path = Column(VARCHAR(150))
     type = Column(ENUM(EComponentType))
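Two of the framework additions are easiest to see side by side: `map_from_to` renames attributes positionally between domain objects and DB instances (exactly how `VolatileData.get_attrs()` is zipped onto `AttrsVolatileData`, e.g. `component_id` → `component_uid`), and `UUIDv5` derives a stable key from a URL string. A small self-contained demonstration; the `Source` dataclass and the URL are illustrative only:

```python
from dataclasses import dataclass

from framework.domain.value_object import UUIDv5
from framework.infrastructure.db_management.db_mapping import map_from_to


@dataclass
class Source:
    cost: float
    url: str


src = Source(cost=99.9, url="https://shop.example/ssd-1tb")

# Positional zip of from_attrs onto to_attrs:
# {"price": 99.9, "link": "https://shop.example/ssd-1tb"}
print(map_from_to(src, ["cost", "url"], ["price", "link"]))

# uuid5 is deterministic: the same URL always yields the same url_id,
# which is what makes SQLAlchemyVolatileData._add an upsert.
assert UUIDv5(src.url) == UUIDv5(src.url)
```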