From dd45e6880b1e61c6f00ac67139b85d65c5a4f35d Mon Sep 17 00:00:00 2001
From: kggold4
Date: Fri, 7 Jul 2023 16:01:19 +0300
Subject: [PATCH] (#35)

---
 .gitignore                                 |  8 +++--
 scheduler/logic_scheduler.py               | 41 ++++++++++++++++++++++
 scheduler/run.py                           |  5 +++
 scrapers/websites_scrapers/utils/consts.py |  1 +
 4 files changed, 52 insertions(+), 3 deletions(-)
 create mode 100644 scheduler/logic_scheduler.py
 create mode 100644 scheduler/run.py

diff --git a/.gitignore b/.gitignore
index 1a4e185..0ade9de 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,9 +1,11 @@
 .idea/*
-db_driver/cred
+server_utils/db_driver/cred
 venv/
 *_pids.txt
 __pycache__
-*/integration/api/
+*/integration/server_api_build/
 */integration/scapres/
 */integration/scheduler/
-*/integration/nlp_models/
\ No newline at end of file
+*/integration/nlp_models/
+*.tar
+*/integration/image_id.txt
\ No newline at end of file
diff --git a/scheduler/logic_scheduler.py b/scheduler/logic_scheduler.py
new file mode 100644
index 0000000..a03853d
--- /dev/null
+++ b/scheduler/logic_scheduler.py
@@ -0,0 +1,41 @@
+from time import sleep
+
+from db_driver import get_current_db_driver
+from logger import get_current_logger, log_function
+from scrapers.websites_scrapers.utils.consts import ScraperConsts, MainConsts
+from server_utils.db_utils.task_utils import TaskUtils
+
+
+class LogicScheduler:
+    SECONDS = 60
+    MINUTES = 60
+    SEC_TO_SLEEP = SECONDS * MINUTES * 12  # 12 hours
+
+    def __init__(self):
+        self.logger = get_current_logger()
+        self._db = get_current_db_driver()
+        self.task_utils = TaskUtils()
+
+    @log_function
+    def _create_collect_urls_task(self, url: str, domain: str):
+        try:
+            self.logger.debug(f"Creating collect urls task for `{domain}`, url: `{url}`")
+            self.task_utils.create_new_task(url=url, domain=domain, task_type=MainConsts.COLLECT_URLS)
+            self.logger.info("Created collect urls task")
+        except Exception as e:
+            self.logger.error(f"Error creating collect urls task for `{domain}`, url: `{url}`, exception: {str(e)}")
+
+    @log_function
+    def run(self):
+        """
+        Create collect urls tasks every 12 hours
+        :return:
+        """
+        while True:
+            self.logger.debug("Start creating tasks")
+            for domain, url in ScraperConsts.DOMAINS_HOME_PAGE_URLS.items():
+                self._create_collect_urls_task(url=url, domain=domain)
+            domains = list(ScraperConsts.DOMAINS_HOME_PAGE_URLS.keys())
+            self.logger.info(f"Done creating collect urls tasks for domains: `{domains}`")
+            self.logger.warning(f"Start sleeping for {self.SEC_TO_SLEEP / (self.SECONDS * self.MINUTES)} hours")
+            sleep(self.SEC_TO_SLEEP)
diff --git a/scheduler/run.py b/scheduler/run.py
new file mode 100644
index 0000000..db87424
--- /dev/null
+++ b/scheduler/run.py
@@ -0,0 +1,5 @@
+from scheduler.logic_scheduler import LogicScheduler
+
+if __name__ == '__main__':
+    logic_scheduler = LogicScheduler()
+    logic_scheduler.run()
diff --git a/scrapers/websites_scrapers/utils/consts.py b/scrapers/websites_scrapers/utils/consts.py
index 76a86e8..af81a67 100644
--- a/scrapers/websites_scrapers/utils/consts.py
+++ b/scrapers/websites_scrapers/utils/consts.py
@@ -4,6 +4,7 @@
 class ScraperConsts:
     BBC_HOME_PAGE = "https://www.bbc.com/news/"
     TIME_HOME_PAGE = "https://time.com/"
+    DOMAINS_HOME_PAGE_URLS = {"bbc": BBC_HOME_PAGE, "time": TIME_HOME_PAGE}
 
 
 class BBCConsts:
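
Note (not part of the patch): since run() blocks in a 12-hour sleep loop, the quickest way to sanity-check the new scheduler is a small test of _create_collect_urls_task with its collaborators mocked. The sketch below is hedged: it assumes the module paths introduced by this patch are importable, that log_function does not need a live logging backend at call time, and that TaskUtils.create_new_task accepts exactly the keyword arguments used above.

    # Minimal sketch, assuming this repo's packages are on the path; it verifies
    # that _create_collect_urls_task delegates to TaskUtils.create_new_task once
    # with the url, domain, and COLLECT_URLS task type from the patch.
    from unittest.mock import patch

    from scrapers.websites_scrapers.utils.consts import MainConsts
    from scheduler.logic_scheduler import LogicScheduler

    def test_create_collect_urls_task_delegates_once():
        with patch("scheduler.logic_scheduler.get_current_logger"), \
             patch("scheduler.logic_scheduler.get_current_db_driver"), \
             patch("scheduler.logic_scheduler.TaskUtils") as task_utils_cls:
            scheduler = LogicScheduler()
            scheduler._create_collect_urls_task(url="https://www.bbc.com/news/", domain="bbc")
            task_utils_cls.return_value.create_new_task.assert_called_once_with(
                url="https://www.bbc.com/news/", domain="bbc", task_type=MainConsts.COLLECT_URLS
            )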