Commit (#35)

kggold4 committed Jul 7, 2023
1 parent 5c65d3b commit dd45e68
Showing 4 changed files with 52 additions and 3 deletions.
8 changes: 5 additions & 3 deletions .gitignore
@@ -1,9 +1,11 @@
.idea/*
db_driver/cred
server_utils/db_driver/cred
venv/
*_pids.txt
__pycache__
*/integration/api/
*/integration/server_api_build/
*/integration/scapres/
*/integration/scheduler/
*/integration/nlp_models/
*/integration/nlp_models/
*.tar
*/integration/image_id.txt
41 changes: 41 additions & 0 deletions scheduler/logic_scheduler.py
@@ -0,0 +1,41 @@
from time import sleep

from db_driver import get_current_db_driver
from logger import get_current_logger, log_function
from scrapers.websites_scrapers.utils.consts import ScraperConsts, MainConsts
from server_utils.db_utils.task_utils import TaskUtils


class LogicScheduler:
    SECONDS = 60
    MINUTES = 60
    SEC_TO_SLEEP = SECONDS * MINUTES * 12  # 12 hours

    def __init__(self):
        self.logger = get_current_logger()
        self._db = get_current_db_driver()
        self.task_utils = TaskUtils()

    @log_function
    def _create_collect_urls_task(self, url: str, domain: str):
        try:
            self.logger.debug(f"Creating collect urls task for `{domain}`, url: `{url}`")
            self.task_utils.create_new_task(url=url, domain=domain, task_type=MainConsts.COLLECT_URLS)
            self.logger.info("Created collect urls task")
        except Exception as e:
            self.logger.error(f"Error creating collect urls task for `{domain}`, url: `{url}`, exception: {str(e)}")

    @log_function
    def run(self):
        """
        Create collect urls tasks for every configured domain, then sleep for 12 hours and repeat.
        """
        while True:
            self.logger.debug("Start creating tasks")
            for domain, url in ScraperConsts.DOMAINS_HOME_PAGE_URLS.items():
                self._create_collect_urls_task(url=url, domain=domain)
            domains = ScraperConsts.DOMAINS_HOME_PAGE_URLS.keys()
            self.logger.info(f"Done creating collect urls tasks for domains: `{domains}`")
            self.logger.warning(f"Start sleeping for {self.SEC_TO_SLEEP / (self.SECONDS * self.MINUTES)} hours")
            sleep(self.SEC_TO_SLEEP)
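
For context, the loop above reduces to a simple pattern: iterate the domain-to-homepage mapping, queue one collect-urls task per domain, then sleep. Below is a minimal, self-contained sketch of that cycle; the constants and the create_new_task stub are stand-ins for ScraperConsts, MainConsts, and TaskUtils, which live elsewhere in this repository, and the assumed values are not part of this commit.

# Stand-ins for ScraperConsts.DOMAINS_HOME_PAGE_URLS and MainConsts.COLLECT_URLS.
DOMAINS_HOME_PAGE_URLS = {"bbc": "https://www.bbc.com/news/", "time": "https://time.com/"}
COLLECT_URLS = "collect_urls"  # assumed value; the real constant is MainConsts.COLLECT_URLS


def create_new_task(url: str, domain: str, task_type: str) -> None:
    # Stub: the real TaskUtils.create_new_task persists the task via the db driver.
    print(f"queued task: type={task_type}, domain={domain}, url={url}")


def run_once() -> None:
    # One scheduler pass: queue a collect-urls task per configured domain.
    for domain, url in DOMAINS_HOME_PAGE_URLS.items():
        create_new_task(url=url, domain=domain, task_type=COLLECT_URLS)


if __name__ == "__main__":
    run_once()
    # LogicScheduler.run() then sleeps SEC_TO_SLEEP (12 hours) and repeats forever.

Keeping the scheduler as a single long-lived while True/sleep process is the simplest option; a cron entry or a scheduling library could produce the same 12-hour cadence without holding a process open.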
5 changes: 5 additions & 0 deletions scheduler/run.py
@@ -0,0 +1,5 @@
from scheduler.logic_scheduler import LogicScheduler

if __name__ == '__main__':
    logic_scheduler = LogicScheduler()
    logic_scheduler.run()
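
This entry point is presumably launched from the repository root so that the scheduler package is importable, e.g. python -m scheduler.run; that invocation is an assumption about the project layout, since the commit itself does not show a launch command.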
1 change: 1 addition & 0 deletions scrapers/websites_scrapers/utils/consts.py
@@ -4,6 +4,7 @@
class ScraperConsts:
    BBC_HOME_PAGE = "https://www.bbc.com/news/"
    TIME_HOME_PAGE = "https://time.com/"
    DOMAINS_HOME_PAGE_URLS = {"bbc": BBC_HOME_PAGE, "time": TIME_HOME_PAGE}


class BBCConsts:
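
Since run() iterates DOMAINS_HOME_PAGE_URLS, registering another news site should only require extending this mapping. A hypothetical sketch, where the CNN entry is illustrative and not part of this commit:

class ScraperConsts:
    BBC_HOME_PAGE = "https://www.bbc.com/news/"
    TIME_HOME_PAGE = "https://time.com/"
    CNN_HOME_PAGE = "https://edition.cnn.com/"  # hypothetical addition for illustration
    DOMAINS_HOME_PAGE_URLS = {"bbc": BBC_HOME_PAGE, "time": TIME_HOME_PAGE,
                              "cnn": CNN_HOME_PAGE}

The scheduler would then queue collect-urls tasks for the new domain on its next 12-hour cycle with no changes to LogicScheduler itself, though a matching scraper would still be needed downstream.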
