Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

1.0.0 09072023 cnn scraper #48

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions scrapers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from logger import get_current_logger
from scrapers.websites_scrapers.bbc_scraper import BBCScraper
from scrapers.websites_scrapers.cnn_scraper import CNNScraper
from scrapers.websites_scrapers.time_scraper import TIMEScraper
from scrapers.websites_scrapers.utils.exceptions import UnknownWebsiteScraperException
from scrapers.websites_scrapers.website_scraper_base import WebsiteScraperBase

SCRAPERS = {"bbc": BBCScraper, "time": TIMEScraper,"cnn": CNNScraper}


def websites_scrapers_factory(scraper_name: str, *args, **kwargs) -> WebsiteScraperBase:
    """
    Instantiate the website scraper registered under `scraper_name`.

    :param scraper_name: registry key in `SCRAPERS`, e.g. "bbc", "time" or "cnn"
    :param args: positional arguments forwarded to the scraper constructor
    :param kwargs: keyword arguments forwarded to the scraper constructor
    :return: a `WebsiteScraperBase` subclass instance
    :raises UnknownWebsiteScraperException: if `scraper_name` is not registered
    """
    logger = get_current_logger()
    try:
        return SCRAPERS[scraper_name](*args, **kwargs)
    except KeyError:
        desc = f"Cannot find scraper name: `{scraper_name}` in {SCRAPERS.keys()}"
        logger.error(desc)
        raise UnknownWebsiteScraperException(desc)
    except Exception as e:
        desc = f"Error getting website scraper instance of name: `{scraper_name}` - {str(e)}"
        logger.error(desc)
        # Bare `raise` re-raises the active exception with its original traceback,
        # rather than restarting the traceback at this line (`raise e`).
        raise
85 changes: 85 additions & 0 deletions scrapers/web_scraper/websites_scrapers/cnn_scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from datetime import datetime
from typing import List, Union
from selenium.common import NoSuchElementException
from selenium.webdriver.common.by import By

from logger import log_function
from scrapers.web_scraper import WebsiteScraperBase
from scrapers.web_scraper.websites_scrapers.utils.consts import ScraperConsts, CNNConsts
from scrapers.web_scraper.websites_scrapers.utils.xpaths import CNNXPaths


class CNNScraper(WebsiteScraperBase):
    """Scraper for cnn.com: collects new article URLs from CNN section pages and extracts article data."""

    def __init__(self):
        super().__init__()
        self._homepage_url = ScraperConsts.CNN_HOME_PAGE

    @log_function
    def _get_article_title(self) -> str:
        """Return the current page's title as the article title."""
        title = self._driver.get_title()
        self.logger.info(f"Got title: `{title}`")
        return title

    @log_function
    def _get_article_content_text(self) -> str:
        """
        Return the article body text, paragraphs joined by single spaces.

        :raises NoSuchElementException: if no paragraph element matches the text-block XPath
        """
        paragraphs = self._driver.find_elements(by=By.XPATH, value=CNNXPaths.text_block)
        if not paragraphs:
            desc = f"Error find content text of article, element value: `{CNNXPaths.text_block}`"
            self.logger.error(desc)
            raise NoSuchElementException(desc)
        # NOTE(review): `get_text()` assumes the project driver wraps elements in a
        # BeautifulSoup-like API; plain Selenium WebElements expose `.text` — confirm.
        return " ".join(paragraph.get_text() for paragraph in paragraphs)

    @log_function
    def _get_article_publishing_time(self) -> Union[datetime, None]:
        """
        Return the article's publishing time parsed from the timestamp element's
        `datetime` attribute, or None if it cannot be found or parsed (best-effort).
        """
        try:
            time_element = self._driver.find_element(by=By.XPATH, value=CNNXPaths.publishing_time_element)
            publishing_timestamp = time_element.get_attribute("datetime")
            return datetime.strptime(publishing_timestamp, CNNConsts.PUBLISHING_FORMAT)
        except Exception as e:
            # Publishing time is optional metadata — log and fall back to None.
            self.logger.warning(f"Error collecting publishing time - {e}")
            return None

    @log_function
    def _get_article_image_urls(self) -> List[str]:
        """Return the `src` URLs of all images matching the article-image XPath."""
        images = self._driver.find_elements(by=By.XPATH, value=CNNXPaths.article_image)
        return [image.get_attribute("src") for image in images]

    @log_function
    def _check_unwanted_article(self):
        """CNN has no per-article unwanted check; filtering happens on the URL level."""
        pass

    @log_function
    def _close_popups_if_needed(self):
        """Close blocking popups when using a real browser driver (no-op for request driver)."""
        if self.USE_REQUEST_DRIVER:
            return

        self.logger.debug("Trying to click close popups if needed")
        self._try_click_element(by=By.XPATH, value=CNNXPaths.popup_close_button, raise_on_fail=False)

    @log_function
    def _extract_article_urls_from_home_page(self) -> List[str]:
        """
        Return deduplicated article URLs found on the currently loaded page,
        keeping only URLs that match a wanted section and no unwanted section.
        """
        articles_urls = set()
        articles_elements = self._driver.find_elements(by=By.XPATH, value=CNNXPaths.articles_elements)
        for element in articles_elements:
            href = element.get_attribute("href")
            is_url_filter_bad = any(url_filter in href for url_filter in CNNConsts.NEW_ARTICLE_URL_FILTER_UNWANTED)
            is_url_filter_good = any(url_filter in href for url_filter in CNNConsts.NEW_ARTICLE_URL_FILTER_WANTED)
            if is_url_filter_bad or not is_url_filter_good:
                continue
            articles_urls.add(href)
        return list(articles_urls)

    def get_new_article_urls_from_home_page(self) -> List[str]:
        """
        Return article URLs collected from all configured CNN scrape pages,
        deduplicated across pages (the same article can appear on several sections).
        """
        article_urls = set()
        for home_page in CNNConsts.CNN_SCRAPE_PAGES:
            self._get_page(home_page)
            self._close_popups_if_needed()
            article_urls.update(self._extract_article_urls_from_home_page())
        return list(article_urls)
11 changes: 10 additions & 1 deletion scrapers/web_scraper/websites_scrapers/utils/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
class ScraperConsts:
    """Home-page URLs for every supported news website, keyed by scraper name."""
    BBC_HOME_PAGE = "https://www.bbc.com/news/"
    TIME_HOME_PAGE = "https://time.com/"
    CNN_HOME_PAGE = "https://cnn.com/"
    NBC_HOME_PAGE = "https://www.nbcnews.com/"
    # Registry mapping scraper name -> home page URL (consumed by the scraper factory).
    DOMAINS_HOME_PAGE_URLS = {"bbc": BBC_HOME_PAGE, "time": TIME_HOME_PAGE, "cnn": CNN_HOME_PAGE, "nbc": NBC_HOME_PAGE}


class BBCConsts:
Expand All @@ -25,6 +26,14 @@ class NBCConsts:
NEW_ARTICLE_URL_FILTER = []


class CNNConsts:
    """CNN-specific scraping constants: URL filters, timestamp format, pages to crawl."""
    # URL path fragments that mark an article as relevant (politics / world regions).
    NEW_ARTICLE_URL_FILTER_WANTED = [
        "/politics/", "/africa/", "/americas/", "/asia/", "/australia/",
        "/china/", "/europe/", "/india/", "/middleeast/", "/uk/",
    ]
    # URL path fragments that mark a page as irrelevant (non-text media pages).
    NEW_ARTICLE_URL_FILTER_UNWANTED = ["/gallery/", "/videos/"]
    # strptime format of the `datetime` attribute on CNN's timestamp element.
    PUBLISHING_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
    # Section pages crawled when collecting new article URLs.
    CNN_SCRAPE_PAGES = ["https://cnn.com/politics", "https://cnn.com/world"]


class MainConsts:
COLLECT_URLS = "collect_urls"
COLLECT_ARTICLE = "collect_article"
Expand Down
10 changes: 10 additions & 0 deletions scrapers/web_scraper/websites_scrapers/utils/xpaths.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,13 @@ class NBCXPaths:
text_block = f"{text_block_1} | {text_block_2} | {text_block_3}"
popup_close_button = "//*[@id='close_icon']"
article_image = "//article//figure//img"



class CNNXPaths:
    """XPath selectors for elements on CNN pages."""
    # <div class="timestamp"> holding the article's publish datetime attribute.
    publishing_time_element = "//div[@class='timestamp']"
    # Images inside CNN's picture wrapper divs.
    article_image = "//div[@class='image__picture']//img"
    # Anchor tags CNN tags as article links on section pages.
    articles_elements = "//a[contains(@data-link-type, 'article')]"
    # Body paragraphs inside the main article container.
    text_block = "//main[@class='article__main']//p[@data-component-name='paragraph']"
    # Close button of the marketing popup overlay.
    popup_close_button = "//*[@class='bx-close bx-close-link bx-close-inside']"