From ad4d30b188acbb1d440e9e56a2533faa0f11a34b Mon Sep 17 00:00:00 2001
From: Eiko Wagenknecht
Date: Thu, 31 Mar 2022 13:23:27 +0200
Subject: [PATCH] Add gog giveaways support (#33)

Signed-off-by: Eiko Wagenknecht
---
 app/common.py                    |   1 +
 app/feed.py                      |   4 +-
 app/scraper/info/utils.py        |   2 +-
 app/scraper/loot/amazon_prime.py |   6 +-
 app/scraper/loot/gog.py          | 215 +++++++++++++++++++++++++++++++
 config.default.ini               |   1 +
 lootscraper.py                   |   9 ++
 7 files changed, 232 insertions(+), 6 deletions(-)
 create mode 100644 app/scraper/loot/gog.py

diff --git a/app/common.py b/app/common.py
index 51d2cdf..6758e08 100644
--- a/app/common.py
+++ b/app/common.py
@@ -22,6 +22,7 @@ class Source(Enum):
     AMAZON = "Amazon Prime"
     EPIC = "Epic Games"
     STEAM = "Steam"
+    GOG = "GOG"
 
 
 @dataclass
diff --git a/app/feed.py b/app/feed.py
index 907426b..79fb74b 100644
--- a/app/feed.py
+++ b/app/feed.py
@@ -3,12 +3,12 @@
 from datetime import datetime
 from pathlib import Path
 
-from feedgen.feed import FeedGenerator, FeedEntry
+from feedgen.feed import FeedEntry, FeedGenerator
 
 from .common import (
-    TIMESTAMP_SHORT,
     TIMESTAMP_LONG,
     TIMESTAMP_READABLE_WITH_HOUR,
+    TIMESTAMP_SHORT,
     LootOffer,
     OfferType,
     Source,
diff --git a/app/scraper/info/utils.py b/app/scraper/info/utils.py
index 5333070..15affc0 100644
--- a/app/scraper/info/utils.py
+++ b/app/scraper/info/utils.py
@@ -1,5 +1,5 @@
-import re
 import difflib
+import re
 
 RESULT_MATCH_THRESHOLD = 0.85
 
diff --git a/app/scraper/loot/amazon_prime.py b/app/scraper/loot/amazon_prime.py
index b10e1d7..2e1e450 100644
--- a/app/scraper/loot/amazon_prime.py
+++ b/app/scraper/loot/amazon_prime.py
@@ -23,10 +23,10 @@ XPATH_GAMES = (
     '//div[@data-a-target="offer-list-FGWP_FULL"]//div[@data-a-target="Offer"]'
 )
 
-SUBPATH_TITLE = './/div[contains(concat(" ", normalize-space(@class), " "), " offer__body__titles")]/h3'
-SUBPATH_PARAGRAPH = './/div[contains(concat(" ", normalize-space(@class), " "), " offer__body__titles")]/p'
+SUBPATH_TITLE = './/div[contains(concat(" ", normalize-space(@class), " "), " offer__body__titles ")]/h3'
+SUBPATH_PARAGRAPH = './/div[contains(concat(" ", normalize-space(@class), " "), " offer__body__titles ")]/p'
 SUBPATH_ENDDATE = (
-    './/div[contains(concat(" ", normalize-space(@class), " "), " claim-info")]//p/span'
+    './/div[contains(concat(" ", normalize-space(@class), " "), " claim-info ")]//p/span'
 )
 SUBPATH_LINK = './/a[@data-a-target="learn-more-card"]'
 SUBPATH_IMG = './/img[@class="tw-image"]'
diff --git a/app/scraper/loot/gog.py b/app/scraper/loot/gog.py
new file mode 100644
index 0000000..d3db0aa
--- /dev/null
+++ b/app/scraper/loot/gog.py
@@ -0,0 +1,215 @@
+import logging
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from time import sleep
+
+from selenium.common.exceptions import WebDriverException
+from selenium.webdriver.chrome.webdriver import WebDriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.remote.webelement import WebElement
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+
+from app.common import LootOffer, OfferType, Source
+from app.scraper.loot.scraper import Scraper
+
+SCRAPER_NAME = "GOG"
+ROOT_URL = "https://www.gog.com/#giveaway"
+MAX_WAIT_SECONDS = 60  # Needs to be quite high in Docker for first run
+
+XPATH_PAGE_LOADED = """//div[@class="content cf"]"""
+XPATH_GIVEAWAY = """//a[contains(concat(" ", normalize-space(@class), " "), " giveaway-banner ")]"""  # URL: Attribute href
+XPATH_SWITCH_TO_ENGLISH = """//li[@class="footer-microservice-language__item"][1]"""
+SUBPATH_TITLE = """.//span[contains(concat(" ", normalize-space(@class), " "), " giveaway-banner__title ")]"""
+SUBPATH_IMAGE = """.//div[contains(concat(" ", normalize-space(@class), " "), " giveaway-banner__image ")]//source[@type="image/png" and not(@media)]"""  # Attribute srcset, first entry without the "2x text + root url"
+SUBPATH_VALID_TO = """.//gog-countdown-timer"""  # Attr "end-date" without the last 3 digits (000) is the timestamp in unixtime
+
+
+@dataclass
+class RawOffer:
+    """Strings read from the giveaway banner; None where reading failed."""
+
+    title: str | None
+    valid_to: str | None
+    url: str | None
+    img_url: str | None
+
+
+class GogScraper(Scraper):
+    """Scraper for the giveaway banner on the GOG front page."""
+
+    @staticmethod
+    def scrape(
+        driver: WebDriver, options: dict[str, bool] | None = None
+    ) -> dict[str, list[LootOffer]]:
+        """Open the GOG front page and collect the game giveaway offers.
+
+        Returns an empty dict if options are given and games are disabled,
+        otherwise a dict with the offers stored under OfferType.GAME.name.
+        """
+        if options and not options[OfferType.GAME.name]:
+            return {}
+
+        driver.get(ROOT_URL)
+
+        offers = {}
+
+        logging.info(f"Analyzing {ROOT_URL} for {OfferType.GAME.value} offers")
+        offers[OfferType.GAME.name] = GogScraper.read_offers_from_page(driver)
+
+        return offers
+
+    @staticmethod
+    def read_offers_from_page(driver: WebDriver) -> list[LootOffer]:
+        """Read the giveaway from the loaded page.
+
+        Returns an empty list if the page doesn't load, the language can't
+        be switched to English or no giveaway banner shows up in time.
+        """
+        try:
+            # Wait until the page loaded
+            WebDriverWait(driver, MAX_WAIT_SECONDS).until(
+                EC.presence_of_element_located((By.XPATH, XPATH_PAGE_LOADED))
+            )
+        except WebDriverException:
+            logging.error(f"Page took longer than {MAX_WAIT_SECONDS} to load")
+            return []
+
+        try:
+            # Switch to the English version, the title affixes stripped in
+            # read_raw_offer are English
+            en = driver.find_element(By.XPATH, XPATH_SWITCH_TO_ENGLISH)
+            en.click()
+            sleep(1)  # Wait for the language switching to begin
+        except WebDriverException:
+            logging.error("Couldn't switch to English")
+            return []
+
+        try:
+            # Wait until the giveaway banner is present
+            WebDriverWait(driver, MAX_WAIT_SECONDS).until(
+                EC.presence_of_element_located((By.XPATH, XPATH_GIVEAWAY))
+            )
+        except WebDriverException:
+            logging.info(
+                f"Giveaways took longer than {MAX_WAIT_SECONDS} to load, probably there are none"
+            )
+            return []
+
+        offer_element = driver.find_element(By.XPATH, XPATH_GIVEAWAY)
+
+        raw_offers: list[RawOffer] = []
+        raw_offers.append(GogScraper.read_raw_offer(offer_element))
+
+        normalized_offers = GogScraper.normalize_offers(raw_offers)
+
+        return normalized_offers
+
+    @staticmethod
+    def read_raw_offer(element: WebElement) -> RawOffer:
+        """Read title, end date, link and image URL from the giveaway banner.
+
+        Values that are missing or can't be read stay None.
+        """
+        title_str = None
+        valid_to_str = None
+        url_str = None
+        img_url_str = None
+
+        try:
+            title_str = str(element.find_element(By.XPATH, SUBPATH_TITLE).text)
+            title_str = title_str.removeprefix("Claim ")
+            title_str = title_str.removesuffix(
+                " and don't miss the best GOG offers in the future!"
+            )
+        except WebDriverException:
+            # Nothing to do here, value stays None
+            pass
+
+        # get_attribute() returns None for missing attributes. Check for that
+        # instead of wrapping it in str(), which would store the text "None".
+        try:
+            timer = element.find_element(By.XPATH, SUBPATH_VALID_TO)
+            end_date = timer.get_attribute("end-date")
+            if end_date:
+                valid_to_str = str(end_date)
+        except WebDriverException:
+            # Nothing to do here, value stays None
+            pass
+
+        try:
+            href = element.get_attribute("href")  # type: ignore
+            if href:
+                url_str = str(href)
+        except WebDriverException:
+            # Nothing to do here, value stays None
+            pass
+
+        try:
+            source = element.find_element(By.XPATH, SUBPATH_IMAGE)
+            srcset = source.get_attribute("srcset")
+            if srcset:
+                # Use the first entry without its " 1x" / " 2x" descriptor
+                img_url_str = "https:" + (
+                    srcset.split(",")[0]
+                    .strip()
+                    .removesuffix(" 2x")
+                    .removesuffix(" 1x")
+                )
+        except WebDriverException:
+            # Nothing to do here, value stays None
+            pass
+
+        # For current offers, the date is included twice but only means the enddate
+
+        return RawOffer(
+            title=title_str,
+            valid_to=valid_to_str,
+            url=url_str,
+            img_url=img_url_str,
+        )
+
+    @staticmethod
+    def normalize_offers(raw_offers: list[RawOffer]) -> list[LootOffer]:
+        """Convert raw offers to LootOffers, dropping offers without a title."""
+        normalized_offers: list[LootOffer] = []
+
+        for offer in raw_offers:
+            # Raw text
+            rawtext = ""
+            if offer.title:
+                rawtext += f"{offer.title}"
+
+            if offer.valid_to:
+                rawtext += f"{offer.valid_to}"
+
+            # Title (surrounding banner text was already stripped when reading)
+            title = offer.title
+
+            # Valid to, given in milliseconds since the Unix epoch
+            valid_to_stamp = None
+            if offer.valid_to:
+                try:
+                    valid_to_unix = int(offer.valid_to) / 1000
+                    valid_to_stamp = datetime.fromtimestamp(
+                        valid_to_unix, tz=timezone.utc
+                    )
+                except (ValueError, OverflowError, OSError):
+                    # Not a number or outside the supported date range
+                    valid_to_stamp = None
+
+            nearest_url = offer.url if offer.url else ROOT_URL
+            loot_offer = LootOffer(
+                seen_last=datetime.now(timezone.utc),
+                source=Source.GOG,
+                type=OfferType.GAME,
+                rawtext=rawtext,
+                title=title,
+                valid_to=valid_to_stamp,
+                url=nearest_url,
+                img_url=offer.img_url,
+            )
+
+            if title:
+                normalized_offers.append(loot_offer)
+        return normalized_offers
diff --git a/config.default.ini b/config.default.ini
index 8cbf523..a81a2c8 100644
--- a/config.default.ini
+++ b/config.default.ini
@@ -15,6 +15,7 @@ ForceUpdate = yes
 [sources_loot]
 Amazon = yes
 Epic = yes
+Gog = yes
 Steam = yes
 
 [sources_info]
diff --git a/lootscraper.py b/lootscraper.py
index 979be37..e34355c 100644
--- a/lootscraper.py
+++ b/lootscraper.py
@@ -19,6 +19,7 @@
 from app.scraper.info.steam import get_steam_details
 from app.scraper.loot.amazon_prime import AmazonScraper
 from app.scraper.loot.epic_games import EpicScraper
+from app.scraper.loot.gog import GogScraper
 from app.scraper.loot.steam import SteamScraper
 from app.upload import upload_to_server
 
@@ -88,6 +89,7 @@ def job() -> None:
             cfg_amazon: bool = Config.config().getboolean("sources_loot", "Amazon")
             cfg_epic: bool = Config.config().getboolean("sources_loot", "Epic")
             cfg_steam: bool = Config.config().getboolean("sources_loot", "Steam")
+            cfg_gog: bool = Config.config().getboolean("sources_loot", "Gog")
 
             cfg_games: bool = Config.config().getboolean("actions", "ScrapeGames")
             cfg_loot: bool = Config.config().getboolean("actions", "ScrapeLoot")
@@ -117,6 +119,13 @@ def job() -> None:
             else:
                 logging.info(f"Skipping {Source.STEAM.value}")
 
+            if cfg_gog:
+                scraped_offers[Source.GOG.name] = GogScraper.scrape(
+                    webdriver, cfg_what_to_scrape
+                )
+            else:
+                logging.info(f"Skipping {Source.GOG.value}")
+
             # Check which offers are new and which are updated, then act accordingly:
             # - Offers that are neither new nor updated just get a new date
             # - Offers that are new are inserted