From 1c818dbd1b5d4c39bcdf9581808cbfcaf8672e71 Mon Sep 17 00:00:00 2001 From: Roberto Bochet Date: Wed, 26 Jun 2024 05:19:59 +0200 Subject: [PATCH] Squashed commit of the following: commit b95f6c9d3171cf0c43a57e42e98c534a9d9f5e85 Author: Roberto Bochet Date: Wed Jun 26 05:18:37 2024 +0200 Update config.example.yaml commit 8a7bd0bdc80798d95197f61ec8e947e2e40e5cb9 Author: Roberto Bochet Date: Wed Jun 26 04:58:13 2024 +0200 Move config example commit 2cb20f940c81403c3ec76eb46233f56bfc858282 Author: Roberto Bochet Date: Wed Jun 26 04:57:02 2024 +0200 Minor change in README commit 47800f681cf0d2134569932673b210fca106f812 Author: Roberto Bochet Date: Wed Jun 26 04:54:00 2024 +0200 Make redis async commit 7f03ae6bd2d867c908b19174aee814b89f24fe5a Author: Roberto Bochet Date: Wed Jun 26 04:53:43 2024 +0200 Implement scraper bot logic commit ac08eea9071955fddce282ca76c5a39494eb2b99 Author: Roberto Bochet Date: Wed Jun 26 04:52:46 2024 +0200 Fix result entry hash commit 166f8aa8eec001bf7c48e7b6aec99a5ad522d218 Author: Roberto Bochet Date: Wed Jun 26 03:44:06 2024 +0200 Renaming commit 6e3fc3f756c4764d59b7967839fa1aeca0816d7f Author: Roberto Bochet Date: Wed Jun 26 03:43:31 2024 +0200 Create class for results commit 381af6e1f03acd9de5dcdff8af97d71a124ec4e1 Author: Roberto Bochet Date: Wed Jun 26 03:40:02 2024 +0200 Make settings persistent commit 3d4549151ca40f17df7a4143d0df303cdb102f23 Author: Roberto Bochet Date: Wed Jun 26 02:15:45 2024 +0200 Fix sigint exit commit 396870ad137d39dfcde1886d0d311d11e2240cf9 Author: Roberto Bochet Date: Wed Jun 26 00:48:05 2024 +0200 Use playwright in scraper task commit 7764364b23ab10808068a20bd8f20bf917aa453e Author: Roberto Bochet Date: Wed Jun 26 00:47:42 2024 +0200 Minor change commit 43512a06f97c34316e62e1b4bc682e25f28650ff Author: Roberto Bochet Date: Wed Jun 26 00:46:52 2024 +0200 Use async notify version commit 78b4027829eb262f8f203ec6d50fd07741a8abbe Author: Roberto Bochet Date: Wed Jun 26 00:18:43 2024 +0200 Add playwright installer in docker commit 411b8f7400e0710aacc5b742f04bc78e32aef41d Author: Roberto Bochet Date: Wed Jun 26 00:16:28 2024 +0200 Update settings commit a51daa6220fca7375fa67418a4dfde36e11d5dea Author: Roberto Bochet Date: Tue Jun 25 07:13:42 2024 +0200 Get demonize also via cli commit 1fabcde02a345e8ce829fcb302762dc6cd97a34a Author: Roberto Bochet Date: Tue Jun 25 07:13:22 2024 +0200 Update dependencies commit d4ca5eb8676e25d33d8cc5cfd60a3e9615c7f7d2 Author: Roberto Bochet Date: Tue Jun 25 07:11:39 2024 +0200 Force v in version tag for the ci commit ee9dfe66e56858f62a2abfb23e7504c416e1612f Author: Roberto Bochet Date: Tue Jun 25 07:11:21 2024 +0200 Remove old files commit 0aa63ede15e723c7833bdd518fa6f2776c4afedc Author: Roberto Bochet Date: Tue Jun 25 07:09:32 2024 +0200 Update settings --- .github/workflows/build-container.yml | 2 +- Dockerfile | 6 +- README.md | 4 +- config.example.yaml | 42 +++++ config.yaml | 38 ----- poetry.lock | 156 ++++++++++++++---- pyproject.toml | 4 +- scraper_bot/__main__.py | 42 ++++- scraper_bot/bot/__init__.py | 1 - scraper_bot/bot/_bot.py | 29 ---- scraper_bot/cache/__init__.py | 2 +- scraper_bot/cache/_cache.py | 14 -- scraper_bot/cache/cache.py | 27 +++ scraper_bot/exceptions/__init__.py | 2 - scraper_bot/notifications/__init__.py | 1 + scraper_bot/notifications/notifications.py | 28 +++- scraper_bot/scraper/__init__.py | 2 +- scraper_bot/scraper/_scraper.py | 97 ----------- scraper_bot/scraper/browser_manager.py | 41 +++++ .../scraper/{_exceptions.py => exceptions.py} | 0 scraper_bot/scraper/scraper.py | 
31 ++++ scraper_bot/scraper/scraper_task.py | 57 +++++++ scraper_bot/scraper/scraper_task_result.py | 29 ++++ .../scraper/scraper_task_result_entity.py | 38 +++++ scraper_bot/scraper_bot/__init__.py | 3 +- scraper_bot/scraper_bot/_scraper_bot.py | 50 ------ scraper_bot/scraper_bot/_task.py | 35 ---- scraper_bot/scraper_bot/scraper_bot.py | 54 ++++++ scraper_bot/settings/browser.py | 21 +++ scraper_bot/settings/notifications.py | 55 ++++++ scraper_bot/settings/settings.py | 116 +++++-------- scraper_bot/settings/task.py | 36 ++++ .../{AppriseURI.py => apprise_uri.py} | 2 +- 33 files changed, 668 insertions(+), 397 deletions(-) create mode 100644 config.example.yaml delete mode 100644 config.yaml delete mode 100644 scraper_bot/bot/__init__.py delete mode 100644 scraper_bot/bot/_bot.py delete mode 100644 scraper_bot/cache/_cache.py create mode 100644 scraper_bot/cache/cache.py delete mode 100644 scraper_bot/exceptions/__init__.py delete mode 100644 scraper_bot/scraper/_scraper.py create mode 100644 scraper_bot/scraper/browser_manager.py rename scraper_bot/scraper/{_exceptions.py => exceptions.py} (100%) create mode 100644 scraper_bot/scraper/scraper.py create mode 100644 scraper_bot/scraper/scraper_task.py create mode 100644 scraper_bot/scraper/scraper_task_result.py create mode 100644 scraper_bot/scraper/scraper_task_result_entity.py delete mode 100644 scraper_bot/scraper_bot/_scraper_bot.py delete mode 100644 scraper_bot/scraper_bot/_task.py create mode 100644 scraper_bot/scraper_bot/scraper_bot.py create mode 100644 scraper_bot/settings/browser.py create mode 100644 scraper_bot/settings/notifications.py create mode 100644 scraper_bot/settings/task.py rename scraper_bot/utilities/{AppriseURI.py => apprise_uri.py} (93%) diff --git a/.github/workflows/build-container.yml b/.github/workflows/build-container.yml index 2a3caca..47f0785 100644 --- a/.github/workflows/build-container.yml +++ b/.github/workflows/build-container.yml @@ -3,7 +3,7 @@ name: build-container on: push: tags: - - '?[0-9]+.[0-9]+.[0-9]+' + - 'v[0-9]+.[0-9]+.[0-9]+' jobs: build-container: diff --git a/Dockerfile b/Dockerfile index a422133..0bc6448 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,7 +17,7 @@ COPY . . RUN poetry build --format wheel -FROM python:3.12-alpine +FROM python:3.12-slim VOLUME /app @@ -25,4 +25,8 @@ COPY --from=compiler /app/dist/*.whl / RUN pip3 install --no-cache-dir -- *.whl +RUN playwright install --with-deps firefox + +ENV SB__BROWSER__TYPE="firefox" + ENTRYPOINT python3 -m scraper_bot diff --git a/README.md b/README.md index 0614e10..a44a820 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ As alternative, you can build by yourself the python package or the container ### Fast deploy (docker-compose) 1. [Create a telegram bot](https://core.telegram.org/bots#3-how-do-i-create-a-bot) and retrieve its token -2. Download `config.yaml` and put into `/etc/scraperbot` folder +2. Download `config.example.yaml` and rename it to `config.yaml` 3. Change the configuration follow the [guidelines](#configuration) 4. Download `docker-compose.yaml` 5. Start the scraper with `docker-compose` @@ -44,4 +44,4 @@ Furthermore you can get the config json schema from command line with `--config- scraper_bot --config-schema ``` -You can also find a configuration example in `config.yaml`. +You can also find a configuration example in `config.example.yaml`. 
diff --git a/config.example.yaml b/config.example.yaml new file mode 100644 index 0000000..a8fe3e7 --- /dev/null +++ b/config.example.yaml @@ -0,0 +1,42 @@ +####################### +# Example config.yaml # +####################### +# This file contains a config example +# intended to find real estate ads +# In particular we look for an apartment +# in Lodi with at least three rooms +notifications: + message: | + # [{{title}}]({{url}}) + {% if location %}📍 *{{location}}*{% endif %} + {% if price %}💶 *{{price}}€*{% endif %} + {% if size %}📐 *{{size}}m²*{% endif %} + format: markdown + channels: + # A list of apprise-supported channels + # where the scraped entities will be sent + - "tgram://{YOUR_BOT_TOKEN}/{CHAT_ID1}" + - "tgram://{YOUR_BOT_TOKEN}/{CHAT_ID2}" + - message: "Found a new ad at {{url}}" + format: "text" + uri: "discord://webhook_id/webhook_token" +tasks: + - name: "immobiliare.it" + url: "https://www.immobiliare.it/affitto-case/lodi/?criterio=rilevanza&localiMinimo=3" + target: | + [...document.querySelectorAll("li.in-searchLayoutListItem")].map(t =>({ + url: t.querySelector("a.in-listingCardTitle")?.href, + title: t.querySelector("a.in-listingCardTitle")?.innerText, + price: t.querySelector(".in-listingCardPrice span")?.innerText, + size: t.querySelector(".in-listingCardFeatureList__item:nth-child(2) span")?.innerText.replace(/[^0-9]+/g,"") + })) + - name: "mioaffitto" + url: "https://www.mioaffitto.it/search?provincia=50&poblacion=67355" + target: | + [...document.querySelectorAll(".property-list .propertyCard:not(.property-alternative)")].map(t=> ({ + url: t.querySelector("a")?.href, + title: t.querySelector("a p")?.innerText, + price: t.querySelector(".propertyCard__price--value")?.innerText.replace(/[^0-9]+/g,""), + size: t.querySelector(".propertyCard__details li:has(.fa-size-o)")?.innerText.replace(/[^0-9]+/g,""), + location: t.querySelector(".propertyCard__location p")?.innerText + })) diff --git a/config.yaml b/config.yaml deleted file mode 100644 index 84cdd99..0000000 --- a/config.yaml +++ /dev/null @@ -1,38 +0,0 @@ -####################### -# Example config.yaml # -####################### -# This file contains a config example -# thought to find real estate ads -# In particular we look for an apartment -# in Milano at least tree rooms -notifications: - title: "New ads found" - message: | - **Ads found found** - - You can found it [here]({{url}}) - format: markdown - channels: - # It is a list of apprise supported channels - # where the scraped entities have to be sent - - "tgram://{YOUR_BOT_TOKEN}/{CHAT_ID1}" - - "tgram://{YOUR_BOT_TOKEN}/{CHAT_ID2}" - - message: "Found a new adds at {{url}}" - format: "text" - uri: "discord://webhook_id/webhook_token" -tasks: - # Created an entity for each site/link - # In this case we look to two site - - name: "immobiliare" - # In `url`, `{i}` is a placeholder - # for the pagination - url: "https://www.immobiliare.it/affitto-case/milano/?criterio=rilevanza&localiMinimo=3&pag={i}" - # `target` is a unique css selector - # to target the tag contains - # the link to the scraped page - target: "div.in-card.nd-mediaObject.nd-mediaObject--colToRow.in-realEstateCard.in-realEstateCard--interactive.in-realEstateListCard a.in-card__title" - interval: 3600 - - name: "trovacasa" - url: "https://www.trovacasa.net/Affitto/MI/Milano/index.aspx?nrlocMin=3&pag={i}" - target: "ol#risultati li.annuncio a.js_linkdettaglio" - interval: 3600 diff --git a/poetry.lock b/poetry.lock index 5b6da86..9594a91 100644 --- a/poetry.lock +++ b/poetry.lock
@@ -30,24 +30,6 @@ PyYAML = "*" requests = "*" requests-oauthlib = "*" -[[package]] -name = "beautifulsoup4" -version = "4.10.0" -description = "Screen-scraping library" -optional = false -python-versions = ">3.0.0" -files = [ - {file = "beautifulsoup4-4.10.0-py3-none-any.whl", hash = "sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf"}, - {file = "beautifulsoup4-4.10.0.tar.gz", hash = "sha256:c23ad23c521d818955a4151a67d81580319d4bf548d3d49f4223ae041ff98891"}, -] - -[package.dependencies] -soupsieve = ">1.2" - -[package.extras] -html5lib = ["html5lib"] -lxml = ["lxml"] - [[package]] name = "certifi" version = "2024.6.2" @@ -221,6 +203,77 @@ docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1 testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8.0.1)", "pytest (>=7.4.3)", "pytest-asyncio (>=0.21)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)", "virtualenv (>=20.26.2)"] typing = ["typing-extensions (>=4.8)"] +[[package]] +name = "greenlet" +version = "3.0.3" +description = "Lightweight in-process concurrent programming" +optional = false +python-versions = ">=3.7" +files = [ + {file = "greenlet-3.0.3-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:9da2bd29ed9e4f15955dd1595ad7bc9320308a3b766ef7f837e23ad4b4aac31a"}, + {file = "greenlet-3.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d353cadd6083fdb056bb46ed07e4340b0869c305c8ca54ef9da3421acbdf6881"}, + {file = "greenlet-3.0.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dca1e2f3ca00b84a396bc1bce13dd21f680f035314d2379c4160c98153b2059b"}, + {file = "greenlet-3.0.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3ed7fb269f15dc662787f4119ec300ad0702fa1b19d2135a37c2c4de6fadfd4a"}, + {file = "greenlet-3.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd4f49ae60e10adbc94b45c0b5e6a179acc1736cf7a90160b404076ee283cf83"}, + {file = "greenlet-3.0.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:73a411ef564e0e097dbe7e866bb2dda0f027e072b04da387282b02c308807405"}, + {file = "greenlet-3.0.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:7f362975f2d179f9e26928c5b517524e89dd48530a0202570d55ad6ca5d8a56f"}, + {file = "greenlet-3.0.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:649dde7de1a5eceb258f9cb00bdf50e978c9db1b996964cd80703614c86495eb"}, + {file = "greenlet-3.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:68834da854554926fbedd38c76e60c4a2e3198c6fbed520b106a8986445caaf9"}, + {file = "greenlet-3.0.3-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:b1b5667cced97081bf57b8fa1d6bfca67814b0afd38208d52538316e9422fc61"}, + {file = "greenlet-3.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:52f59dd9c96ad2fc0d5724107444f76eb20aaccb675bf825df6435acb7703559"}, + {file = "greenlet-3.0.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:afaff6cf5200befd5cec055b07d1c0a5a06c040fe5ad148abcd11ba6ab9b114e"}, + {file = "greenlet-3.0.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fe754d231288e1e64323cfad462fcee8f0288654c10bdf4f603a39ed923bef33"}, + {file = "greenlet-3.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2797aa5aedac23af156bbb5a6aa2cd3427ada2972c828244eb7d1b9255846379"}, + {file = "greenlet-3.0.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:b7f009caad047246ed379e1c4dbcb8b020f0a390667ea74d2387be2998f58a22"}, + {file = "greenlet-3.0.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c5e1536de2aad7bf62e27baf79225d0d64360d4168cf2e6becb91baf1ed074f3"}, + {file = "greenlet-3.0.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:894393ce10ceac937e56ec00bb71c4c2f8209ad516e96033e4b3b1de270e200d"}, + {file = "greenlet-3.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:1ea188d4f49089fc6fb283845ab18a2518d279c7cd9da1065d7a84e991748728"}, + {file = "greenlet-3.0.3-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:70fb482fdf2c707765ab5f0b6655e9cfcf3780d8d87355a063547b41177599be"}, + {file = "greenlet-3.0.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d4d1ac74f5c0c0524e4a24335350edad7e5f03b9532da7ea4d3c54d527784f2e"}, + {file = "greenlet-3.0.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:149e94a2dd82d19838fe4b2259f1b6b9957d5ba1b25640d2380bea9c5df37676"}, + {file = "greenlet-3.0.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:15d79dd26056573940fcb8c7413d84118086f2ec1a8acdfa854631084393efcc"}, + {file = "greenlet-3.0.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:881b7db1ebff4ba09aaaeae6aa491daeb226c8150fc20e836ad00041bcb11230"}, + {file = "greenlet-3.0.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fcd2469d6a2cf298f198f0487e0a5b1a47a42ca0fa4dfd1b6862c999f018ebbf"}, + {file = "greenlet-3.0.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1f672519db1796ca0d8753f9e78ec02355e862d0998193038c7073045899f305"}, + {file = "greenlet-3.0.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2516a9957eed41dd8f1ec0c604f1cdc86758b587d964668b5b196a9db5bfcde6"}, + {file = "greenlet-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:bba5387a6975598857d86de9eac14210a49d554a77eb8261cc68b7d082f78ce2"}, + {file = "greenlet-3.0.3-cp37-cp37m-macosx_11_0_universal2.whl", hash = "sha256:5b51e85cb5ceda94e79d019ed36b35386e8c37d22f07d6a751cb659b180d5274"}, + {file = "greenlet-3.0.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:daf3cb43b7cf2ba96d614252ce1684c1bccee6b2183a01328c98d36fcd7d5cb0"}, + {file = "greenlet-3.0.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99bf650dc5d69546e076f413a87481ee1d2d09aaaaaca058c9251b6d8c14783f"}, + {file = "greenlet-3.0.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2dd6e660effd852586b6a8478a1d244b8dc90ab5b1321751d2ea15deb49ed414"}, + {file = "greenlet-3.0.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e3391d1e16e2a5a1507d83e4a8b100f4ee626e8eca43cf2cadb543de69827c4c"}, + {file = "greenlet-3.0.3-cp37-cp37m-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e1f145462f1fa6e4a4ae3c0f782e580ce44d57c8f2c7aae1b6fa88c0b2efdb41"}, + {file = "greenlet-3.0.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:1a7191e42732df52cb5f39d3527217e7ab73cae2cb3694d241e18f53d84ea9a7"}, + {file = "greenlet-3.0.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:0448abc479fab28b00cb472d278828b3ccca164531daab4e970a0458786055d6"}, + {file = "greenlet-3.0.3-cp37-cp37m-win32.whl", hash = "sha256:b542be2440edc2d48547b5923c408cbe0fc94afb9f18741faa6ae970dbcb9b6d"}, + {file = "greenlet-3.0.3-cp37-cp37m-win_amd64.whl", hash = "sha256:01bc7ea167cf943b4c802068e178bbf70ae2e8c080467070d01bfa02f337ee67"}, + {file = 
"greenlet-3.0.3-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:1996cb9306c8595335bb157d133daf5cf9f693ef413e7673cb07e3e5871379ca"}, + {file = "greenlet-3.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3ddc0f794e6ad661e321caa8d2f0a55ce01213c74722587256fb6566049a8b04"}, + {file = "greenlet-3.0.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c9db1c18f0eaad2f804728c67d6c610778456e3e1cc4ab4bbd5eeb8e6053c6fc"}, + {file = "greenlet-3.0.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7170375bcc99f1a2fbd9c306f5be8764eaf3ac6b5cb968862cad4c7057756506"}, + {file = "greenlet-3.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b66c9c1e7ccabad3a7d037b2bcb740122a7b17a53734b7d72a344ce39882a1b"}, + {file = "greenlet-3.0.3-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:098d86f528c855ead3479afe84b49242e174ed262456c342d70fc7f972bc13c4"}, + {file = "greenlet-3.0.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:81bb9c6d52e8321f09c3d165b2a78c680506d9af285bfccbad9fb7ad5a5da3e5"}, + {file = "greenlet-3.0.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:fd096eb7ffef17c456cfa587523c5f92321ae02427ff955bebe9e3c63bc9f0da"}, + {file = "greenlet-3.0.3-cp38-cp38-win32.whl", hash = "sha256:d46677c85c5ba00a9cb6f7a00b2bfa6f812192d2c9f7d9c4f6a55b60216712f3"}, + {file = "greenlet-3.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:419b386f84949bf0e7c73e6032e3457b82a787c1ab4a0e43732898a761cc9dbf"}, + {file = "greenlet-3.0.3-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:da70d4d51c8b306bb7a031d5cff6cc25ad253affe89b70352af5f1cb68e74b53"}, + {file = "greenlet-3.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:086152f8fbc5955df88382e8a75984e2bb1c892ad2e3c80a2508954e52295257"}, + {file = "greenlet-3.0.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d73a9fe764d77f87f8ec26a0c85144d6a951a6c438dfe50487df5595c6373eac"}, + {file = "greenlet-3.0.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b7dcbe92cc99f08c8dd11f930de4d99ef756c3591a5377d1d9cd7dd5e896da71"}, + {file = "greenlet-3.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1551a8195c0d4a68fac7a4325efac0d541b48def35feb49d803674ac32582f61"}, + {file = "greenlet-3.0.3-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:64d7675ad83578e3fc149b617a444fab8efdafc9385471f868eb5ff83e446b8b"}, + {file = "greenlet-3.0.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b37eef18ea55f2ffd8f00ff8fe7c8d3818abd3e25fb73fae2ca3b672e333a7a6"}, + {file = "greenlet-3.0.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:77457465d89b8263bca14759d7c1684df840b6811b2499838cc5b040a8b5b113"}, + {file = "greenlet-3.0.3-cp39-cp39-win32.whl", hash = "sha256:57e8974f23e47dac22b83436bdcf23080ade568ce77df33159e019d161ce1d1e"}, + {file = "greenlet-3.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:c5ee858cfe08f34712f548c3c363e807e7186f03ad7a5039ebadb29e8c6be067"}, + {file = "greenlet-3.0.3.tar.gz", hash = "sha256:43374442353259554ce33599da8b692d5aa96f8976d567d4badf263371fbe491"}, +] + +[package.extras] +docs = ["Sphinx", "furo"] +test = ["objgraph", "psutil"] + [[package]] name = "identify" version = "2.5.36" @@ -401,6 +454,43 @@ docs = ["furo (>=2023.9.10)", "proselint (>=0.13)", "sphinx (>=7.2.6)", "sphinx- test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock 
(>=3.12)"] type = ["mypy (>=1.8)"] +[[package]] +name = "playwright" +version = "1.44.0" +description = "A high-level API to automate web browsers" +optional = false +python-versions = ">=3.8" +files = [ + {file = "playwright-1.44.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:c2317a80896796fdeb03d60f06cc229e775ff2e19b80c64b1bb9b29c8a59d992"}, + {file = "playwright-1.44.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:54d44fb634d870839301c2326e1e12a178a1be0de76d0caaec230ab075c2e077"}, + {file = "playwright-1.44.0-py3-none-macosx_11_0_universal2.whl", hash = "sha256:64b67194e73b47ae72acf25f1a9cfacfef38ca2b52e4bb8b0abd385c5deeaadf"}, + {file = "playwright-1.44.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:29161b1fae71f7c402df5b15f0bd3deaeecd8b3d1ecd9ff01271700c66210e7b"}, + {file = "playwright-1.44.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f8c8a3bfea17576d3f94a2363eee195cbda8dbba86975588c7eaac7792b25eee"}, + {file = "playwright-1.44.0-py3-none-win32.whl", hash = "sha256:235e37832deaa9af8a629d09955396259ab757533cc1922f9b0308b4ee0d9cdf"}, + {file = "playwright-1.44.0-py3-none-win_amd64.whl", hash = "sha256:5b8a4a1d4d50f4ff99b47965576322a8c4e34631854b862a25c1feb824be22a8"}, +] + +[package.dependencies] +greenlet = "3.0.3" +pyee = "11.1.0" + +[[package]] +name = "playwright-stealth" +version = "1.0.6" +description = "playwright stealth" +optional = false +python-versions = ">=3, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ + {file = "playwright-stealth-1.0.6.tar.gz", hash = "sha256:b504d951d00fac755c7d13665a29611d415180510bd7d23f14ebc89439ba2043"}, + {file = "playwright_stealth-1.0.6-py3-none-any.whl", hash = "sha256:b1b2bcf58eb6859aa53d42c49b91c4e27b74a6d13fc3d0c85eea513dd55efda3"}, +] + +[package.dependencies] +playwright = "*" + +[package.extras] +test = ["pytest"] + [[package]] name = "pre-commit" version = "3.7.1" @@ -548,6 +638,23 @@ python-dotenv = ">=0.21.0" toml = ["tomli (>=2.0.1)"] yaml = ["pyyaml (>=6.0.1)"] +[[package]] +name = "pyee" +version = "11.1.0" +description = "A rough port of Node.js's EventEmitter to Python with a few tricks of its own" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyee-11.1.0-py3-none-any.whl", hash = "sha256:5d346a7d0f861a4b2e6c47960295bd895f816725b27d656181947346be98d7c1"}, + {file = "pyee-11.1.0.tar.gz", hash = "sha256:b53af98f6990c810edd9b56b87791021a8f54fd13db4edd1142438d44ba2263f"}, +] + +[package.dependencies] +typing-extensions = "*" + +[package.extras] +dev = ["black", "build", "flake8", "flake8-black", "isort", "jupyter-console", "mkdocs", "mkdocs-include-markdown-plugin", "mkdocstrings[python]", "pytest", "pytest-asyncio", "pytest-trio", "sphinx", "toml", "tox", "trio", "trio", "trio-typing", "twine", "twisted", "validate-pyproject[all]"] + [[package]] name = "python-dotenv" version = "1.0.1" @@ -676,17 +783,6 @@ requests = ">=2.0.0" [package.extras] rsa = ["oauthlib[signedtoken] (>=3.0.0)"] -[[package]] -name = "soupsieve" -version = "2.5" -description = "A modern CSS selector implementation for Beautiful Soup." 
-optional = false -python-versions = ">=3.8" -files = [ - {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"}, - {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"}, -] - [[package]] name = "termcolor" version = "2.4.0" @@ -752,4 +848,4 @@ test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "651343e50c41818333a8241de7a5618a31d861f3dbaf50913abe57325919fbba" +content-hash = "bd5346f846abe7cea2a71a26a9a6484d8049353fafa330716ba2320b77619d01" diff --git a/pyproject.toml b/pyproject.toml index 22a9ee3..18db696 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,9 +17,7 @@ classifiers=[ [tool.poetry.dependencies] python = "^3.12" -beautifulsoup4 = ">=4.10.0,<4.11.0" redis = "^4.6.0" -requests = "^2.32.3" ischedule = ">=1.2.2,<1.3.0" pyyaml = ">=6.0,<7.0" pydantic = "^2.7.4" @@ -28,6 +26,8 @@ termcolor = "^2.4.0" urllib3 = "^2.2.2" apprise = "^1.8.0" jinja2 = "^3.1.4" +playwright = "^1.44.0" +playwright-stealth = "^1.0.6" [tool.poetry.group.dev.dependencies] diff --git a/scraper_bot/__main__.py b/scraper_bot/__main__.py index 35510fc..221410c 100644 --- a/scraper_bot/__main__.py +++ b/scraper_bot/__main__.py @@ -1,9 +1,10 @@ #!/usr/bin/env python3 +import asyncio import json import logging.config -import signal -import sys from argparse import ArgumentParser +from asyncio import CancelledError, create_task +from signal import SIGINT from pydantic import ValidationError @@ -13,8 +14,6 @@ def main() -> int: - signal.signal(signal.SIGINT, lambda: sys.exit(0)) - # loads logger config setup_default_logger() @@ -30,6 +29,14 @@ def main() -> int: help="configuration file path", ) + parser.add_argument( + "-d", + "--daemonize", + action="store_true", + dest="daemonize", + help="run the scraper as a daemon instead run only once", + ) + parser.add_argument( "--config-schema", action="store_true", @@ -46,6 +53,8 @@ def main() -> int: # parses args args = vars(parser.parse_args()) + cli_override_settings = {} + if args.get("show_config_schema"): print(json.dumps(Settings.model_json_schema(), indent=2)) return 0 @@ -54,8 +63,11 @@ def main() -> int: Settings.set_settings_path(config_path) LOGGER.info(f"Using config file '{config_path}'") + if args.get("daemonize"): + cli_override_settings["daemonize"] = True + try: - settings = Settings() + settings = Settings(**cli_override_settings) except ValidationError as e: LOGGER.critical(f"Configuration issue: {e}") return 1 @@ -65,11 +77,25 @@ def main() -> int: LOGGER.info("bot_scraper is ready to start") - # starts bot - bot.start() + if not settings.daemonize: + asyncio.run(bot.run_once()) + return 0 + + async def daemonize(): + LOGGER.info("Starting daemon") + task = create_task(bot.run()) + + task.get_loop().add_signal_handler(SIGINT, task.cancel) + + try: + await task + except CancelledError: + LOGGER.info("Daemon has been stopped") + # starts bot as daemon + asyncio.run(daemonize()) return 0 if __name__ == "__main__": - sys.exit(main()) + exit(main()) diff --git a/scraper_bot/bot/__init__.py b/scraper_bot/bot/__init__.py deleted file mode 100644 index b2fbc3b..0000000 --- a/scraper_bot/bot/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from ._bot import Bot diff --git a/scraper_bot/bot/_bot.py b/scraper_bot/bot/_bot.py deleted file mode 100644 index 477b18a..0000000 --- a/scraper_bot/bot/_bot.py +++ /dev/null @@ -1,29 +0,0 @@ 
-import logging -from typing import Self - -from telegram import Bot as _Bot -from telegram.error import BadRequest, Forbidden - -_LOGGER = logging.getLogger(__package__) - - -class Bot(_Bot): - _chats: list[str | int] - - def __init__(self, token: str, chats: list[str | int], **kwargs): - super(Bot, self).__init__(token, **kwargs) - self._chats = chats - - def send_found(self, entry: str) -> None: - _LOGGER.info(f"Sent entry {entry}") - for c in self._chats: - try: - self.send_message(c, entry) - except Forbidden: - _LOGGER.warning(f"Bot is not longer enabled for chat {c}") - except BadRequest: - _LOGGER.warning(f"Chat {c} not found") - - @classmethod - def make(cls, config: dict) -> Self: - return cls(**config) diff --git a/scraper_bot/cache/__init__.py b/scraper_bot/cache/__init__.py index 67482d3..9e8011f 100644 --- a/scraper_bot/cache/__init__.py +++ b/scraper_bot/cache/__init__.py @@ -1 +1 @@ -from ._cache import Cache +from .cache import Cache diff --git a/scraper_bot/cache/_cache.py b/scraper_bot/cache/_cache.py deleted file mode 100644 index 6def3e3..0000000 --- a/scraper_bot/cache/_cache.py +++ /dev/null @@ -1,14 +0,0 @@ -from redis import StrictRedis - - -class Cache: - redis: StrictRedis - - def __init__(self, redis: str = "redis://127.0.0.1/0"): - self.redis = StrictRedis.from_url(redis) - - def exists(self, entry: str) -> bool: - return self.redis.exists(entry) - - def add(self, entry: str) -> None: - self.redis.set(entry, "@") diff --git a/scraper_bot/cache/cache.py b/scraper_bot/cache/cache.py new file mode 100644 index 0000000..0d2e1ec --- /dev/null +++ b/scraper_bot/cache/cache.py @@ -0,0 +1,27 @@ +from asyncio import gather +from typing import Callable, TypeVar + +from redis.asyncio import StrictRedis + +T = TypeVar("T") + + +class Cache: + redis: StrictRedis + + def __init__(self, redis: str = "redis://127.0.0.1/0"): + self.redis = StrictRedis.from_url(redis) + + async def exists(self, entry: str) -> bool: + return await self.redis.exists(entry) != 0 + + async def add(self, entry: str) -> None: + await self.redis.set(entry, "@") + + async def _none_if_exists(self, entry: str, value: T) -> T | None: + if not await self.exists(entry): + return value + return None + + async def filter_exists(self, *entries: T, to_id: Callable[[T], str]) -> list[T]: + return [v for v in await gather(*(self._none_if_exists(to_id(e), e) for e in entries)) if v is not None] diff --git a/scraper_bot/exceptions/__init__.py b/scraper_bot/exceptions/__init__.py deleted file mode 100644 index a0d405c..0000000 --- a/scraper_bot/exceptions/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -class ConfigError(Exception): - pass diff --git a/scraper_bot/notifications/__init__.py b/scraper_bot/notifications/__init__.py index e69de29..738adbd 100644 --- a/scraper_bot/notifications/__init__.py +++ b/scraper_bot/notifications/__init__.py @@ -0,0 +1 @@ +from .notifications import NotificationsManager diff --git a/scraper_bot/notifications/notifications.py b/scraper_bot/notifications/notifications.py index c86b170..0c59248 100644 --- a/scraper_bot/notifications/notifications.py +++ b/scraper_bot/notifications/notifications.py @@ -1,11 +1,15 @@ +from asyncio import gather from logging import getLogger from typing import Any from apprise import Apprise -from scraper_bot.settings.settings import NotificationChannel, NotificationsSettings +from scraper_bot.settings.notifications import ( + NotificationChannel, + NotificationsSettings, +) -_LOGGER = getLogger(__name__) +_LOGGER = getLogger(__package__) class 
NotificationsManager: @@ -25,9 +29,17 @@ def __init__(self, settings: NotificationsSettings): def channels(self) -> list[NotificationChannel]: return self._channels - def notify(self, entity: dict[str, Any]) -> None: - for c in self.channels: - if not self._apprise.notify( - body=c.message_template.render(**entity), title=c.title, body_format=c.format, tag=c.tag - ): - _LOGGER.error(f"Failed to notify {c.uri}") + async def _notify(self, channel: NotificationChannel, entity: dict[str, Any]) -> None: + result = await self._apprise.async_notify( + body=channel.message_template.render(**entity), + title=channel.title, + body_format=channel.format, + tag=channel.tag, + ) + if not result: + _LOGGER.error(f"Failed to notify {channel.uri}") + + async def notify(self, *entity: dict[str, Any]) -> None: + _LOGGER.info(f"Notifying {len(entity)} entities to {len(self._channels)} channels") + await gather(*(self._notify(c, e) for c in self.channels for e in entity)) + _LOGGER.info("Notifying completed") diff --git a/scraper_bot/scraper/__init__.py b/scraper_bot/scraper/__init__.py index 2f22218..b4fcd12 100644 --- a/scraper_bot/scraper/__init__.py +++ b/scraper_bot/scraper/__init__.py @@ -1 +1 @@ -from ._scraper import Scraper +from .scraper import Scraper diff --git a/scraper_bot/scraper/_scraper.py b/scraper_bot/scraper/_scraper.py deleted file mode 100644 index 2f606b3..0000000 --- a/scraper_bot/scraper/_scraper.py +++ /dev/null @@ -1,97 +0,0 @@ -import logging -from typing import Callable -from urllib.parse import urljoin - -import requests -from bs4 import BeautifulSoup - -from ._exceptions import NoTargetFound, RequestError, ScraperError - -_LOGGER = logging.getLogger(__package__) - -_PAGE_PLACEHOLDER = "{i}" - -_REQUESTS_HEADER = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:94.0) Gecko/20100101 Firefox/94.0"} - - -class Scraper: - url: str - target: str - on_find: Callable[[...], None] - - def __init__(self, url: str, target: str, on_find: Callable[[...], None]): - self.url = url - self.target = target - self.on_find = on_find - - @property - def is_multipage(self): - return _PAGE_PLACEHOLDER in str(self.url) - - def _scrape_page(self, url) -> list: - page = requests.get(url, headers=_REQUESTS_HEADER) - - if not page.ok: - raise RequestError - - soup = BeautifulSoup(page.text, "html.parser") - - page_entities = soup.select(self.target) - - if len(page_entities) == 0: - raise NoTargetFound - - page_entities = map(lambda e: e["href"], page_entities) - page_entities = map(lambda e: urljoin(url, e), page_entities) - - return list(page_entities) - - def run(self): - _LOGGER.info(f"Start scraping {self.url}") - - entities = [] - last_page_entities = [] - - if self.is_multipage: - i = 0 - while True: - i += 1 - url = str(self.url).replace(_PAGE_PLACEHOLDER, f"{i}") - - _LOGGER.info(f"Get page {url}") - - try: - page_entities = self._scrape_page(url) - except ScraperError: - break - - # some site given a pagination greater than - # the last page return the last page - # if all links are identical between - # two consequential pages then break - # this is a WA to handle this situation - if ( - len(page_entities) == len(last_page_entities) - and len([1 for i, j in zip(page_entities, last_page_entities) if i != j]) == 0 - ): - break - - _LOGGER.info(f"Found {len(page_entities)} entries in the current page") - - entities += page_entities - last_page_entities = page_entities - - else: - try: - entities = self._scrape_page(self.url) - except ScraperError: - pass - - _LOGGER.info(f"Found {len(entities)} 
entries") - - if len(entities) == 0: - _LOGGER.warning("Nothing found: maybe a scraper countermeasure was triggered") - - self.on_find(*entities) - - _LOGGER.info(f"Scraping {self.url} completed") diff --git a/scraper_bot/scraper/browser_manager.py b/scraper_bot/scraper/browser_manager.py new file mode 100644 index 0000000..f9c1ab6 --- /dev/null +++ b/scraper_bot/scraper/browser_manager.py @@ -0,0 +1,41 @@ +from contextlib import asynccontextmanager +from logging import getLogger + +from playwright.async_api import Browser, Error, async_playwright + +from scraper_bot.settings.browser import BrowserSettings + +_LOGGER = getLogger(__name__) + + +class BrowserManager: + def __init__(self, settings: BrowserSettings): + self._settings = settings + + @asynccontextmanager + async def launch_browser(self) -> Browser: + async with async_playwright() as pw: + browser_types = [ + next((b for b in [pw.firefox, pw.chromium, pw.webkit] if b.name == i)) for i in self._settings.type + ] + + for browser_type in browser_types: + try: + browser = await browser_type.launch(headless=self._settings.headless) + except Error as e: + _LOGGER.debug(e) + _LOGGER.warning(f"{browser_type.name} not available") + continue + + _LOGGER.info(f"Use {browser.browser_type.name}") + + try: + yield browser + finally: + await browser.close() + + break + + @property + def stealth_enabled(self) -> bool: + return self._settings.stealthEnabled diff --git a/scraper_bot/scraper/_exceptions.py b/scraper_bot/scraper/exceptions.py similarity index 100% rename from scraper_bot/scraper/_exceptions.py rename to scraper_bot/scraper/exceptions.py diff --git a/scraper_bot/scraper/scraper.py b/scraper_bot/scraper/scraper.py new file mode 100644 index 0000000..6628a55 --- /dev/null +++ b/scraper_bot/scraper/scraper.py @@ -0,0 +1,31 @@ +from asyncio import gather +from logging import getLogger + +from scraper_bot.settings.browser import BrowserSettings +from scraper_bot.settings.task import TaskSettings + +from .browser_manager import BrowserManager +from .scraper_task import ScraperTask +from .scraper_task_result import ScraperTaskResult + +_LOGGER = getLogger(__package__) + + +class Scraper: + _tasks: list[ScraperTask] = [] + + def __init__(self, browser_settings: BrowserSettings): + self._browser_manager = BrowserManager(browser_settings) + + def add_task(self, *tasks: TaskSettings) -> list[ScraperTask]: + for t in tasks: + self._tasks.append(ScraperTask(t, browser_manager=self._browser_manager)) + _LOGGER.info(f"Created task {t.name}") + return self._tasks + + @property + def tasks(self) -> list[ScraperTask]: + return self._tasks + + async def run(self) -> tuple[ScraperTaskResult, ...]: + return await gather(*(t.run() for t in self._tasks)) diff --git a/scraper_bot/scraper/scraper_task.py b/scraper_bot/scraper/scraper_task.py new file mode 100644 index 0000000..c7cad55 --- /dev/null +++ b/scraper_bot/scraper/scraper_task.py @@ -0,0 +1,57 @@ +from logging import Logger, getLogger + +from playwright_stealth import stealth_async + +from scraper_bot.settings.task import TaskSettings + +from .browser_manager import BrowserManager +from .scraper_task_result import ScraperTaskResult + + +class ScraperTask: + def __init__(self, settings: TaskSettings, browser_manager: BrowserManager): + self._settings = settings + self._browser_manager = browser_manager + + @property + def settings(self) -> TaskSettings: + return self._settings + + @property + def name(self) -> str: + return self._settings.name + + @property + def _logger(self) -> Logger: + return 
getLogger(f"{__name__}.{self.name}") + + async def run(self) -> ScraperTaskResult: + self._logger.info("Starting scraper task") + + async with self._browser_manager.launch_browser() as browser: + page = await browser.new_page() + + if self._browser_manager.stealth_enabled: + await stealth_async(page) + + await page.goto(str(self.settings.url)) + + # TODO add support for waitingForTarget + + data: str | list[str] | dict | list[dict] = await page.evaluate(self.settings.target) + + # TODO add support for nextPageTarget + + if not isinstance(data, list): + data = list(data) + + data = [{"value": d} if isinstance(d, str) else d for d in data] + + self._logger.info(f"Scraper task retrieves {len(data)} entities") + + if not len(data): + self._logger.warning("Scraper task retrieve zero elements, maybe some error?") + + self._logger.info("End scraper task") + + return ScraperTaskResult(data=data, task=self) diff --git a/scraper_bot/scraper/scraper_task_result.py b/scraper_bot/scraper/scraper_task_result.py new file mode 100644 index 0000000..ac49691 --- /dev/null +++ b/scraper_bot/scraper/scraper_task_result.py @@ -0,0 +1,29 @@ +from collections.abc import Collection +from typing import TYPE_CHECKING, Iterator + +from .scraper_task_result_entity import ScraperTaskResultEntity + +if TYPE_CHECKING: + from .scraper_task import ScraperTask + + +class ScraperTaskResult(Collection): + def __init__(self, task: "ScraperTask", data: list[dict]): + self._task = task + self._data = [ScraperTaskResultEntity(d, task=task) for d in data] + + @property + def task(self) -> "ScraperTask": + return self._task + + def __len__(self) -> int: + return len(self._data) + + def __getitem__(self, index: int) -> ScraperTaskResultEntity: + return self._data[index] + + def __iter__(self) -> Iterator[ScraperTaskResultEntity]: + return self._data.__iter__() + + def __contains__(self, item) -> bool: + return item in self._data diff --git a/scraper_bot/scraper/scraper_task_result_entity.py b/scraper_bot/scraper/scraper_task_result_entity.py new file mode 100644 index 0000000..5c4a8b2 --- /dev/null +++ b/scraper_bot/scraper/scraper_task_result_entity.py @@ -0,0 +1,38 @@ +from collections.abc import Mapping +from hashlib import sha256 +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from .scraper_task import ScraperTask + + +class ScraperTaskResultEntity(Mapping): + def __init__(self, data: dict, task: "ScraperTask"): + self._data = data + self._task = task + + def __getitem__(self, item): + if item == "task": + return self._task + return self._data[item] + + def __contains__(self, item): + if item == "task": + return True + return item in self._data + + def __iter__(self): + return iter({**self._data, "task": self._task}) + + def __len__(self): + return len(self._data) + 1 + + def __str__(self) -> str: + return f"{self._task.name}#{"|".join(["=".join(v) for v in sorted(self._data.items(), key=lambda x: x[0])])}" + + def __hash__(self) -> int: + return int(sha256(str(self).encode()).hexdigest(), 16) + + @property + def task(self) -> "ScraperTask": + return self._task diff --git a/scraper_bot/scraper_bot/__init__.py b/scraper_bot/scraper_bot/__init__.py index a409080..b842b2b 100644 --- a/scraper_bot/scraper_bot/__init__.py +++ b/scraper_bot/scraper_bot/__init__.py @@ -1,2 +1 @@ -from ._scraper_bot import ScraperBot -from ._task import Task +from .scraper_bot import ScraperBot diff --git a/scraper_bot/scraper_bot/_scraper_bot.py b/scraper_bot/scraper_bot/_scraper_bot.py deleted file mode 100644 index 3c0119d..0000000 --- 
a/scraper_bot/scraper_bot/_scraper_bot.py +++ /dev/null @@ -1,50 +0,0 @@ -from __future__ import annotations - -import logging -import time - -from ischedule import run_pending - -from ..cache import Cache -from ..notifications.notifications import NotificationsManager -from ..settings import Settings -from ._task import Task - -_LOGGER = logging.getLogger(__package__) - - -class ScraperBot: - notificationsManager: NotificationsManager - tasks: list[Task] - cache: Cache - - def __init__(self, settings: Settings): - self.notificationsManager = NotificationsManager(settings.notifications) - - self.tasks = [Task(**c.model_dump(), on_find=self._on_find) for c in settings.tasks] - - self.cache = Cache(str(settings.redis)) - - def _setup_tasks(self) -> None: - for t in self.tasks: - t.schedule() - - _LOGGER.info("Setup schedule") - - def start(self) -> None: - self._setup_tasks() - - _LOGGER.info("Start schedule") - - while True: - run_pending() - time.sleep(1) - - def _on_find(self, *entries: str) -> None: - new_entries = list(filter(lambda e: not self.cache.exists(e), entries)) - - _LOGGER.info(f"Found {len(new_entries)} new entries") - - for n in new_entries: - self.notificationsManager.notify({"url": n}) - self.cache.add(n) diff --git a/scraper_bot/scraper_bot/_task.py b/scraper_bot/scraper_bot/_task.py deleted file mode 100644 index 393f85f..0000000 --- a/scraper_bot/scraper_bot/_task.py +++ /dev/null @@ -1,35 +0,0 @@ -from __future__ import annotations - -import logging -from typing import Callable - -from ischedule import schedule - -from ..scraper import Scraper - -_LOGGER = logging.getLogger(__package__) - - -class Task(Scraper): - name: str - interval: int - - def __init__( - self, - url: str, - target: str, - *, - on_find: Callable[[...], None], - interval: int = 60 * 60, - name: str = "generic-task", - ): - super().__init__(url, target, on_find) - self.name = name - self.interval = interval - - _LOGGER.info(f"Created task {self.name}") - - def schedule(self) -> None: - schedule(self.run, interval=self.interval) - - _LOGGER.info(f"Scheduled task {self.name}") diff --git a/scraper_bot/scraper_bot/scraper_bot.py b/scraper_bot/scraper_bot/scraper_bot.py new file mode 100644 index 0000000..df6cf94 --- /dev/null +++ b/scraper_bot/scraper_bot/scraper_bot.py @@ -0,0 +1,54 @@ +import logging +from asyncio import gather, sleep + +from scraper_bot.cache import Cache +from scraper_bot.notifications import NotificationsManager +from scraper_bot.scraper import Scraper +from scraper_bot.settings import Settings + +_LOGGER = logging.getLogger(__package__) + + +class ScraperBot: + _settings: Settings + _notificationsManager: NotificationsManager + _scraper: Scraper + _cache: Cache + + def __init__(self, settings: Settings): + self._settings = settings + + self._notificationsManager = NotificationsManager(self._settings.notifications) + + self._scraper = Scraper(browser_settings=self._settings.browser) + self._scraper.add_task(*self._settings.tasks) + + self._cache = Cache(str(settings.redis)) + + async def _run(self) -> None: + tasks_results = await self._scraper.run() + + new_entries = await self._cache.filter_exists( + *[t for r in tasks_results for t in r], to_id=lambda x: str(hash(x)) + ) + + if len(new_entries): + _LOGGER.info(f"Found {len(new_entries)} new entries") + + await self._notificationsManager.notify(*new_entries) + + await gather(*(self._cache.add(str(hash(e))) for e in new_entries)) + else: + _LOGGER.info("No new entry was found, skip notifications") + + async def run_once(self) -> 
None: + await self._run() + + async def run(self) -> None: + _LOGGER.info(f"Start schedule with interval of {self._settings.interval}") + + while True: + _LOGGER.info("Starting new iteration") + await self._run() + _LOGGER.info(f"Waiting {self._settings.interval} for the next iteration") + await sleep(self._settings.interval.total_seconds()) diff --git a/scraper_bot/settings/browser.py b/scraper_bot/settings/browser.py new file mode 100644 index 0000000..62e5f9e --- /dev/null +++ b/scraper_bot/settings/browser.py @@ -0,0 +1,21 @@ +from typing import Annotated, Literal + +from pydantic import BaseModel, Field, field_validator + +BrowserType = Literal["firefox", "chromium", "webkit"] + + +class BrowserSettings(BaseModel): + type: Annotated[ + BrowserType | list[BrowserType], + Field(description="Browser to use with playwright", default=["firefox", "chromium", "webkit"]), + ] + stealthEnabled: Annotated[bool, Field(description="Enable stealth mode", default=False)] + headless: Annotated[bool, Field(description="Enable headless mode", default=True)] + + @field_validator("type") + @classmethod + def browser_type_to_list(cls, v: BrowserType | list[BrowserType]) -> list[BrowserType]: + if isinstance(v, list): + return v + return [v] diff --git a/scraper_bot/settings/notifications.py b/scraper_bot/settings/notifications.py new file mode 100644 index 0000000..bffced6 --- /dev/null +++ b/scraper_bot/settings/notifications.py @@ -0,0 +1,55 @@ +from typing import Annotated, ClassVar, Literal, Self +from uuid import uuid4 + +from apprise import NOTIFY_FORMATS, NotifyFormat +from jinja2 import BaseLoader, Environment, Template +from pydantic import BaseModel, Field, PrivateAttr, model_validator +from pydantic_settings import BaseSettings + +from scraper_bot.utilities.apprise_uri import SecretAppriseUri + +Format = Literal[NOTIFY_FORMATS] + + +class NotificationChannel(BaseModel): + title: Annotated[str | None, Field(description="The title of the notification", default=None)] + message: Annotated[str | None, Field(description="The message of the notification", default=None)] + format: Annotated[Format | None, Field(description="The format of the notification message", default=None)] + uri: Annotated[SecretAppriseUri, Field(description="The URI of the notification")] + + _tag: Annotated[str, PrivateAttr(default_factory=lambda: uuid4().hex)] + + @property + def tag(self) -> str: + return self._tag + + _jinja_env: ClassVar[Environment] = Environment(loader=BaseLoader(), autoescape=True) + + @property + def message_template(self) -> Template: + return NotificationChannel._jinja_env.from_string(self.message) + + +class NotificationsSettings(BaseSettings): + title: Annotated[str, Field(description="Title of the notification", default="")] + message: Annotated[ + str, Field(description="The message of the notification", default="New entry found [here]({{url}})") + ] + format: Annotated[Format, Field(description="The format of the notification message", default=NotifyFormat.TEXT)] + channels: Annotated[ + list[SecretAppriseUri | NotificationChannel], + Field(description="Notification channel or apprise compatible URI", min_length=1), + ] + + @model_validator(mode="after") + def parse_channels(self) -> Self: + self.channels = [ + NotificationChannel( + **{ + **self.model_dump(), + **(c.model_dump(exclude_none=True) if isinstance(c, NotificationChannel) else {"uri": c}), + } + ) + for c in self.channels + ] + return self diff --git a/scraper_bot/settings/settings.py b/scraper_bot/settings/settings.py index 
8b6f474..b09fc86 100644 --- a/scraper_bot/settings/settings.py +++ b/scraper_bot/settings/settings.py @@ -1,20 +1,18 @@ from datetime import timedelta from pathlib import Path -from typing import Annotated, ClassVar, Literal, Self, Type -from uuid import uuid4 +from typing import Annotated, ClassVar, Self, Type -from apprise import NOTIFY_FORMATS, NotifyFormat -from jinja2 import BaseLoader, Environment, Template -from pydantic import BaseModel, Field, HttpUrl, PrivateAttr, RedisDsn, model_validator +from pydantic import Field, RedisDsn from pydantic_settings import ( BaseSettings, - EnvSettingsSource, PydanticBaseSettingsSource, SettingsConfigDict, YamlConfigSettingsSource, ) -from scraper_bot.utilities.AppriseURI import SecretAppriseUri +from .browser import BrowserSettings +from .notifications import NotificationsSettings +from .task import TaskSettings DEFAULT_SETTINGS_PATH = [ Path.cwd() / "config.yml", @@ -24,72 +22,34 @@ ] -class Task(BaseModel): - name: Annotated[str, Field(description="A human readable label for teh task")] - url: Annotated[ - HttpUrl, Field(description="The url to the page to be scraped. Use `{i}` as a placeholder for the pagination") - ] - target: Annotated[ - str, - Field(description="It is a unique css selector to target the tag contains the link to the scraped page"), +class Settings(BaseSettings): + daemonize: Annotated[ + bool, Field(description="make the scraper run as a daemon instead run only once", default=False) ] - interval: Annotated[timedelta, Field(gt=0, description="How often the task should be done expressed in seconds")] - - -class NotificationChannel(BaseModel): - title: Annotated[str | None, Field(description="The title of the notification", default=None)] - message: Annotated[str | None, Field(description="The message of the notification", default=None)] - format: Annotated[ - Literal[NOTIFY_FORMATS] | None, Field(description="The format of the notification message", default=None) + interval: Annotated[ + timedelta, + Field( + gt=0, + description="How often the tasks should be done expressed in seconds. 
" + "It will be ignored if `daemonize` is False", + default=60 * 60, + ), ] - uri: Annotated[SecretAppriseUri, Field(description="The URI of the notification")] - - _tag: Annotated[str, PrivateAttr(default_factory=lambda: uuid4().hex)] - - @property - def tag(self) -> str: - return self._tag - - _jinja_env: ClassVar[Environment] = Environment(loader=BaseLoader(), autoescape=True) - - @property - def message_template(self) -> Template: - return NotificationChannel._jinja_env.from_string(self.message) + browser: Annotated[BrowserSettings, Field(description="Browser to use with playwright", default=BrowserSettings())] -class NotificationsSettings(BaseSettings): - title: Annotated[str, Field(description="Title of the notification", default="New entry found")] - message: Annotated[ - str, Field(description="The message of the notification", default="New entry found [here]({{url}})") + tasks: Annotated[ + list[TaskSettings], Field(min_length=1, description="The scraper tasks the bot will have to perform") ] - format: Annotated[ - Literal[NOTIFY_FORMATS], Field(description="The format of the notification message", default=NotifyFormat.TEXT) - ] - channels: Annotated[ - list[SecretAppriseUri | NotificationChannel], - Field(description="Notification channel or apprise compatible URI", min_length=1), - ] - - @model_validator(mode="after") - def parse_channels(self) -> Self: - self.channels = [ - NotificationChannel( - **{ - **self.model_dump(), - **(c.model_dump(exclude_none=True) if isinstance(c, NotificationChannel) else {"uri": c}), - } - ) - for c in self.channels - ] - return self - - -class Settings(BaseSettings): - tasks: Annotated[list[Task], Field(min_length=1)] notifications: Annotated[NotificationsSettings, Field(description="Notifications configuration")] redis: Annotated[RedisDsn, Field(description="An URI to a redis instance used to cache")] - model_config = SettingsConfigDict(extra="ignore") + model_config = SettingsConfigDict( + extra="ignore", + env_prefix="SB__", + env_nested_delimiter="__", + case_sensitive=False, + ) _SETTINGS_PATH: ClassVar[str | Path | list[str | Path]] = DEFAULT_SETTINGS_PATH @@ -99,21 +59,29 @@ def set_settings_path(cls, settings_path: str | Path | list[str | Path]) -> None @classmethod def settings_customise_sources( - cls, settings_cls: Type[BaseSettings], **kwargs + cls, + settings_cls: Type[BaseSettings], + init_settings: PydanticBaseSettingsSource, + env_settings: PydanticBaseSettingsSource, + **kwargs ) -> tuple[PydanticBaseSettingsSource, ...]: return ( - EnvSettingsSource( - settings_cls, - env_prefix="SB__", - env_nested_delimiter="__", - case_sensitive=False, - ), + init_settings, + env_settings, YamlConfigSettingsSource(settings_cls, yaml_file=cls._SETTINGS_PATH), ) + # Make the first initialization persistence + # between multiple initialization of the class _instance: ClassVar[Self | None] = None + _initialized: ClassVar[bool] = False - def __new__(cls, *args, **kwargs): + def __new__(cls, **kwargs): if cls._instance is None: - cls._instance = super().__new__(cls, *args, **kwargs) + cls._instance = super().__new__(cls) return cls._instance + + def __init__(self, **kwargs): + if not self.__class__._initialized: + super().__init__(**kwargs) + self.__class__._initialized = True diff --git a/scraper_bot/settings/task.py b/scraper_bot/settings/task.py new file mode 100644 index 0000000..6b5d5f5 --- /dev/null +++ b/scraper_bot/settings/task.py @@ -0,0 +1,36 @@ +from datetime import timedelta +from typing import Annotated + +from pydantic import 
BaseModel, Field, HttpUrl, PositiveInt + + +class TaskSettings(BaseModel): + name: Annotated[str, Field(description="A human-readable label for the task")] + url: Annotated[HttpUrl, Field(description="The URL of the page to be scraped")] + + target: Annotated[ + str, + Field( + description="JavaScript snippet to retrieve the target entities. " + "The script has to return an object (dict) or a list of them. " + "The attributes of the object will be accessible in the notification message template" + ), + ] + waitingForTarget: Annotated[ + str | None, Field(description="CSS selector for a target to wait for before starting the scraping", default=None) + ] + waitingTimeout: Annotated[ + timedelta, + Field(description="The time to wait for the `waitingForTarget` before giving up", default=15, ge=0), + ] + nextPageTarget: Annotated[ + str | None, + Field( + description="JavaScript snippet to retrieve the next page URL. " + "The script has to return the URL as a string", + default=None, + ), + ] + maxPages: Annotated[ + PositiveInt | None, Field(description="The maximum number of pages to scrape per task", default=None) + ] diff --git a/scraper_bot/utilities/AppriseURI.py b/scraper_bot/utilities/apprise_uri.py similarity index 93% rename from scraper_bot/utilities/AppriseURI.py rename to scraper_bot/utilities/apprise_uri.py index dfb5d6b..ebcb6a3 100644 --- a/scraper_bot/utilities/AppriseURI.py +++ b/scraper_bot/utilities/apprise_uri.py @@ -18,5 +18,5 @@ class SecretAppriseUri(SecretStr, str): def _display(self) -> str: v = self.get_secret_value() - i, j = len(v) // 4, len(v) * 3 // 4 + i, j = len(v) // 6, len(v) * 5 // 6 return f"{v[:i]}*****{v[j:]}"