Merge pull request #29 from auchtetraborat/main
Integrate the VERENA scraper & remove a misplaced comment in Lebensmittelwarnung
Showing 14 changed files with 635 additions and 2 deletions.
```
@@ -23,4 +23,4 @@ jobs:
        pip install .
    - name: Test with pytest
      run: |
        pytest
```
@@ -0,0 +1,34 @@

### Scraper for https://www.schulministerium.nrw.de/BiPo/Verena/online
```json
{
    "school_id": "99999",
    "desc": "Eine Schule\nSchule der Sekundarstufe II\ndes Landkreis Schuling\n9999 Schulingen",
    "replacement_job_title": "Lehrkraft",
    "subjects": [
        "Fach 1",
        "Fach 2"
    ],
    "replacement_job_type_raw": "Vertretung für",
    "replacement_job_type": "Vertretung",
    "comments": "Bemerkung zur Stelle: Testbemerkung",
    "duration": "01.01.2021 - 01.01.2022",
    "hours_per_week": "13,5",
    "contact": {
        "phone": "0172 1111 1111",
        "fax": "0172 2222 2222",
        "homepage": "http://www.eine-schule.de",
        "mail": {
            "raw": "mailto:[email protected]?subject=Stellenausschreibung in VERENA",
            "adress": "[email protected]",
            "subject": "Stellenausschreibung in VERENA"
        }
    },
    "deadline": "17.09.2021",
    "geolocation": {
        "coord_system": "epsg:25832",
        "coordinates": [1111111, 1111111],
        "post_adress": "Eine Stra\u00dfe 1\n99999 Schulingen"
    }
}
```
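
The `geolocation` block carries projected UTM coordinates (`epsg:25832`, i.e. ETRS89 / UTM zone 32N). A minimal sketch of converting a record to latitude/longitude with `pyproj` (the library choice is an assumption here, not part of this PR; the record layout follows the example above):

```python
import json

from pyproj import Transformer  # third-party; assumed: pip install pyproj

with open("example.json") as f:
    record = json.load(f)

# epsg:25832 (ETRS89 / UTM zone 32N) -> WGS84 lon/lat.
transformer = Transformer.from_crs("epsg:25832", "epsg:4326", always_xy=True)
easting, northing = record["geolocation"]["coordinates"]
lon, lat = transformer.transform(easting, northing)
print(f"school {record['school_id']}: lat={lat:.5f}, lon={lon:.5f}")
```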
@@ -0,0 +1,28 @@
```python
import json

from deutschland.verena.verenadownloader import VerenaDownloader
from deutschland.verena.verenaextractor import VerenaExtractor


class Verena:
    """
    Downloads and extracts the current job listings from the VERENA portal.
    """

    def get(self):
        """
        Downloads and extracts the current job listings from the VERENA portal.
        An example of the JSON format can be found at ./example.json.
        """
        result = []
        scraped_pages = VerenaDownloader().scrape()
        for page in scraped_pages:
            result += VerenaExtractor(page).extract()
        return result


if __name__ == "__main__":
    v = Verena()
    res = v.get()
    print(json.dumps(res))
```
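
Since `get()` returns a plain list of dicts, results can be filtered directly. A small usage sketch (the `deutschland.verena.verena` import path and the subject value "Informatik" are assumptions for illustration):

```python
import json

from deutschland.verena.verena import Verena  # assumed module path

listings = Verena().get()

# Keep only openings that list a given subject (keys as in example.json).
matches = [job for job in listings if "Informatik" in job.get("subjects", [])]
print(f"{len(matches)} of {len(listings)} openings include Informatik")
print(json.dumps(matches, indent=2, ensure_ascii=False))
```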
@@ -0,0 +1,142 @@
```python
import math
from typing import List, Tuple

import requests
from bs4 import BeautifulSoup


class VerenaDownloader:
    """
    Downloads all pages (each containing 100 job offerings) of the VERENA portal.
    """

    BASE_URL = "https://www.schulministerium.nrw.de"

    def __init__(self):
        self.session = requests.Session()

    def __scrape_landing_page(self) -> Tuple[int, str, str]:
        """Returns (job_openings_count: int, access_listing_url_part: str, access_listing_action_id: str).
        Example: (513, "/BiPo/Verena/angebote?action=595.1764087184088", "595.1764087184088")
        Scrapes the VERENA landing page to get a session cookie, the action id
        needed to access the listing view, and the count of job offerings in the listing.
        """
        landing_url = self.BASE_URL + "/BiPo/Verena"
        landing_request = self.session.get(landing_url)
        landing_soup = BeautifulSoup(landing_request.text, "html.parser")
        links = landing_soup.find_all("a", {"title": "Zu den Stellenausschreibungen"})
        for link in links:
            if "Derzeit im Netz veröffentlichte Ausschreibungen:" in link.text:
                job_openings_count = link.find_next("strong").text
                access_listing_url_part = link["href"]
                # Split the action id off the listing URL part.
                access_listing_action_id = access_listing_url_part.replace(
                    "/BiPo/Verena/angebote?action=", ""
                )
                return (
                    int(job_openings_count),
                    access_listing_url_part,
                    access_listing_action_id,
                )

    def __scrape_listing_page_initial(
        self, access_listing_url_part: str
    ) -> Tuple[str, str, str]:
        """Returns (select_blocksize_url_part, search_id, select_blocksize_action_id),
        i.e. a listing URL with a fresh action id, block size 100 and a valid suchid.
        Example: ("/BiPo/Verena/angebote?action=509.9848906326322&block=b100&suchid=188736", "188736", "509.9848906326322")
        Scrapes the VERENA listing page to get a listing URL with block size 100 and a
        valid suchid (search_id). The suchid is generated by the backend and stores
        your search preferences.
        """
        listing_url = self.BASE_URL + access_listing_url_part
        listing_request = self.session.get(listing_url)
        listing_soup = BeautifulSoup(listing_request.text, "html.parser")
        blocksize_selector = listing_soup.find("div", id="blockauswahl")
        # The last entry selects block size 100; its href also carries a suchid (search_id).
        select_blocksize_url_part = blocksize_selector.find_all("a")[-1]["href"]
        search_id = select_blocksize_url_part.split("=")[-1]
        select_blocksize_action_id = select_blocksize_url_part.replace(
            "/BiPo/Verena/angebote?action=", ""
        ).split("&")[0]
        return select_blocksize_url_part, search_id, select_blocksize_action_id

    def __set_block_size(self, select_blocksize_url_part: str):
        """
        Runs a GET on the suchid URL so the backend stores the block size
        for future requests in this session.
        """
        searchid_url = self.BASE_URL + select_blocksize_url_part
        self.session.get(searchid_url)

    def __generate_all_listing_urls(
        self, action_id: str, search_id: str, opening_count: int
    ) -> List[str]:
        """Based on action_id, search_id and opening_count, generates a list of all listing URLs.
        Example: [
            "https://www.schulministerium.nrw.de/BiPo/Verena/angebote?action=901.7040712715743&seite=a1&suchid=188265",
            "https://www.schulministerium.nrw.de/BiPo/Verena/angebote?action=901.7040712715743&seite=a2&suchid=188265",
            ...
        ]
        """
        all_urls = []
        # Pages are 1-indexed; each page holds 100 offerings (the block size set above).
        site_count = math.ceil(opening_count / 100)
        listing_format_string = (
            self.BASE_URL + "/BiPo/Verena/angebote?action={0}&seite=a{1}&suchid={2}"
        )
        for curr_site in range(1, site_count + 1):
            all_urls.append(
                listing_format_string.format(action_id, curr_site, search_id)
            )
        return all_urls

    def __scrape_actual_listing(self, urls: List[str]) -> List[str]:
        """Downloads the job listing pages given by 'urls' and returns their contents as a list of source codes.
        Example: [
            <html>...</html>,
            <html>...</html>
        ]
        """
        scraped_pages = []
        for url in urls:
            r = self.session.get(url)
            scraped_pages.append(r.text)
        return scraped_pages

    def scrape(self) -> List[str]:
        """Returns a list of the source codes of all listing pages of the VERENA job portal.
        Example: [
            <html>...</html>,
            <html>...</html>
        ]
        """
        (
            job_opening_count,
            access_listing_url_part,
            access_listing_action_id,
        ) = self.__scrape_landing_page()
        # select_blocksize_action_id is the action id used to select the block size.
        # It is also reused to query the different pages of the job portal.
        (
            select_blocksize_url_part,
            search_id,
            select_blocksize_action_id,
        ) = self.__scrape_listing_page_initial(access_listing_url_part)
        self.__set_block_size(select_blocksize_url_part)
        all_listing_urls = self.__generate_all_listing_urls(
            select_blocksize_action_id, search_id, job_opening_count
        )
        return self.__scrape_actual_listing(all_listing_urls)


if __name__ == "__main__":
    vd = VerenaDownloader()
    res = vd.scrape()
    print(res)
```
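
The downloader issues a handful of sequential GETs and keeps its state in a single `requests.Session`. A hypothetical hardening sketch, not part of this PR, that gives such a session retries with backoff (the `make_session` helper is illustrative only):

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def make_session() -> requests.Session:
    """Build a Session that retries transient failures with exponential backoff."""
    session = requests.Session()
    retry = Retry(
        total=3,
        backoff_factor=1.0,
        status_forcelist=(500, 502, 503, 504),
        allowed_methods=frozenset({"GET"}),
    )
    session.mount("https://", HTTPAdapter(max_retries=retry))
    return session


# Usage: swap the plain Session in VerenaDownloader.__init__ for make_session().
```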