diff --git a/.github/workflows/runtests.yml b/.github/workflows/runtests.yml
index 5a042a8..a055e64 100644
--- a/.github/workflows/runtests.yml
+++ b/.github/workflows/runtests.yml
@@ -23,4 +23,4 @@ jobs:
         pip install .
     - name: Test with pytest
       run: |
-        pytest
+        pytest
diff --git a/README.md b/README.md
index b8e0d57..488344a 100644
--- a/README.md
+++ b/README.md
@@ -74,6 +74,21 @@ print(data)
 # [{'id': 19601, 'guid': 'https://www.lebensmittelwarnung.de/bvl-lmw-de/detail/lebensmittel/19601', 'pubDate': 'Fri, 10 Feb 2017 12:28:45 +0000', 'imgSrc': 'https://www.lebensmittelwarnung.de/bvl-lmw-de/opensaga/attachment/979f8cd3-969e-4a6c-9a8e-4bdd61586cd4/data.jpg', 'title': 'Sidroga Bio Säuglings- und Kindertee', 'manufacturer': 'Lebensmittel', 'warning': 'Pyrrolizidinalkaloide', 'affectedStates': ['Baden-Württemberg', '...']}]
 ```
 
+## Federal Job Openings
+
+### NRW
+
+#### VERENA
+Get open substitute teaching positions in NRW from https://www.schulministerium.nrw.de/BiPo/Verena/angebote
+```python
+from deutschland import Verena
+v = Verena()
+data = v.get()
+print(data)
+# Full example data can be found at deutschland/verena/example.md
+# [{ "school_id": "99999", "desc": "Eine Schule\nSchule der Sekundarstufe II\ndes Landkreis Schuling\n9999 Schulingen", "replacement_job_title": "Lehrkraft", "subjects": [ "Fach 1", "Fach 2" ], "comments": "Bemerkung zur Stelle: Testbemerkung", "duration": "01.01.2021 - 01.01.2022", ...} ...]
+```
+
 ## Autobahn
 
 Get data from the Autobahn.
diff --git a/deutschland/__init__.py b/deutschland/__init__.py
index 96e2f6e..2002eb7 100644
--- a/deutschland/__init__.py
+++ b/deutschland/__init__.py
@@ -6,4 +6,5 @@
 from .bundesanzeiger.bundesanzeiger import Bundesanzeiger
 from .handelsregister.handelsregister import Handelsregister
 from .lebensmittelwarnung.lebensmittelwarnung import Lebensmittelwarnung
+from .verena.verena import Verena
 from .bundesnetzagentur import *
diff --git a/deutschland/lebensmittelwarnung/lebensmittelwarnung.py b/deutschland/lebensmittelwarnung/lebensmittelwarnung.py
index 73bf5df..372fedd 100644
--- a/deutschland/lebensmittelwarnung/lebensmittelwarnung.py
+++ b/deutschland/lebensmittelwarnung/lebensmittelwarnung.py
@@ -177,6 +177,5 @@ def get(
 
 if __name__ == "__main__":
     lw = Lebensmittelwarnung()
-    # res = hr.search(keywords="Deutsche Bahn Aktiengesellschaft", keyword_match_option=3)
     res = lw.get()
     print(res)
diff --git a/deutschland/verena/__init__.py b/deutschland/verena/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/deutschland/verena/example.md b/deutschland/verena/example.md
new file mode 100644
index 0000000..b1e5d6a
--- /dev/null
+++ b/deutschland/verena/example.md
@@ -0,0 +1,34 @@
+### Scraper for https://www.schulministerium.nrw.de/BiPo/Verena/online
+
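+A minimal usage sketch (the object below is one element of the list returned
+by `get()`; all values shown are dummy data):
+
+```python
+from deutschland import Verena
+
+v = Verena()
+data = v.get()  # list of dicts in the format shown below
+```
+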
"coord_system": "epsg:25832", + "coordinates": [1111111, 1111111], + "post_adress": "Eine Stra\u00dfe 1\n99999 Schulingen" + } +} +``` \ No newline at end of file diff --git a/deutschland/verena/verena.py b/deutschland/verena/verena.py new file mode 100644 index 0000000..1e101ec --- /dev/null +++ b/deutschland/verena/verena.py @@ -0,0 +1,28 @@ +from deutschland.verena.verenadownloader import VerenaDownloader +from deutschland.verena.verenaextractor import VerenaExtractor +import json + + +class Verena: + """ + Downloads and extracts the current job listings from the VERENA portal. + """ + + def get(self): + """ + Downloads and extracts the current job listings from the VERENA portal. + + Example of the json format can be found at ./example.json + """ + result = [] + scraped_pages = VerenaDownloader().scrape() + for idx, page in enumerate(scraped_pages): + extract = VerenaExtractor(page).extract() + result = result + extract + return result + + +if __name__ == "__main__": + v = Verena() + res = v.get() + print(json.dumps(res)) diff --git a/deutschland/verena/verenadownloader.py b/deutschland/verena/verenadownloader.py new file mode 100644 index 0000000..0401b94 --- /dev/null +++ b/deutschland/verena/verenadownloader.py @@ -0,0 +1,142 @@ +import requests +import math +from bs4 import BeautifulSoup +from typing import Tuple, List + + +class VerenaDownloader: + """ + Downloads all pages (each containing 100 job offerings) of the VERENA portal. + """ + + BASE_URL = "https://www.schulministerium.nrw.de" + + def __init__(self): + self.session = requests.Session() + + def __scrape_landing_page(self) -> Tuple[int, str, str]: + """Returns (job_openings_count: int, access_listing_url_part: str, access_listing_action_id: str) + + Example: (513, "/BiPo/Verena/angebote?action=595.1764087184088", "595.1764087184088") + + Scrapes the VERENA landing page to get a session cookie, matching actionid + to access the listing view and the count of job offerings in the listing. + """ + landing_url = self.BASE_URL + "/BiPo/Verena" + landing_request = self.session.get(landing_url) + landing_soup = BeautifulSoup(landing_request.text, "html.parser") + links = landing_soup.findAll("a", {"title": "Zu den Stellenausschreibungen"}) + for link in links: + if "Derzeit im Netz veröffentlichte Ausschreibungen:" in link.text: + job_openings_count = link.find_next("strong").text + access_listing_url_part = link["href"] + # split action_id from listing_url_part + access_listing_action_id = access_listing_url_part.replace( + "/BiPo/Verena/angebote?action=", "" + ) + return ( + int(job_openings_count), + access_listing_url_part, + access_listing_action_id, + ) + + def __scrape_listing_page_initial( + self, access_listing_url_part: str + ) -> Tuple[str, str, str]: + """Returns (listing url with new actionid, blocksize 100 & valid suchid (aka. select_blocksize_url_part)), search_id, select_blocksize_action_id) + + Example: ("/BiPo/Verena/angebote?action=509.9848906326322&block=b100&suchid=188736", "188736", "509.9848906326322") + + Scrapes the VERENA listing page to get a listing url with blocksize = 100 and valid suchid (search_id). + suchid is generated by the backend and stores your search preferences. 
+ """ + listing_url = self.BASE_URL + access_listing_url_part + listing_request = self.session.get(listing_url) + listing_soup = BeautifulSoup(listing_request.text, "html.parser") + blocksize_selector = listing_soup.find("div", id="blockauswahl") + # -1 is blocksize 100, also gets a such_id (search_id) + select_blocksize_url_part = blocksize_selector.findAll("a")[-1]["href"] + search_id = select_blocksize_url_part.split("=")[-1] + select_blocksize_action_id = select_blocksize_url_part.replace( + "/BiPo/Verena/angebote?action=", "" + ).split("&")[0] + return select_blocksize_url_part, search_id, select_blocksize_action_id + + def __set_block_size(self, select_blocksize_url_part: str): + """ + Run GET on search ID url to set correct block size for future requests in backend + """ + searchid_url = self.BASE_URL + select_blocksize_url_part + self.session.get(searchid_url) + + def __generate_all_listing_urls( + self, action_id: str, search_id: str, opening_count: int + ) -> List[str]: + """Based on action_id, search_id and opening_count, generates a list of all listing urls. + + Example: [ + "https://www.schulministerium.nrw.de/BiPo/Verena/angebote?action=901.7040712715743&seite=a1&suchid=188265", + "https://www.schulministerium.nrw.de/BiPo/Verena/angebote?action=901.7040712715743&seite=a2&suchid=188265" + ... + ] + """ + all_urls = [] + # because block size = 100 + site_count = math.ceil(opening_count / 100) + for curr_site in range(0, site_count): + curr_site += 1 + listing_format_string = ( + self.BASE_URL + "/BiPo/Verena/angebote?action={0}&seite=a{1}&suchid={2}" + ) + all_urls.append( + listing_format_string.format(action_id, curr_site, search_id) + ) + return all_urls + + def __scrape_actual_listing(self, urls: List[str]): + """Downloads the job listing pages provided by 'urls' and returns their content as an list of sourcecodes. + + Example: [ + ... + ... + ] + + """ + scraped_pages = [] + for url in urls: + r = self.session.get(url) + scraped_pages.append(r.text) + return scraped_pages + + def scrape(self) -> List[str]: + """Returns list of sourcecodes of all listing pages of the VERENA job listing portal. + + Example: [ + ... + ... + ] + + """ + ( + job_opening_count, + access_listing_url_part, + access_listing_action_id, + ) = self.__scrape_landing_page() + # select_blocksize_action_id is the action_id used to select the blocksize. + # Its also reused to query the different pages of the job portal. + ( + select_blocksize_url_part, + search_id, + select_blocksize_action_id, + ) = self.__scrape_listing_page_initial(access_listing_url_part) + self.__set_block_size(select_blocksize_url_part) + all_listing_urls = self.__generate_all_listing_urls( + select_blocksize_action_id, search_id, job_opening_count + ) + return self.__scrape_actual_listing(all_listing_urls) + + +if __name__ == "__main__": + vd = VerenaDownloader() + res = vd.scrape() + print(res) diff --git a/deutschland/verena/verenaextractor.py b/deutschland/verena/verenaextractor.py new file mode 100644 index 0000000..8940b81 --- /dev/null +++ b/deutschland/verena/verenaextractor.py @@ -0,0 +1,218 @@ +from bs4 import BeautifulSoup, element +from typing import Tuple +import re + + +class VerenaExtractor: + """ + Extracts the job listings objects from the search result website of the VERENA portal. 
+ """ + + def __init__(self, source: str): + """ + Init with string containing the sourcecode the website (for example from request.text) + """ + self.soup = BeautifulSoup(source, "html.parser") + self.map_onclick_regex = re.compile(r" .{0,}<\/b>") + + def extract(self): + """ + Returns a list of all job listings contained in the sourcecode used in VerenaExtractor(source) + + Example of the json format can be found at ./example.json + """ + ausschreibungen = self.soup.findAll("div", {"class": "ausschreibung"}) + res = [] + for aus in ausschreibungen: + aus_parts = aus.findAll("div", {"class": "ausschreibung_teil"}) + school_id, desc = self.__extract_part1(aus_parts[0]) + ( + replacement_job_type, + replacement_job_type_raw, + replacement_job_title, + subjects, + comments, + ) = self.__extract_part2(aus_parts[1]) + duration, hours_per_week = self.__extract_part3(aus_parts[2]) + phone, fax, homepage, email, deadline = self.__extract_part4(aus_parts[3]) + coord_system, coordinates, post_adress = self.__extract_part5(aus_parts[4]) + merged = { + **self.__format_part1(school_id, desc), + **self.__format_part2( + replacement_job_type, + replacement_job_type_raw, + replacement_job_title, + subjects, + comments, + ), + **self.__format_part3(duration, hours_per_week), + **self.__format_part4(phone, fax, homepage, email, deadline), + **self.__format_part5(coord_system, coordinates, post_adress), + } + res.append(merged) + return res + + def __extract_part1(self, content) -> Tuple[str, str]: + """Returns a tuple of (school_id : str, adress : str) + + Extracts the attributes school_id & adress from the VERENA search result. + + Should be applied to the 1.
in
+ """ + school_id = content.find("strong").text + result_elems = [] + for desc_elems in content.contents[6:-1]: + if type(desc_elems) == element.NavigableString: + result_elems.append(desc_elems.strip().replace(u"\xa0", u" ")) + desc = "\n".join(result_elems) + return school_id, desc + + def __format_part1(self, schoold_id, desc) -> dict: + """Returns a export-ready dict for school_id & adress""" + return {"school_id": schoold_id, "desc": desc} + + def __extract_part2(self, content): + # TODO add typing to return value when 3.10 is realeased + """Returns a tuple of (replacement_job_type: str, replacement_job_type_raw: str, replacement_job_title : str, subjects : List[str], comments : str | None, ) + + Should be applied to the 2.
in
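+        # "AnC" marks "Aufholen nach Corona" (the post-Covid catch-up programme);
+        # everything containing "Vertretung" is a regular substitute position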
+ """ + comments = None + strong = content.findAll("strong") + replacement_job_title = strong[0].text + subjects = [x.text for x in strong[1:]] + if type(content.contents[-1]) == element.NavigableString: + comments_or_empty = content.contents[-1].strip() + if comments_or_empty: + comments = comments_or_empty + replacement_job_type_raw = content.contents[0].strip() + replacement_job_type = None + if "AnC" in replacement_job_type_raw: + replacement_job_type = "Aufholen nach Corona" + elif "Vertretung" in replacement_job_type_raw: + replacement_job_type = "Vertretung" + return ( + replacement_job_type, + replacement_job_type_raw, + replacement_job_title, + subjects, + comments, + ) + + def __format_part2( + self, + replacement_job_type, + replacement_job_type_raw, + replacement_job_title, + subjects, + comments, + ) -> dict: + """Returns a export-ready dict for replacement_job_type, replacement_job_type_raw, replacement_job_title, subjects, comments""" + res = { + "replacement_job_type_raw": replacement_job_type_raw, + "replacement_job_title": replacement_job_title, + "subjects": subjects, + } + if comments: + res["comments"] = comments + if replacement_job_type: + res["replacement_job_type"] = replacement_job_type + return res + + def __extract_part3(self, content): + """Returns a tuple of (duration : str, hours_per_week : str) + + Extracts the attributes duration & hours_per_week. + + Should be applied to the 3.
in
+ """ + content_elems = content.contents + result_elems = [] + for elem in content_elems: + if type(elem) == element.NavigableString: + result_elems.append(elem.strip()) + return result_elems[1], result_elems[3] + + def __format_part3(self, duration, hours_per_week): + return {"duration": duration, "hours_per_week": hours_per_week} + + def __extract_part4(self, content): + # TODO add typing to return value when 3.10 is realeased + """Returns a tuple of (phone : str | None, fax : str | None, homepage : str | None, email : str | None, deadline : str) + + Extracts the attributes phone, fax, homepage, email, deadline from the VERENA search result. + phone, fax, homepage, email are optional and can be None. + + Should be applied to the 4.
in
+ """ + email, homepage, phone, fax = None, None, None, None + for link in content.findAll("a"): + if "E-Mail" in link.text: + email = link["href"] + elif "Homepage" in link.text: + homepage = link["href"] + content_arr = content.contents + for x in content_arr: + if type(x) == element.NavigableString: + if "☎" in x: + phone = x.replace("☎", "").strip() + elif "Fax" in x: + fax = x.replace("Fax", "").strip() + deadline = content.find("strong").text + return phone, fax, homepage, email, deadline + + def __format_part4(self, phone, fax, homepage, email, deadline) -> dict: + """Returns a export-ready dict for phone, fax, homepage, email, deadline""" + result = {"contact": {}, "deadline": deadline} + if phone: + result["contact"]["phone"] = phone + if fax: + result["contact"]["fax"] = fax + if homepage: + result["contact"]["homepage"] = homepage + if email: + # use urlparse, but subject is the only param anyway + result["contact"]["mail"] = {} + result["contact"]["mail"]["raw"] = email + email_split = email.split("?") + result["contact"]["mail"]["adress"] = email_split[0].replace("mailto:", "") + result["contact"]["mail"]["subject"] = email_split[1].replace( + "subject=", "" + ) + return result + + def __extract_part5(self, content) -> Tuple[str, str, str]: + """Returns a tuple of (coord_system : str, coordinates : str, post_adress : str) + + Example: ("epsg:25832", "12345, 67890", "Somestreet\n9999 SomeLocation") + + Extracts the attributes coord_system & coordinates from the VERENA search result. + + Should be applied to the 5.
in
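+        # The data-itnrw-coords attribute bundles four ";"-separated fields inside
+        # surrounding brackets: (<coord_system>;<x>, <y>;<name>;<map onclick markup>)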
+ """ + map_div = content.find("div", {"class": "itnrwMap"}) + data_itnrw_coords = map_div["data-itnrw-coords"] + if not data_itnrw_coords: + return None, None, None + else: + data_itnrw_coords = data_itnrw_coords[1:-1].split(";") + coord_system = data_itnrw_coords[0] + coords = [int(x.strip()) for x in data_itnrw_coords[1].split(",")] + name = data_itnrw_coords[2] + map_onclick = data_itnrw_coords[3] + post_adress = self.map_onclick_regex.sub("", map_onclick) + post_adress = post_adress.strip() + post_adress = "\n".join(post_adress.split("
")) + return coord_system, coords, post_adress + + def __format_part5(self, coord_system, coordinates, post_adress) -> dict: + """Returns a export-ready dict for coord_system, coordinates & postadress""" + if coord_system is None and coord_system is None and post_adress is None: + return {} + return { + "geolocation": { + "coord_system": coord_system, + "coordinates": coordinates, + "post_adress": post_adress, + } + } diff --git a/tests/verena/ausschreibung_correct_result.json b/tests/verena/ausschreibung_correct_result.json new file mode 100644 index 0000000..bc713e4 --- /dev/null +++ b/tests/verena/ausschreibung_correct_result.json @@ -0,0 +1,30 @@ +{ + "school_id": "99999", + "desc": "Eine Schule\nSchule der Sekundarstufe II\ndes Landkreis Schuling\n9999 Schulingen", + "replacement_job_title": "Lehrkraft", + "subjects": [ + "Fach 1", + "Fach 2" + ], + "replacement_job_type_raw": "Vertretung für", + "replacement_job_type": "Vertretung", + "comments": "Bemerkung zur Stelle: Testbemerkung", + "duration": "01.01.2021 - 01.01.2022", + "hours_per_week": "13,5", + "contact": { + "phone": "0172 1111 1111", + "fax": "0172 2222 2222", + "homepage": "http://www.eine-schule.de", + "mail": { + "raw": "mailto:bewerbung@eineschule.de?subject=Stellenausschreibung in VERENA", + "adress": "bewerbung@eineschule.de", + "subject": "Stellenausschreibung in VERENA" + } + }, + "deadline": "17.09.2021", + "geolocation": { + "coord_system": "epsg:25832", + "coordinates": [1111111, 1111111], + "post_adress": "Eine Stra\u00dfe 1\n99999 Schulingen" + } +} \ No newline at end of file diff --git a/tests/verena/ausschreibung_test_input.html b/tests/verena/ausschreibung_test_input.html new file mode 100644 index 0000000..3065b47 --- /dev/null +++ b/tests/verena/ausschreibung_test_input.html @@ -0,0 +1,89 @@ +
+<div class="ausschreibung">
+
+    <div class="ausschreibung_teil">
+        <a name="99999"></a>
+        Schule
+        <strong>99999</strong>
+        <br/>
+        Eine Schule
+        <br/>
+        Schule der Sekundarstufe II
+        <br/>
+        des Landkreis Schuling
+        <br/>
+        9999 Schulingen
+        <br/>
+    </div>
+
+    <div class="ausschreibung_teil">
+        Vertretung für
+        <strong>Lehrkraft</strong>
+        <br/>
+        Fächer
+        <br/>
+        <strong>Fach 1</strong>
+        <br/>
+        <strong>Fach 2</strong>
+        <br/>
+        Bemerkung zur Stelle: Testbemerkung
+    </div>
+
+    <div class="ausschreibung_teil">
+        Dauer
+        <br/>
+        01.01.2021 - 01.01.2022
+        <br/>
+        Wochenstundenzahl
+        <br/>
+        13,5
+    </div>
+
+    <div class="ausschreibung_teil">
+        ☎ 0172 1111 1111
+        <br/>
+        Fax 0172 2222 2222
+        <br/>
+        <a href="http://www.eine-schule.de">⌂ Homepage</a>
+        <br/>
+        <a href="mailto:bewerbung@eineschule.de?subject=Stellenausschreibung in VERENA">✉ E-Mail</a>
+        <br/>
+        Bewerbungsfrist endet mit Ablauf des
+        <strong>17.09.2021</strong>
+    </div>
+
+    <div class="ausschreibung_teil">
+        <div class="itnrwMap" data-itnrw-coords="(epsg:25832;1111111, 1111111;Eine Schule;<b>Eine Schule</b> Eine Straße 1<br>99999 Schulingen)">
+        </div>
+    </div>
+
+</div>
diff --git a/tests/verena/test_verena.py b/tests/verena/test_verena.py
new file mode 100644
index 0000000..91a273a
--- /dev/null
+++ b/tests/verena/test_verena.py
@@ -0,0 +1,9 @@
+from deutschland.verena.verena import Verena
+
+
+def test_verena():
+    v = Verena()
+    res = v.get()
+    assert (
+        len(res) > 0
+    ), "Scraping and extracting all pages of the VERENA portal returned 0 results. It very likely shouldn't."
diff --git a/tests/verena/test_verenadownloader.py b/tests/verena/test_verenadownloader.py
new file mode 100644
index 0000000..4cc5746
--- /dev/null
+++ b/tests/verena/test_verenadownloader.py
@@ -0,0 +1,49 @@
+from deutschland.verena.verenadownloader import VerenaDownloader
+
+
+def test_downloader_page_count():
+    vd = VerenaDownloader()
+    res = vd.scrape()
+    assert (
+        len(res) > 0
+    ), "Scraping all pages of the VERENA portal (block size 100) returned 0 pages. It very likely shouldn't."
+
+
+def test_downloader_page_content():
+    vd = VerenaDownloader()
+    res = vd.scrape()
+    any_empty = False
+    for x in res:
+        if len(x) == 0:
+            any_empty = True
+    assert (
+        not any_empty
+    ), "Scraping all pages of the VERENA portal returned an empty page (len(sourcecode) == 0)."
+
+
+def test_downloader__generate_all_listing_urls_content():
+    vd = VerenaDownloader()
+    valid_url = "https://www.schulministerium.nrw.de/BiPo/Verena/angebote?action=999.999999&seite=a1&suchid=12345"
+    action_id = "999.999999"
+    search_id = "12345"
+    opening_count = 10
+    urls = vd._VerenaDownloader__generate_all_listing_urls(
+        action_id, search_id, opening_count
+    )
+    assert (
+        urls[0] == valid_url
+    ), "Generating the urls used to request all listing pages failed. Listing urls are malformed."
+
+
+def test_downloader__generate_all_listing_urls_count():
+    vd = VerenaDownloader()
+    action_id = "999.999999"
+    search_id = "12345"
+    opening_count = 650
+    urls = vd._VerenaDownloader__generate_all_listing_urls(
+        action_id, search_id, opening_count
+    )
+    assert len(urls) == 7, (
+        "Generating the urls used to request all listing pages failed. Too few or too many urls were generated. Expected: 7, Generated: "
+        + str(len(urls))
+    )
diff --git a/tests/verena/test_verenaextractor.py b/tests/verena/test_verenaextractor.py
new file mode 100644
index 0000000..b96a7aa
--- /dev/null
+++ b/tests/verena/test_verenaextractor.py
@@ -0,0 +1,19 @@
+import json
+from deutschland.verena.verenaextractor import VerenaExtractor
+
+
+def test_extractor_content():
+    with open("tests/verena/ausschreibung_test_input.html", "r") as f:
+        with open("tests/verena/ausschreibung_correct_result.json", "r") as correct:
+            content = "<html>" + f.read() + "</html>"
+            ve = VerenaExtractor(content)
+            res = ve.extract()
+            assert len(res) == 1 and res[0] == json.loads(correct.read())
+
+
+def test_extractor_simple_10():
+    with open("tests/verena/ausschreibung_test_input.html", "r") as f:
+        content = "<html>" + f.read() * 10 + "</html>"
+        ve = VerenaExtractor(content)
+        res = ve.extract()
+        assert len(res) == 10