Skip to content

Commit

Permalink
Merge pull request #29 from auchtetraborat/main
Browse files Browse the repository at this point in the history
Integration of VERENA-scraper & remove misplaced comment in Lebensmittelwarnung
  • Loading branch information
LilithWittmann authored Oct 7, 2021
2 parents cac5953 + 91f05e9 commit 9846d23
Show file tree
Hide file tree
Showing 14 changed files with 635 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/runtests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@ jobs:
pip install .
- name: Test with pytest
run: |
pytest
pytest
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,21 @@ print(data)
# [{'id': 19601, 'guid': 'https://www.lebensmittelwarnung.de/bvl-lmw-de/detail/lebensmittel/19601', 'pubDate': 'Fri, 10 Feb 2017 12:28:45 +0000', 'imgSrc': 'https://www.lebensmittelwarnung.de/bvl-lmw-de/opensaga/attachment/979f8cd3-969e-4a6c-9a8e-4bdd61586cd4/data.jpg', 'title': 'Sidroga Bio Säuglings- und Kindertee', 'manufacturer': 'Lebensmittel', 'warning': 'Pyrrolizidinalkaloide', 'affectedStates': ['Baden-Württemberg', '...']}]
```

## Federal Job Openings

### NRW

#### VERENA
Get open substitute teaching positions in NRW from https://www.schulministerium.nrw.de/BiPo/Verena/angebote
```python
from deutschland import Verena
v = Verena()
data = v.get()
print(data)
# a full example of the returned data can be found at deutschland/verena/example.md
# [{ "school_id": "99999", "desc": "Eine Schule\nSchule der Sekundarstufe II\ndes Landkreis Schuling\n9999 Schulingen", "replacement_job_title": "Lehrkraft", "subjects": [ "Fach 1", "Fach 2" ], "comments": "Bemerkung zur Stelle: Testbemerkung", "duration": "01.01.2021 - 01.01.2022", ...} ...]
```


## Autobahn

Get data from the Autobahn.
Expand Down
1 change: 1 addition & 0 deletions deutschland/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@
from .bundesanzeiger.bundesanzeiger import Bundesanzeiger
from .handelsregister.handelsregister import Handelsregister
from .lebensmittelwarnung.lebensmittelwarnung import Lebensmittelwarnung
from .verena.verena import Verena
from .bundesnetzagentur import *
1 change: 0 additions & 1 deletion deutschland/lebensmittelwarnung/lebensmittelwarnung.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,5 @@ def get(

if __name__ == "__main__":
    # Manual run: call get() with its default arguments and print the result.
    print(Lebensmittelwarnung().get())
Empty file added deutschland/verena/__init__.py
Empty file.
34 changes: 34 additions & 0 deletions deutschland/verena/example.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
### Scraper for https://www.schulministerium.nrw.de/BiPo/Verena/online

```json
{
"school_id": "99999",
"desc": "Eine Schule\nSchule der Sekundarstufe II\ndes Landkreis Schuling\n9999 Schulingen",
"replacement_job_title": "Lehrkraft",
"subjects": [
"Fach 1",
"Fach 2"
],
"replacement_job_type_raw": "Vertretung für",
"replacement_job_type": "Vertretung",
"comments": "Bemerkung zur Stelle: Testbemerkung",
"duration": "01.01.2021 - 01.01.2022",
"hours_per_week": "13,5",
"contact": {
"phone": "0172 1111 1111",
"fax": "0172 2222 2222",
"homepage": "http://www.eine-schule.de",
"mail": {
"raw": "mailto:[email protected]?subject=Stellenausschreibung in VERENA",
"adress": "[email protected]",
"subject": "Stellenausschreibung in VERENA"
}
},
"deadline": "17.09.2021",
"geolocation": {
"coord_system": "epsg:25832",
"coordinates": [1111111, 1111111],
"post_adress": "Eine Stra\u00dfe 1\n99999 Schulingen"
}
}
```
28 changes: 28 additions & 0 deletions deutschland/verena/verena.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from deutschland.verena.verenadownloader import VerenaDownloader
from deutschland.verena.verenaextractor import VerenaExtractor
import json


class Verena:
    """
    Downloads and extracts the current job listings from the VERENA portal.
    """

    def get(self):
        """
        Download and extract the current job listings from the VERENA portal.

        Every page returned by ``VerenaDownloader().scrape()`` is handed to
        ``VerenaExtractor(page).extract()`` and the per-page results are
        collected into a single list, in page order.

        An example of the JSON format can be found at ./example.md

        Returns:
            list: the extracted entries of all scraped pages.
        """
        result = []
        for page in VerenaDownloader().scrape():
            # Extend in place instead of rebuilding the whole list per page.
            result.extend(VerenaExtractor(page).extract())
        return result


if __name__ == "__main__":
    # Manual run: fetch all listings and dump them to stdout as JSON.
    print(json.dumps(Verena().get()))
142 changes: 142 additions & 0 deletions deutschland/verena/verenadownloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import requests
import math
from bs4 import BeautifulSoup
from typing import Tuple, List


class VerenaDownloader:
    """
    Downloads all pages (each containing 100 job offerings) of the VERENA portal.
    """

    BASE_URL = "https://www.schulministerium.nrw.de"
    # Job offerings per listing page once the largest block size has been selected.
    BLOCK_SIZE = 100

    def __init__(self):
        # A single session is reused for all requests so that the cookie obtained
        # from the landing page is sent along with the later listing requests.
        self.session = requests.Session()

    def __scrape_landing_page(self) -> Tuple[int, str, str]:
        """Returns (job_openings_count: int, access_listing_url_part: str, access_listing_action_id: str)
        Example: (513, "/BiPo/Verena/angebote?action=595.1764087184088", "595.1764087184088")
        Scrapes the VERENA landing page to get a session cookie, matching actionid
        to access the listing view and the count of job offerings in the listing.

        Raises:
            RuntimeError: if the landing page contains no link to the job listings.
        """
        landing_url = self.BASE_URL + "/BiPo/Verena"
        landing_response = self.session.get(landing_url)
        landing_soup = BeautifulSoup(landing_response.text, "html.parser")
        links = landing_soup.find_all("a", {"title": "Zu den Stellenausschreibungen"})
        for link in links:
            if "Derzeit im Netz veröffentlichte Ausschreibungen:" in link.text:
                job_openings_count = link.find_next("strong").text
                access_listing_url_part = link["href"]
                # split action_id from listing_url_part
                access_listing_action_id = access_listing_url_part.replace(
                    "/BiPo/Verena/angebote?action=", ""
                )
                return (
                    int(job_openings_count),
                    access_listing_url_part,
                    access_listing_action_id,
                )
        # Fail loudly here instead of returning None, which would only surface
        # later as an opaque unpacking error in scrape().
        raise RuntimeError(
            "Could not find the link to the job listings on the VERENA landing page"
        )

    def __scrape_listing_page_initial(
        self, access_listing_url_part: str
    ) -> Tuple[str, str, str]:
        """Returns (listing url with new actionid, blocksize 100 & valid suchid (aka. select_blocksize_url_part)), search_id, select_blocksize_action_id)
        Example: ("/BiPo/Verena/angebote?action=509.9848906326322&block=b100&suchid=188736", "188736", "509.9848906326322")
        Scrapes the VERENA listing page to get a listing url with blocksize = 100 and valid suchid (search_id).
        suchid is generated by the backend and stores your search preferences.

        Raises:
            RuntimeError: if the listing page has no block size selector or the
                selector contains no links.
        """
        listing_url = self.BASE_URL + access_listing_url_part
        listing_response = self.session.get(listing_url)
        listing_soup = BeautifulSoup(listing_response.text, "html.parser")
        blocksize_selector = listing_soup.find("div", id="blockauswahl")
        if blocksize_selector is None:
            raise RuntimeError(
                "Could not find the block size selector on the VERENA listing page"
            )
        blocksize_links = blocksize_selector.find_all("a")
        if not blocksize_links:
            raise RuntimeError(
                "The block size selector on the VERENA listing page contains no links"
            )
        # -1 is blocksize 100, also gets a such_id (search_id)
        select_blocksize_url_part = blocksize_links[-1]["href"]
        # Relies on suchid being the last query parameter of the link (see example above).
        search_id = select_blocksize_url_part.split("=")[-1]
        select_blocksize_action_id = select_blocksize_url_part.replace(
            "/BiPo/Verena/angebote?action=", ""
        ).split("&")[0]
        return select_blocksize_url_part, search_id, select_blocksize_action_id

    def __set_block_size(self, select_blocksize_url_part: str):
        """
        Run GET on search ID url to set correct block size for future requests in backend
        """
        searchid_url = self.BASE_URL + select_blocksize_url_part
        self.session.get(searchid_url)

    def __generate_all_listing_urls(
        self, action_id: str, search_id: str, opening_count: int
    ) -> List[str]:
        """Based on action_id, search_id and opening_count, generates a list of all listing urls.
        Example: [
            "https://www.schulministerium.nrw.de/BiPo/Verena/angebote?action=901.7040712715743&seite=a1&suchid=188265",
            "https://www.schulministerium.nrw.de/BiPo/Verena/angebote?action=901.7040712715743&seite=a2&suchid=188265"
            ...
        ]
        """
        # Each page holds BLOCK_SIZE offerings; a partially filled last page still counts.
        site_count = math.ceil(opening_count / self.BLOCK_SIZE)
        listing_format_string = (
            self.BASE_URL + "/BiPo/Verena/angebote?action={0}&seite=a{1}&suchid={2}"
        )
        # Page numbers in the url are 1-based.
        return [
            listing_format_string.format(action_id, site, search_id)
            for site in range(1, site_count + 1)
        ]

    def __scrape_actual_listing(self, urls: List[str]):
        """Downloads the job listing pages provided by 'urls' and returns their content as a list of sourcecodes.
        Example: [
            <html>...</html>
            <html>...</html>
        ]
        """
        # Requests are issued one after another on the shared session, in url order.
        return [self.session.get(url).text for url in urls]

    def scrape(self) -> List[str]:
        """Returns list of sourcecodes of all listing pages of the VERENA job listing portal.
        Example: [
            <html>...</html>
            <html>...</html>
        ]

        Raises:
            RuntimeError: if the landing page or the listing page does not
                contain the expected links.
        """
        (
            job_opening_count,
            access_listing_url_part,
            _access_listing_action_id,
        ) = self.__scrape_landing_page()
        # select_blocksize_action_id is the action_id used to select the blocksize.
        # It's also reused to query the different pages of the job portal.
        (
            select_blocksize_url_part,
            search_id,
            select_blocksize_action_id,
        ) = self.__scrape_listing_page_initial(access_listing_url_part)
        self.__set_block_size(select_blocksize_url_part)
        all_listing_urls = self.__generate_all_listing_urls(
            select_blocksize_action_id, search_id, job_opening_count
        )
        return self.__scrape_actual_listing(all_listing_urls)


if __name__ == "__main__":
    # Manual run: download every listing page and print the raw page sources.
    print(VerenaDownloader().scrape())
Loading

0 comments on commit 9846d23

Please sign in to comment.