From 1d313b4077f720b939963de7c4bf0ae9c0ad9841 Mon Sep 17 00:00:00 2001
From: Isidor Zeuner
Date: Thu, 19 Sep 2024 15:05:35 +0200
Subject: [PATCH 1/2] for `bverfg`, also retrieve additional decisions from
 the official collection

---
 gesp/spiders/bund.py    | 116 ++++++++++++++++++++++++++++++++++++++++
 gesp/src/create_file.py |  22 +++++++-
 2 files changed, 137 insertions(+), 1 deletion(-)

diff --git a/gesp/spiders/bund.py b/gesp/spiders/bund.py
index e7d5e75..5fe8df9 100644
--- a/gesp/spiders/bund.py
+++ b/gesp/spiders/bund.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 import re
 import scrapy
+from ..src import config
 from ..pipelines.formatters import AZsPipeline, CourtsPipeline
 from ..pipelines.exporters import ExportAsHtmlPipeline, FingerprintExportPipeline, RawExporter
 
@@ -25,6 +26,7 @@ def __init__(self, path, courts="", states="", fp=False, domains="", store_docId
         self.domains = domains
         self.store_docId = store_docId
         self.postprocess = postprocess
+        self.bverfg_az_bmj = {}
         if ("zivil" in domains and not any(court in courts for court in ["bgh", "bpatg", "bag"])):
             courts.extend(["bgh", "bpatg", "bag"])
         if ("oeff" in domains and not any(court in courts for court in ["bfh", "bsg", "bverfg", "bverwg"])):
@@ -34,6 +36,21 @@ def __init__(self, path, courts="", states="", fp=False, domains="", store_docId
         super().__init__(**kwargs)
 
     def parse(self, response):
+        if not self.courts or "bverfg" in self.courts:
+            for item in response.xpath("//item"):
+                link = item.xpath("link/text()").get()
+                if item.xpath("gericht/text()").get().startswith("BVerfG"):
+                    azs = item.xpath("aktenzeichen/text()").get()
+                    for az in azs.split(", "):
+                        if "..." != az:
+                            self.bverfg_az_bmj[az] = True
+            yield scrapy.Request(
+                url="https://www.bundesverfassungsgericht.de/DE/Entscheidungen/Entscheidungen/Amtliche%20Sammlung%20BVerfGE.html",
+                callback=self.parse_bverfg_collection,
+                headers=config.HEADERS | {
+                    'Referer':'https://www.bundesverfassungsgericht.de/'
+                }
+            )
         for item in response.xpath("//item"):
             link = item.xpath("link/text()").get()
             y = {
@@ -52,3 +69,102 @@ def parse(self, response):
                     if domain in y["court"].lower(): yield y
                 else: yield y
             else: yield y
+
+    def parse_bverfg_collection(self, response):
+        for link in response.xpath("//a/@href"):
+            if "/Entscheidungen/Liste/" in link.get():
+                yield scrapy.Request(
+                    url=response.urljoin(link.get()),
+                    callback=self.parse_bverfg_list,
+                    headers=config.HEADERS
+                )
+
+    def parse_bverfg_list(self, response):
+        for row in response.xpath("//tr"):
+            needed = None
+            az_column = row.xpath(".//td[3]/text()").get()
+            if az_column:
+                azs = az_column
+                azs = re.sub("[),]$", "", azs)
+                azs = re.sub("\xa0", " ", azs)
+                azs = re.sub("([0-9]+) (/[0-9]+)", "\\1\\2", azs)
+                azs = re.sub(",? u[.] ?a[.]? ?(,|$)", "\\1", azs)
+                pat_range = "([A-Z,] ?)([0-9]+) bis ([0-9]+)(/[0-9]+)"
+                if re.match("^.*" + pat_range + ".*$", azs):
+                    pre = re.sub("^.*(" + pat_range + ").*$", "\\1", azs)
+                    last = re.sub("^" + pat_range + "$", "\\1", pre)
+                    start = int(re.sub("^" + pat_range + "$", "\\2", pre))
+                    end = int(re.sub("^" + pat_range + "$", "\\3", pre))
+                    year = re.sub("^" + pat_range + "$", "\\4", pre)
+                    post = last
+                    delimiter = ""
+                    for n in range(start, end + 1):
+                        post += delimiter + str(n) + year
+                        delimiter = ", "
+                    azs = post.join(azs.split(pre))
+                # xxx und yyy/zz -> xxx/zz, yyy/zz
+                pat_n = "([A-Z,] )([0-9]+) und ([0-9]+)(/[0-9]+)"
+                while re.match("^.*" + pat_n + ".*$", azs):
+                    azs = re.sub(pat_n, "\\1\\2\\4, \\3\\4", azs)
+                # xxx, yyy/zz -> xxx/zz, yyy/zz
+                pat_n = "([A-Z,] ?)([0-9]+), ?([0-9]+)(/[0-9]+)"
+                while re.match("^.*" + pat_n + ".*$", azs):
+                    azs = re.sub(pat_n, "\\1\\2\\4, \\3\\4", azs)
+                rz = "(Bv[ABCEFGHKLMNOPQR]|PBv[UV])"
+                # x Rr yyy/zz, uuu/vv -> x Rr yyy/zz, x Rr uuu/vv
+                pat_ny = "([0-9] " + rz + ") ([0-9]+/[0-9]+), ([0-9]+/[0-9]+)"
+                while re.match("^.*" + pat_ny + ".*$", azs):
+                    azs = re.sub(pat_ny, "\\1 \\3, \\1 \\4", azs)
+                pat_az = "([0-9] " + rz + "|PBvV) [0-9]+/[0-9]+"
+                for az in azs.split(", "):
+                    az = re.sub(" +$", "", az)
+                    if not az in self.bverfg_az_bmj:
+                        needed = az
+                        break
+            if not needed:
+                continue
+            for link in row.xpath(".//td[1]/a/@href"):
+                if "/Entscheidungen/" in link.get():
+                    monate = {
+                        'Januar': '01',
+                        'Februar': '02',
+                        'März': '03',
+                        'April': '04',
+                        'Mai': '05',
+                        'Juni': '06',
+                        'Juli': '07',
+                        'August': '08',
+                        'September': '09',
+                        'Oktober': '10',
+                        'November': '11',
+                        'Dezember': '12',
+                    }
+                    monat = "(" + "|".join(list(monate)) + ")"
+                    pat_d = "([0-9]+)[.] " + monat + " ([0-9]{4})"
+                    pat = "(Beschluss|Urteil) vom " + pat_d
+                    date_raw = row.xpath(".//td[2]/text()").get()
+                    if not re.match(pat, date_raw):
+                        continue
+                    date_raw = re.sub(
+                        "^(Beschluss|Urteil) vom (" + pat_d + ")$",
+                        "\\2",
+                        date_raw
+                    )
+                    date_ymd = re.sub(
+                        pat_d,
+                        "\\3",
+                        date_raw
+                    ) + monate[
+                        re.sub(pat_d, "\\2", date_raw)
+                    ] + re.sub(
+                        pat_d,
+                        "\\1",
+                        date_raw
+                    ).zfill(2)
+                    yield {
+                        "wait": self.wait,
+                        "date": date_ymd,
+                        "az": needed,
+                        "court": "bverfg",
+                        "link": response.urljoin(link.get()),
+                    }
diff --git a/gesp/src/create_file.py b/gesp/src/create_file.py
index 200beb4..7682e4c 100644
--- a/gesp/src/create_file.py
+++ b/gesp/src/create_file.py
@@ -13,7 +13,13 @@ def info(item):
 
 def save_as_html(item, spider_name, spider_path, store_docId): # spider.name, spider.path
     info(item)
-    if (spider_name == "bund") or (spider_name == "by"): # Sonderfall Bund und Bayern: *.zip mit *.xml
+    is_zip_xml = False
+    if spider_name == "bund":
+        if not item["link"].startswith("https://www.bundesverfassungsgericht.de/"):
+            is_zip_xml = True
+    if spider_name == "by":
+        is_zip_xml = True
+    if is_zip_xml: # Sonderfall Bund und Bayern: *.zip mit *.xml
         filename = item["court"] + "_" + item["date"] + "_" + item["az"]
         if store_docId and item.get('docId'):
             filename += "_" + item['docId']
@@ -45,6 +51,20 @@ def save_as_html(item, spider_name, spider_path, store_docId): # spider.name, sp
                 output(f"could not create file {filepath}", "err")
         else:
             return item
+    elif "link" in item:
+        filename = item["court"] + "_" + item["date"] + "_" + item["az"]
+        if store_docId and item.get('docId'):
+            filename += "_" + item['docId']
+        filename += ".html"
+        filepath = os.path.join(spider_path, spider_name, filename)
+        enc = "utf-8"
+        try:
+            #while True:
+            with open(filepath, "w", encoding=enc) as f:
+                f.write(requests.get(item["link"]).content.decode(enc))
+            # break
+        except:
+            output(f"could not create file {filepath}", "err")
     else:
         output("could not retrieve " + item["link"], "err")
 
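The Aktenzeichen column on the BVerfGE list pages abbreviates joined proceedings (ranges with "bis", pairs with "und", comma-separated numbers sharing one year), and parse_bverfg_list above rewrites each cell into one fully qualified Aktenzeichen per decision before checking it against the BMJ feed. Below is a minimal standalone sketch of those rewriting steps, runnable without Scrapy; expand_azs is an illustrative name, the "bis" range expansion is omitted (it works analogously), and the sample input is invented, not taken from a real list page:

import re

def expand_azs(azs):
    azs = re.sub("[),]$", "", azs)                     # strip a trailing "," or ")"
    azs = re.sub("\xa0", " ", azs)                     # non-breaking space -> space
    azs = re.sub("([0-9]+) (/[0-9]+)", "\\1\\2", azs)  # "16 /62" -> "16/62"
    azs = re.sub(",? u[.] ?a[.]? ?(,|$)", "\\1", azs)  # drop "u. a." suffixes
    # "16 und 25/62" -> "16/62, 25/62"
    pat_n = "([A-Z,] )([0-9]+) und ([0-9]+)(/[0-9]+)"
    while re.match("^.*" + pat_n + ".*$", azs):
        azs = re.sub(pat_n, "\\1\\2\\4, \\3\\4", azs)
    # "16, 17/62" -> "16/62, 17/62"
    pat_n = "([A-Z,] ?)([0-9]+), ?([0-9]+)(/[0-9]+)"
    while re.match("^.*" + pat_n + ".*$", azs):
        azs = re.sub(pat_n, "\\1\\2\\4, \\3\\4", azs)
    # "1 BvR 16/62, 17/62" -> "1 BvR 16/62, 1 BvR 17/62"
    rz = "(Bv[ABCEFGHKLMNOPQR]|PBv[UV])"
    pat_ny = "([0-9] " + rz + ") ([0-9]+/[0-9]+), ([0-9]+/[0-9]+)"
    while re.match("^.*" + pat_ny + ".*$", azs):
        azs = re.sub(pat_ny, "\\1 \\3, \\1 \\4", azs)
    return azs.split(", ")

print(expand_azs("1 BvR 16, 17, 18/62"))
# -> ['1 BvR 16/62', '1 BvR 17/62', '1 BvR 18/62']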
From 0991eb1866be64b2a7a6de37c3e8c7d62122d392 Mon Sep 17 00:00:00 2001
From: Isidor Zeuner
Date: Mon, 23 Sep 2024 22:20:50 +0200
Subject: [PATCH 2/2] cleanup

---
 gesp/src/create_file.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/gesp/src/create_file.py b/gesp/src/create_file.py
index 7682e4c..cf6225c 100644
--- a/gesp/src/create_file.py
+++ b/gesp/src/create_file.py
@@ -59,10 +59,8 @@ def save_as_html(item, spider_name, spider_path, store_docId): # spider.name, sp
         filepath = os.path.join(spider_path, spider_name, filename)
         enc = "utf-8"
         try:
-            #while True:
             with open(filepath, "w", encoding=enc) as f:
                 f.write(requests.get(item["link"]).content.decode(enc))
-            # break
         except:
             output(f"could not create file {filepath}", "err")
         else:
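For the date column, parse_bverfg_list in patch 1 maps the German month name to its number and zero-pads the day, so a cell like "Beschluss vom 15. Januar 1958" yields the compact form "19580115" carried in the item's "date" field. A runnable sketch of just that normalization; the input string is an invented example in the page's format:

import re

monate = {
    'Januar': '01', 'Februar': '02', 'März': '03', 'April': '04',
    'Mai': '05', 'Juni': '06', 'Juli': '07', 'August': '08',
    'September': '09', 'Oktober': '10', 'November': '11', 'Dezember': '12',
}
monat = "(" + "|".join(monate) + ")"
pat_d = "([0-9]+)[.] " + monat + " ([0-9]{4})"

# strip the leading decision type, keeping only the date part
date_raw = re.sub(
    "^(Beschluss|Urteil) vom (" + pat_d + ")$",
    "\\2",
    "Beschluss vom 15. Januar 1958"
)
date_ymd = (
    re.sub(pat_d, "\\3", date_raw)             # year:  "1958"
    + monate[re.sub(pat_d, "\\2", date_raw)]   # month: "01"
    + re.sub(pat_d, "\\1", date_raw).zfill(2)  # day:   "15"
)
print(date_ymd)  # 19580115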