From 422995f5fe27b199e801418067ac91e308e3452d Mon Sep 17 00:00:00 2001
From: Daniel_Stoxreiter
Date: Fri, 17 Jun 2022 13:37:12 +0200
Subject: [PATCH 1/5] merging p with following p prev

---
 freud_api_crawler/fixtures/make_tei.xslt | 33 +++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/freud_api_crawler/fixtures/make_tei.xslt b/freud_api_crawler/fixtures/make_tei.xslt
index 467bbc4..cc87eac 100644
--- a/freud_api_crawler/fixtures/make_tei.xslt
+++ b/freud_api_crawler/fixtures/make_tei.xslt
@@ -23,8 +23,35 @@ #################### -->
[XSLT markup of this hunk is not preserved in this copy of the patch]
@@ -35,9 +62,9 @@
[XSLT markup of this hunk is not preserved in this copy of the patch]
[PATCH 2/5: mail header and diffstat not preserved in this copy; '…' in the diffs below marks string content lost with the stripped markup]

diff --git a/freud_api_crawler/freud_api_crawler.py b/freud_api_crawler/freud_api_crawler.py
index 0b4f92a..3b801e8 100644
--- a/freud_api_crawler/freud_api_crawler.py
+++ b/freud_api_crawler/freud_api_crawler.py
@@ -429,7 +429,8 @@ def make_xml(self, save=False, limit=True):
             pb_el = make_pb(
                 pp['page_nr'],
                 f"{FRD_BASE}{pp['faks__payload']}",
-                pp['faks__id']
+                pp['faks__id'],
+                f"page__{pp['id']}"
             )
             cur_div = div.xpath('//tei:div', namespaces=self.nsmap)[0]
             cur_div.insert(0, pb_el)
diff --git a/freud_api_crawler/string_utils.py b/freud_api_crawler/string_utils.py
index ca02909..904d019 100644
--- a/freud_api_crawler/string_utils.py
+++ b/freud_api_crawler/string_utils.py
@@ -5,6 +5,7 @@
     ('-…', ''),
     ('…', '\n'),
     ('…', '…\n'),
+    ('-…', '…'),
     ('‚', ','),
     ('ı', 'i')
 ]
diff --git a/freud_api_crawler/tei_utils.py b/freud_api_crawler/tei_utils.py
index 0017139..8fa3a4f 100644
--- a/freud_api_crawler/tei_utils.py
+++ b/freud_api_crawler/tei_utils.py
@@ -1,7 +1,7 @@
 import lxml.etree as ET
 
 
-def make_pb(n, faks_url, faks_id):
+def make_pb(n, faks_url, faks_id, page_id):
     """ returns a tei:pb """
     pb_el = ET.Element("{http://www.tei-c.org/ns/1.0}pb")
@@ -10,5 +10,6 @@ def make_pb(n, faks_url, faks_id):
     pb_el.attrib[
         "{http://www.w3.org/XML/1998/namespace}id"
     ] = f"faks__{faks_id}"
+    pb_el.attrib['next'] = f"{page_id}"
     return pb_el
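To see what the extra argument does, here is a minimal sketch that calls the
four-argument make_pb() from this patch directly. The page number, facsimile
URL and ids are invented sample values; in make_xml() they come from the
processed page JSON.

    import lxml.etree as ET

    from freud_api_crawler.tei_utils import make_pb

    # invented sample values, standing in for what make_xml() reads
    # from the processed page dict
    pb_el = make_pb(
        4,                               # page number      -> @n
        "https://example.com/faks.jpg",  # facsimile URL    -> @facs
        "abc123",                        # facsimile id     -> @xml:id ("faks__abc123")
        "page__def456"                   # prefixed page id -> @next (added by this patch)
    )
    print(ET.tostring(pb_el).decode("utf-8"))
    # <pb xmlns="http://www.tei-c.org/ns/1.0" n="4" facs="https://example.com/faks.jpg" xml:id="faks__abc123" next="page__def456"/>

Note that the caller builds the next value itself (f"page__{pp['id']}");
make_pb() just copies whatever it receives into the attribute.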
From 51bbca0e965a029d3ccac768379f2fec89a37961 Mon Sep 17 00:00:00 2001
From: Daniel_Stoxreiter
Date: Mon, 20 Jun 2022 11:24:23 +0200
Subject: [PATCH 3/5] handle missing whitespace in merged paragraphs: added lb break=pargraph el

---
 freud_api_crawler/fixtures/make_tei.xslt | 3 +++
 freud_api_crawler/freud_api_crawler.py   | 7 +------
 freud_api_crawler/string_utils.py        | 4 ++--
 freud_api_crawler/tei_utils.py           | 8 +++-----
 4 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/freud_api_crawler/fixtures/make_tei.xslt b/freud_api_crawler/fixtures/make_tei.xslt
index 107bcfd..b8e46b3 100644
--- a/freud_api_crawler/fixtures/make_tei.xslt
+++ b/freud_api_crawler/fixtures/make_tei.xslt
@@ -55,6 +55,9 @@
[three added XSLT lines; markup not preserved in this copy, only a no-break space remains]
diff --git a/freud_api_crawler/freud_api_crawler.py b/freud_api_crawler/freud_api_crawler.py
index 3b801e8..4c90068 100644
--- a/freud_api_crawler/freud_api_crawler.py
+++ b/freud_api_crawler/freud_api_crawler.py
@@ -426,12 +426,7 @@ def make_xml(self, save=False, limit=True):
             page_json = self.get_page(x['id'])
             pp = self.process_page(page_json)
             div = ET.fromstring(pp['body'])
-            pb_el = make_pb(
-                pp['page_nr'],
-                f"{FRD_BASE}{pp['faks__payload']}",
-                pp['faks__id'],
-                f"page__{pp['id']}"
-            )
+            pb_el = make_pb(pp)
             cur_div = div.xpath('//tei:div', namespaces=self.nsmap)[0]
             cur_div.insert(0, pb_el)
             body.append(div)
diff --git a/freud_api_crawler/string_utils.py b/freud_api_crawler/string_utils.py
index 904d019..f85c02f 100644
--- a/freud_api_crawler/string_utils.py
+++ b/freud_api_crawler/string_utils.py
@@ -4,8 +4,8 @@
     ('\n', ''),
     ('-…', ''),
     ('…', '\n'),
-    ('…', '…\n'),
-    ('-…', '…'),
+    ('…', '…\n'),
+    ('-…', '…'),
     ('‚', ','),
     ('ı', 'i')
 ]
diff --git a/freud_api_crawler/tei_utils.py b/freud_api_crawler/tei_utils.py
index 8fa3a4f..dae1e33 100644
--- a/freud_api_crawler/tei_utils.py
+++ b/freud_api_crawler/tei_utils.py
@@ -1,15 +1,13 @@
 import lxml.etree as ET
 
 
-def make_pb(n, faks_url, faks_id, page_id):
+def make_pb(json):
     """ returns a tei:pb """
     pb_el = ET.Element("{http://www.tei-c.org/ns/1.0}pb")
-    pb_el.attrib['n'] = f"{n}"
-    pb_el.attrib['facs'] = f"{faks_url}"
+    pb_el.attrib['n'] = f"{json['page_nr']}"
     pb_el.attrib[
         "{http://www.w3.org/XML/1998/namespace}id"
-    ] = f"faks__{faks_id}"
-    pb_el.attrib['next'] = f"{page_id}"
+    ] = f"page__{json['id']}"
     return pb_el
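After this refactoring make_pb() reads everything it needs from the processed
page dict, so make_xml() can simply pass that dict through. A minimal sketch;
the dict below is an invented stand-in for what process_page() returns,
reduced to the two keys make_pb() actually uses.

    import lxml.etree as ET

    from freud_api_crawler.tei_utils import make_pb

    # invented stand-in for the processed page dict
    pp = {"page_nr": 4, "id": "def456"}

    pb_el = make_pb(pp)
    print(ET.tostring(pb_el).decode("utf-8"))
    # <pb xmlns="http://www.tei-c.org/ns/1.0" n="4" xml:id="page__def456"/>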
From c18ede0a1b9f0054ea4d2860d492d097ee6905df Mon Sep 17 00:00:00 2001
From: Daniel_Stoxreiter
Date: Mon, 20 Jun 2022 11:31:02 +0200
Subject: [PATCH 4/5] correcting lint errors

---
 .flake8                           | 1 +
 freud_api_crawler/string_utils.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.flake8 b/.flake8
index ed5b57e..afe90ba 100644
--- a/.flake8
+++ b/.flake8
@@ -7,3 +7,4 @@ exclude =
     build
     dist
     env
+    venv
diff --git a/freud_api_crawler/string_utils.py b/freud_api_crawler/string_utils.py
index f85c02f..6de86f4 100644
--- a/freud_api_crawler/string_utils.py
+++ b/freud_api_crawler/string_utils.py
@@ -5,7 +5,7 @@
     ('-…', ''),
     ('…', '\n'),
     ('…', '…\n'),
-    ('-…', '…'),
+    ('-…', '…'),
     ('‚', ','),
     ('ı', 'i')
 ]

From c8c05da62709ee491ca82cba34f37b24d769ef44 Mon Sep 17 00:00:00 2001
From: Daniel_Stoxreiter
Date: Mon, 20 Jun 2022 11:36:48 +0200
Subject: [PATCH 5/5] adapting tests for make_pb

---
 tests/test_freud_api_crawler.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/test_freud_api_crawler.py b/tests/test_freud_api_crawler.py
index e33f372..0c6319d 100644
--- a/tests/test_freud_api_crawler.py
+++ b/tests/test_freud_api_crawler.py
@@ -205,10 +205,13 @@ def tearDown(self):
     def test_001_make_pg(self):
         """ Test make_pb"""
         pb_el = tei_utils.make_pb(
-            1, 'https://whatever.com', "1234sieben"
+            {
+                "page_nr": 1,
+                "id": "xyz"
+            }
         )
         pb_str = ET.tostring(pb_el).decode('utf-8')
         self.assertEqual(
             pb_str,
-            '<pb xmlns="http://www.tei-c.org/ns/1.0" n="1" facs="https://whatever.com" xml:id="faks__1234sieben"/>'  # noqa: E501
+            '<pb xmlns="http://www.tei-c.org/ns/1.0" n="1" xml:id="page__xyz"/>'  # noqa: E501
         )
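As a quick sanity check, the expected string in the adapted assertion can be
reproduced from the rewritten make_pb() with the same values the test uses;
this snippet is not part of the patch.

    import lxml.etree as ET

    from freud_api_crawler.tei_utils import make_pb

    # same input as test_001_make_pg above
    pb_str = ET.tostring(make_pb({"page_nr": 1, "id": "xyz"})).decode("utf-8")
    assert pb_str == '<pb xmlns="http://www.tei-c.org/ns/1.0" n="1" xml:id="page__xyz"/>'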