From d4ec4d12b96fbd33e4017e1fba87b102cb9aa1ea Mon Sep 17 00:00:00 2001 From: Albert Sawczyn Date: Wed, 23 Oct 2024 16:48:15 +0200 Subject: [PATCH] feat: add extract_data_from_pages.py --- dev_notebooks/2_nsa_extraction.ipynb | 663 +------------------------ juddges/data/nsa/extractor.py | 31 +- scripts/nsa/extract_data_from_pages.py | 35 ++ 3 files changed, 79 insertions(+), 650 deletions(-) create mode 100644 scripts/nsa/extract_data_from_pages.py diff --git a/dev_notebooks/2_nsa_extraction.ipynb b/dev_notebooks/2_nsa_extraction.ipynb index 1089b2c..b1cf744 100644 --- a/dev_notebooks/2_nsa_extraction.ipynb +++ b/dev_notebooks/2_nsa_extraction.ipynb @@ -1,37 +1,18 @@ { "cells": [ { - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-22T14:47:17.987968Z", - "start_time": "2024-10-22T14:47:17.978639Z" - } - }, + "metadata": {}, "cell_type": "code", "source": [ "%load_ext autoreload\n", "%autoreload 2" ], "id": "9029f8e669cfb5dc", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], - "execution_count": 11 + "outputs": [], + "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-22T14:47:17.998309Z", - "start_time": "2024-10-22T14:47:17.989090Z" - } - }, + "metadata": {}, "cell_type": "code", "source": [ "from dataclasses import asdict\n", @@ -43,37 +24,19 @@ "import pandas as pd\n", "from mpire import WorkerPool\n", "\n", - "lf = pl.scan_parquet(NSA_DATA_PATH / \"pages\" / \"pages_chunk_0.parquet\")" + "lf = pl.scan_parquet(NSA_DATA_PATH / \"pages\" / \"pages_chunk_12.parquet\")" ], "id": "7aea733cd6d05a0b", "outputs": [], - "execution_count": 12 + "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-22T14:47:18.975979Z", - "start_time": "2024-10-22T14:47:17.999105Z" - } - }, + "metadata": {}, "cell_type": "code", "source": "df = lf.collect()", "id": "fd8407a0c08061c9", "outputs": [], - "execution_count": 13 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-22T14:47:18.983978Z", - "start_time": "2024-10-22T14:47:18.976839Z" - } - }, - "cell_type": "code", - "source": "# df[2][\"page\"]", - "id": "2e10800d536614a6", - "outputs": [], - "execution_count": 14 + "execution_count": null }, { "metadata": {}, @@ -88,12 +51,7 @@ "id": "5d30eac358952608" }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-22T14:47:53.959752Z", - "start_time": "2024-10-22T14:47:18.984420Z" - } - }, + "metadata": {}, "cell_type": "code", "source": [ "from juddges.data.nsa.extractor import NSADataExtractor, ORDER\n", @@ -102,31 +60,16 @@ "df = extractor.extract_data_from_pages_to_df(df[\"page\"], df[\"doc_id\"])" ], "id": "ec0c85fa029d48ad", - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/albert/miniconda3/envs/JuDDGES/lib/python3.11/site-packages/mpire/pool.py:692: RuntimeWarning: Failed to obtain length of iterable when chunk size or number of splits is None. Chunk size is set to 4. Remedy: either provide an iterable with a len() function or specify iterable_len in the function call\n", - " n_tasks, max_tasks_active, chunk_size, progress_bar, progress_bar_options = check_map_parameters(\n", - "100%|██████████| 50000/50000 [00:33<00:00, 1429.11it/s]\n" - ] - } - ], - "execution_count": 15 + "outputs": [], + "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-22T14:47:53.968091Z", - "start_time": "2024-10-22T14:47:53.960521Z" - } - }, + "metadata": {}, "cell_type": "code", "source": "assert len(df.columns) == len(ORDER)", "id": "ddbb4144a15d8d55", "outputs": [], - "execution_count": 16 + "execution_count": null }, { "metadata": {}, @@ -135,592 +78,36 @@ "id": "bdcdd66d906fee81" }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-22T14:55:37.134146Z", - "start_time": "2024-10-22T14:55:37.119658Z" - } - }, + "metadata": {}, "cell_type": "code", "source": "len(ORDER)", "id": "1b3cbf67960cf4e2", - "outputs": [ - { - "data": { - "text/plain": [ - "22" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 20 + "outputs": [], + "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-22T14:47:54.031773Z", - "start_time": "2024-10-22T14:47:54.023730Z" - } - }, + "metadata": {}, "cell_type": "code", "source": "df.columns", "id": "9097ea94aded3ae8", - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['id', 'Docket number', 'Type of decision', 'Finality',\n", - " 'The day of the judgment', 'Date of submission', 'Court', 'Judges',\n", - " 'Presiding judge', 'Judge rapporteur',\n", - " 'Type of case with the detailed description', 'Keywords',\n", - " 'Related docket numbers', 'Challenged authority',\n", - " 'Nature of the verdict', 'The cited provisions',\n", - " 'Published in official collection of judgments Jurisprudence of the Voivodeship Administrative Courts and the Supreme Administrative Court',\n", - " 'Information on glosa(s)', 'Theses', 'Sentence of the judgment',\n", - " 'Reasons for judgment', 'Dissenting opinion'],\n", - " dtype='object')" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 18 + "outputs": [], + "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-22T14:56:05.193139Z", - "start_time": "2024-10-22T14:56:05.175702Z" - } - }, + "metadata": {}, "cell_type": "code", - "source": "df[:1000]", + "source": "df[df['Published in official collection of judgments Jurisprudence of the Voivodeship Administrative Courts and the Supreme Administrative Court'].notna()]", "id": "5d466f9f5a3a20ed", - "outputs": [ - { - "data": { - "text/plain": [ - " id Docket number Type of decision \\\n", - "0 /doc/BC8847077B II SA/Ol 1164/16 Postanowienie WSA w Olsztynie \n", - "1 /doc/C8D885030F III SA/Po 595/21 Wyrok WSA w Poznaniu \n", - "2 /doc/6447E5BA57 II SA/Op 8/05 Wyrok WSA w Opolu \n", - "3 /doc/B4984CED3A II FSK 1938/08 Wyrok NSA \n", - "4 /doc/21CBD4642A III SA/Wa 469/14 Postanowienie WSA w Warszawie \n", - ".. ... ... ... \n", - "995 /doc/F716824513 I SA/Wa 640/17 Postanowienie WSA w Warszawie \n", - "996 /doc/DE05AA888C I SA/Wa 660/18 Postanowienie WSA w Warszawie \n", - "997 /doc/FC756C2C43 I GZ 533/13 Postanowienie NSA \n", - "998 /doc/CE390B2319 II SA/Wr 475/17 Postanowienie WSA we Wrocławiu \n", - "999 /doc/7745497F74 II SA/Wa 2267/14 Postanowienie WSA w Warszawie \n", - "\n", - " Finality The day of the judgment Date of submission \\\n", - "0 orzeczenie prawomocne 2016-10-26 2016-09-21 \n", - "1 orzeczenie nieprawomocne 2021-10-14 2021-04-02 \n", - "2 orzeczenie prawomocne 2005-10-13 2005-01-12 \n", - "3 orzeczenie prawomocne 2010-04-02 2008-11-12 \n", - "4 None 2015-05-19 2014-02-10 \n", - ".. ... ... ... \n", - "995 orzeczenie prawomocne 2017-06-22 2017-04-26 \n", - "996 None 2018-10-30 2018-04-13 \n", - "997 orzeczenie prawomocne 2013-12-10 2013-11-28 \n", - "998 orzeczenie nieprawomocne 2017-09-08 2017-07-13 \n", - "999 None 2016-09-20 2014-12-19 \n", - "\n", - " Court \\\n", - "0 Wojewódzki Sąd Administracyjny w Olsztynie \n", - "1 Wojewódzki Sąd Administracyjny w Poznaniu \n", - "2 Wojewódzki Sąd Administracyjny w Opolu \n", - "3 Naczelny Sąd Administracyjny \n", - "4 Wojewódzki Sąd Administracyjny w Warszawie \n", - ".. ... \n", - "995 Wojewódzki Sąd Administracyjny w Warszawie \n", - "996 Wojewódzki Sąd Administracyjny w Warszawie \n", - "997 Naczelny Sąd Administracyjny \n", - "998 Wojewódzki Sąd Administracyjny we Wrocławiu \n", - "999 Wojewódzki Sąd Administracyjny w Warszawie \n", - "\n", - " Judges Presiding judge \\\n", - "0 [S. Beata Jezielska] S. Beata Jezielska \n", - "1 [Izabela Paluszyńska, Piotr Ławrynowicz, Walen... Izabela Paluszyńska \n", - "2 [Grażyna Jeżewska, Jerzy Krupiński, Teresa Cisyk] Jerzy Krupiński \n", - "3 [Bogusław Woźniak, Grzegorz Borkowski, Stefan ... Grzegorz Borkowski \n", - "4 [Maciej Kurasz] Maciej Kurasz \n", - ".. ... ... \n", - "995 [Elżbieta Sobielarska] Elżbieta Sobielarska \n", - "996 [Iwona Kosińska] Iwona Kosińska \n", - "997 [Wojciech Kręcisz] Wojciech Kręcisz \n", - "998 [Władysław Kulon] Władysław Kulon \n", - "999 [Ewa Pisula-Dąbrowska] Ewa Pisula-Dąbrowska \n", - "\n", - " Judge rapporteur ... \\\n", - "0 S. Beata Jezielska ... \n", - "1 Izabela Paluszyńska ... \n", - "2 Teresa Cisyk ... \n", - "3 Bogusław Woźniak ... \n", - "4 Maciej Kurasz ... \n", - ".. ... ... \n", - "995 Elżbieta Sobielarska ... \n", - "996 Iwona Kosińska ... \n", - "997 Wojciech Kręcisz ... \n", - "998 Władysław Kulon ... \n", - "999 Ewa Pisula-Dąbrowska ... \n", - "\n", - " Related docket numbers \\\n", - "0 NaN \n", - "1 NaN \n", - "2 NaN \n", - "3 [{'Docket number': 'I SA/Wr 183/08', 'Type of ... \n", - "4 NaN \n", - ".. ... \n", - "995 NaN \n", - "996 [{'Docket number': 'I OSK 945/22', 'Type of de... \n", - "997 [{'Docket number': 'III SA/Gl 1494/12', 'Type ... \n", - "998 [{'Docket number': 'I OZ 953/16', 'Type of dec... \n", - "999 NaN \n", - "\n", - " Challenged authority \\\n", - "0 Dyrektor Izby Celnej \n", - "1 Dyrektor Oddziału Regionalnego Agencji Restruk... \n", - "2 Komendant Policji \n", - "3 Dyrektor Izby Skarbowej \n", - "4 Dyrektor Izby Skarbowej \n", - ".. ... \n", - "995 Samorządowe Kolegium Odwoławcze \n", - "996 Inne \n", - "997 Dyrektor Izby Celnej \n", - "998 Wojewódzki Inspektor Nadzoru Budowlanego \n", - "999 Minister Nauki i Szkolnictwa Wyższego \n", - "\n", - " Nature of the verdict \\\n", - "0 [Odrzucono skargę] \n", - "1 [Oddalono skargę] \n", - "2 [Uchylono decyzję I i II instancji] \n", - "3 [Oddalono skargę kasacyjną] \n", - "4 [Odrzucono skargę kasacyjną] \n", - ".. ... \n", - "995 [Odrzucono skargę] \n", - "996 [Wstrzymano wykonanie zaskarżonej decyzji] \n", - "997 [Uchylono zaskarżone postanowienie i przekazan... \n", - "998 [*Odrzucono skargę] \n", - "999 [Podjęto zawieszone postępowanie] \n", - "\n", - " The cited provisions \\\n", - "0 [{'Journal of laws': 'Dz.U. 2016 poz 718', 'Ar... \n", - "1 [{'Journal of laws': 'Dz.U. 2019 poz 2325', 'A... \n", - "2 [{'Journal of laws': 'Dz.U. 2002 nr 7 poz. 58'... \n", - "3 [{'Journal of laws': 'Dz.U. 1997 nr 123 poz 77... \n", - "4 [{'Journal of laws': 'Dz.U. 2012 poz 270', 'Ar... \n", - ".. ... \n", - "995 NaN \n", - "996 [{'Journal of laws': 'Dz.U. 2018 poz 1302', 'A... \n", - "997 [{'Journal of laws': 'Dz.U. 2002 nr 153 poz 12... \n", - "998 [{'Journal of laws': 'Dz.U. 2017 poz 1369', 'A... \n", - "999 NaN \n", - "\n", - " Published in official collection of judgments Jurisprudence of the Voivodeship Administrative Courts and the Supreme Administrative Court \\\n", - "0 NaN \n", - "1 NaN \n", - "2 ONSAiWSA 2006 4 poz. 116 \n", - "3 NaN \n", - "4 NaN \n", - ".. ... \n", - "995 NaN \n", - "996 NaN \n", - "997 NaN \n", - "998 NaN \n", - "999 NaN \n", - "\n", - " Information on glosa(s) \\\n", - "0 NaN \n", - "1 NaN \n", - "2 NaN \n", - "3 NaN \n", - "4 NaN \n", - ".. ... \n", - "995 NaN \n", - "996 NaN \n", - "997 NaN \n", - "998 NaN \n", - "999 NaN \n", - "\n", - " Theses \\\n", - "0 NaN \n", - "1 NaN \n", - "2 Przepisy art. 32 ust. 1 i 2 ustawy z dnia 23 m... \n", - "3 NaN \n", - "4 NaN \n", - ".. ... \n", - "995 NaN \n", - "996 NaN \n", - "997 NaN \n", - "998 NaN \n", - "999 NaN \n", - "\n", - " Sentence of the judgment \\\n", - "0 Wojewódzki Sąd Administracyjny w Olsztynie w s... \n", - "1 Dnia 14 października 2021 roku Wojewódzki Sąd ... \n", - "2 Wojewódzki Sąd Administracyjny w Opolu w skład... \n", - "3 Naczelny Sąd Administracyjny w składzie: Przew... \n", - "4 Dnia 19 maja 2015 r. Wojewódzki Sąd Administra... \n", - ".. ... \n", - "995 Wojewódzki Sąd Administracyjny w Warszawie w s... \n", - "996 Wojewódzki Sąd Administracyjny w Warszawie, w ... \n", - "997 Naczelny Sąd Administracyjny w składzie: Przew... \n", - "998 Wojewódzki Sąd Administracyjny we Wrocławiu w ... \n", - "999 Wojewódzki Sąd Administracyjny w Warszawie w s... \n", - "\n", - " Reasons for judgment Dissenting opinion \n", - "0 Spółka A z siedzibą w W - reprezentowana przez... NaN \n", - "1 Decyzją z [...] lutego 2020r. Nr [...] Kierown... NaN \n", - "2 Komendant Miejski Policji w O., rozkazem perso... NaN \n", - "3 Wojewódzki Sąd Administracyjny we Wrocławiu wy... NaN \n", - "4 Wojewódzki Sąd Administracyjny w Warszawie wyr... NaN \n", - ".. ... ... \n", - "995 Pismem z dnia [...] marca 2017 r. M. O. i K. O... NaN \n", - "996 R. K. wniósł do Wojewódzkiego Sądu Administrac... NaN \n", - "997 Wyrokiem z 4 września 2013 r. Wojewódzki Sąd A... NaN \n", - "998 Stosownie do treści art. 230 § 1 ustawy z dnia... NaN \n", - "999 NaN NaN \n", - "\n", - "[1000 rows x 22 columns]" - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idDocket numberType of decisionFinalityThe day of the judgmentDate of submissionCourtJudgesPresiding judgeJudge rapporteur...Related docket numbersChallenged authorityNature of the verdictThe cited provisionsPublished in official collection of judgments Jurisprudence of the Voivodeship Administrative Courts and the Supreme Administrative CourtInformation on glosa(s)ThesesSentence of the judgmentReasons for judgmentDissenting opinion
0/doc/BC8847077BII SA/Ol 1164/16Postanowienie WSA w Olsztynieorzeczenie prawomocne2016-10-262016-09-21Wojewódzki Sąd Administracyjny w Olsztynie[S. Beata Jezielska]S. Beata JezielskaS. Beata Jezielska...NaNDyrektor Izby Celnej[Odrzucono skargę][{'Journal of laws': 'Dz.U. 2016 poz 718', 'Ar...NaNNaNNaNWojewódzki Sąd Administracyjny w Olsztynie w s...Spółka A z siedzibą w W - reprezentowana przez...NaN
1/doc/C8D885030FIII SA/Po 595/21Wyrok WSA w Poznaniuorzeczenie nieprawomocne2021-10-142021-04-02Wojewódzki Sąd Administracyjny w Poznaniu[Izabela Paluszyńska, Piotr Ławrynowicz, Walen...Izabela PaluszyńskaIzabela Paluszyńska...NaNDyrektor Oddziału Regionalnego Agencji Restruk...[Oddalono skargę][{'Journal of laws': 'Dz.U. 2019 poz 2325', 'A...NaNNaNNaNDnia 14 października 2021 roku Wojewódzki Sąd ...Decyzją z [...] lutego 2020r. Nr [...] Kierown...NaN
2/doc/6447E5BA57II SA/Op 8/05Wyrok WSA w Opoluorzeczenie prawomocne2005-10-132005-01-12Wojewódzki Sąd Administracyjny w Opolu[Grażyna Jeżewska, Jerzy Krupiński, Teresa Cisyk]Jerzy KrupińskiTeresa Cisyk...NaNKomendant Policji[Uchylono decyzję I i II instancji][{'Journal of laws': 'Dz.U. 2002 nr 7 poz. 58'...ONSAiWSA 2006 4 poz. 116NaNPrzepisy art. 32 ust. 1 i 2 ustawy z dnia 23 m...Wojewódzki Sąd Administracyjny w Opolu w skład...Komendant Miejski Policji w O., rozkazem perso...NaN
3/doc/B4984CED3AII FSK 1938/08Wyrok NSAorzeczenie prawomocne2010-04-022008-11-12Naczelny Sąd Administracyjny[Bogusław Woźniak, Grzegorz Borkowski, Stefan ...Grzegorz BorkowskiBogusław Woźniak...[{'Docket number': 'I SA/Wr 183/08', 'Type of ...Dyrektor Izby Skarbowej[Oddalono skargę kasacyjną][{'Journal of laws': 'Dz.U. 1997 nr 123 poz 77...NaNNaNNaNNaczelny Sąd Administracyjny w składzie: Przew...Wojewódzki Sąd Administracyjny we Wrocławiu wy...NaN
4/doc/21CBD4642AIII SA/Wa 469/14Postanowienie WSA w WarszawieNone2015-05-192014-02-10Wojewódzki Sąd Administracyjny w Warszawie[Maciej Kurasz]Maciej KuraszMaciej Kurasz...NaNDyrektor Izby Skarbowej[Odrzucono skargę kasacyjną][{'Journal of laws': 'Dz.U. 2012 poz 270', 'Ar...NaNNaNNaNDnia 19 maja 2015 r. Wojewódzki Sąd Administra...Wojewódzki Sąd Administracyjny w Warszawie wyr...NaN
..................................................................
995/doc/F716824513I SA/Wa 640/17Postanowienie WSA w Warszawieorzeczenie prawomocne2017-06-222017-04-26Wojewódzki Sąd Administracyjny w Warszawie[Elżbieta Sobielarska]Elżbieta SobielarskaElżbieta Sobielarska...NaNSamorządowe Kolegium Odwoławcze[Odrzucono skargę]NaNNaNNaNNaNWojewódzki Sąd Administracyjny w Warszawie w s...Pismem z dnia [...] marca 2017 r. M. O. i K. O...NaN
996/doc/DE05AA888CI SA/Wa 660/18Postanowienie WSA w WarszawieNone2018-10-302018-04-13Wojewódzki Sąd Administracyjny w Warszawie[Iwona Kosińska]Iwona KosińskaIwona Kosińska...[{'Docket number': 'I OSK 945/22', 'Type of de...Inne[Wstrzymano wykonanie zaskarżonej decyzji][{'Journal of laws': 'Dz.U. 2018 poz 1302', 'A...NaNNaNNaNWojewódzki Sąd Administracyjny w Warszawie, w ...R. K. wniósł do Wojewódzkiego Sądu Administrac...NaN
997/doc/FC756C2C43I GZ 533/13Postanowienie NSAorzeczenie prawomocne2013-12-102013-11-28Naczelny Sąd Administracyjny[Wojciech Kręcisz]Wojciech KręciszWojciech Kręcisz...[{'Docket number': 'III SA/Gl 1494/12', 'Type ...Dyrektor Izby Celnej[Uchylono zaskarżone postanowienie i przekazan...[{'Journal of laws': 'Dz.U. 2002 nr 153 poz 12...NaNNaNNaNNaczelny Sąd Administracyjny w składzie: Przew...Wyrokiem z 4 września 2013 r. Wojewódzki Sąd A...NaN
998/doc/CE390B2319II SA/Wr 475/17Postanowienie WSA we Wrocławiuorzeczenie nieprawomocne2017-09-082017-07-13Wojewódzki Sąd Administracyjny we Wrocławiu[Władysław Kulon]Władysław KulonWładysław Kulon...[{'Docket number': 'I OZ 953/16', 'Type of dec...Wojewódzki Inspektor Nadzoru Budowlanego[*Odrzucono skargę][{'Journal of laws': 'Dz.U. 2017 poz 1369', 'A...NaNNaNNaNWojewódzki Sąd Administracyjny we Wrocławiu w ...Stosownie do treści art. 230 § 1 ustawy z dnia...NaN
999/doc/7745497F74II SA/Wa 2267/14Postanowienie WSA w WarszawieNone2016-09-202014-12-19Wojewódzki Sąd Administracyjny w Warszawie[Ewa Pisula-Dąbrowska]Ewa Pisula-DąbrowskaEwa Pisula-Dąbrowska...NaNMinister Nauki i Szkolnictwa Wyższego[Podjęto zawieszone postępowanie]NaNNaNNaNNaNWojewódzki Sąd Administracyjny w Warszawie w s...NaNNaN
\n", - "

1000 rows × 22 columns

\n", - "
" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 21 + "outputs": [], + "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-22T14:47:54.051726Z", - "start_time": "2024-10-22T14:47:54.049833Z" - } - }, + "metadata": {}, "cell_type": "code", "source": "", "id": "a2d079fc5c0768c7", "outputs": [], - "execution_count": 19 + "execution_count": null } ], "metadata": { diff --git a/juddges/data/nsa/extractor.py b/juddges/data/nsa/extractor.py index 19becb6..2f85834 100644 --- a/juddges/data/nsa/extractor.py +++ b/juddges/data/nsa/extractor.py @@ -1,10 +1,11 @@ -from typing import Any, Iterable +from typing import Any, Iterable, Sequence import pandas as pd from mpire import WorkerPool import re from bs4 import BeautifulSoup, Tag + FIELD_MAP = { "id": "id", "Sygnatura": "Docket number", @@ -58,6 +59,15 @@ "Dissenting opinion", ] +LIST_TYPE_FIELDS = { + "Hasła tematyczne", + "Symbol z opisem", + "Sędziowie", + "Treść wyniku", + "Info. o glosach", + "Publikacja w u.z.o.", +} + class NSADataExtractor: def __init__(self) -> None: @@ -65,22 +75,24 @@ def __init__(self) -> None: assert (set(ORDER) - set(FIELD_MAP.values())) == set() def extract_data_from_pages( - self, pages: Iterable[str], doc_ids: Iterable[str] + self, pages: Sequence[str], doc_ids: Sequence[str], n_jobs: int | None = None ) -> list[dict[str, Any]]: extracted_data = [] - with WorkerPool() as pool: + with WorkerPool(n_jobs) as pool: args = ( {"page": page, "doc_id": doc_id} for page, doc_id in zip(pages, doc_ids, strict=True) ) - for item in pool.map(self.extract_data, args, progress_bar=True): + for item in pool.map( + self.extract_data, args, progress_bar=True, iterable_len=len(pages) + ): extracted_data.append(item) return extracted_data def extract_data_from_pages_to_df( - self, pages: Iterable[str], doc_ids: Iterable[str] + self, pages: Iterable[str], doc_ids: Iterable[str], n_jobs: int | None = None ) -> pd.DataFrame: - extracted_data = self.extract_data_from_pages(pages, doc_ids) + extracted_data = self.extract_data_from_pages(pages, doc_ids, n_jobs) return pd.DataFrame(extracted_data, columns=ORDER) def extract_data(self, page: str, doc_id: str) -> dict[str, Any]: @@ -137,12 +149,7 @@ def _extract_table(self, soup: BeautifulSoup) -> dict[str, Any]: ) elif "Sygn. powiązane" in label_text: extracted_data[FIELD_MAP["Sygn. powiązane"]] = self._extract_related(value) - elif "
" in value_text or label_text in ( - "Hasła tematyczne", - "Symbol z opisem", - "Sędziowie", - "Treść wyniku", - ): + elif "
" in value_text or label_text in LIST_TYPE_FIELDS: extracted_data |= self._extract_fields_with_br(label_text, value_text) elif "Data orzeczenia" in label_text: extracted_data |= self._extract_date_finality(value) diff --git a/scripts/nsa/extract_data_from_pages.py b/scripts/nsa/extract_data_from_pages.py new file mode 100644 index 0000000..6d66f7e --- /dev/null +++ b/scripts/nsa/extract_data_from_pages.py @@ -0,0 +1,35 @@ +import pandas as pd +from tqdm import tqdm + +from juddges.settings import NSA_DATA_PATH +from juddges.data.nsa.extractor import NSADataExtractor + +extractor = NSADataExtractor() + + +OUTPUT_PATH = NSA_DATA_PATH / "dataset" +OUTPUT_PATH.mkdir(exist_ok=True, parents=True) + +N_JOBS = 10 + +files = list(sorted(NSA_DATA_PATH.glob("pages/pages_chunk_*.parquet"))) + +for path in tqdm(files): + print(f"Extracting data from {path}") + df = pd.read_parquet(path) + data = extractor.extract_data_from_pages_to_df(df["page"], df["doc_id"], n_jobs=N_JOBS) + data.to_parquet(OUTPUT_PATH / f"data_{path.stem.split('_')[-1]}.parquet") + del df + del data + +# # Define the batch size +# batch_size = 10000 +# +# # Open the Parquet file +# parquet_file = pq.ParquetFile(NSA_DATA_PATH / "pages" / "pages_chunk_0.parquet") +# +# # Iterate over the file in batches +# for batch in parquet_file.iter_batches(batch_size=batch_size): +# df = batch.to_pandas() +# # Process each batch of data as a Pandas DataFrame +# print(df.head()) # Example: just print the first few rows