diff --git a/.github/workflows/pyinstaller.yml b/.github/workflows/pyinstaller.yml new file mode 100644 index 0000000..e4707a9 --- /dev/null +++ b/.github/workflows/pyinstaller.yml @@ -0,0 +1,48 @@ +name: Build executables + +on: + push: + tags: + - 'v*' + +jobs: + build: + name: Build executables + + strategy: + matrix: + runs-on: ['ubuntu-20.04', 'macos-11', 'windows-2019'] + runs-on: ${{ matrix.runs-on }} + + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.7 + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install pyinstaller + + - name: Build executable with PyInstaller + run: | + pyinstaller --onefile antenati_gui.py + + - name: Upload artifact for Windows + if: runner.os == 'Windows' + uses: actions/upload-artifact@v3 + with: + name: antenati_gui_windows.exe + path: dist/antenati_gui.exe + + - name: Upload artifact for macOS and Ubuntu + if: runner.os != 'Windows' + uses: actions/upload-artifact@v3 + with: + name: antenati_gui_${{ matrix.runs-on }} + path: dist/antenati_gui diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml index c00180a..42fe875 100644 --- a/.github/workflows/pythonapp.yml +++ b/.github/workflows/pythonapp.yml @@ -4,20 +4,28 @@ on: [push] jobs: test: + name: Test + strategy: matrix: runs-on: ['ubuntu-20.04', 'macos-11', 'windows-2019'] + runs-on: ${{ matrix.runs-on }} + steps: - - uses: actions/checkout@v2 + - name: Checkout code + uses: actions/checkout@v2 + - name: Set up Python 3.7 uses: actions/setup-python@v2 with: python-version: 3.7 + - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -r requirements.txt + pip install -r requirements.txt + - name: Download test run: | python antenati.py "https://www.antenati.san.beniculturali.it/ark:/12657/an_ua19944535/w9DWR8x" diff --git a/PRINCIPIANTI.md 
b/PRINCIPIANTI.md index 49bca58..53073e4 100644 --- a/PRINCIPIANTI.md +++ b/PRINCIPIANTI.md @@ -1,39 +1,7 @@ # Istruzioni per principianti -## Windows -### Installare Python -Occorre Python 3 almeno alla versione 3.6. Il modo più veloce è passare dal Microsoft Store. Potete aprirlo e cercare "Python 3.10", o per semplicità [cliccare qui](https://www.microsoft.com/it-it/p/python-310/9pjpw5ldxlz5). +Scaricate la versione GUI dagli artefatti dell'ultima release. Trovate le versioni per Windows, Linux e macOS. -### Scaricare questo repository -Potete scaricare il contenuto di questo repository da [qui](https://github.com/gcerretani/antenati/archive/refs/heads/master.zip). Estraetene il contenuto, che dovrebbe chiamarsi **antenati-master**, da qualche parte, per esempio nella cartella dei Documenti. - -### Aprire un terminale -Aprite un terminale. La PowerShell è la soluzione più semplice e moderna: cercate "Windows PowerShell" dal menu start ed apritela. Per cambiare la cartella di lavoro a quella dove avete scaricato il contenuto di questo repository, eseguite: - - cd $env:HOMEPATH\Documents\antenati-master - -Controllate di essere nella cartella giusta. Eseguite: - - ls - -e guardate che ci sia il contenuto di questo repository. - -### Installare le dipendenze -Quindi, eseguite: - - pip install -r requirements.txt - -Dovrebbe impiegare qualche secondo. Questa cosa va fatta solamente la prima volta, e serve a installare le dipendenze di questo programma. Le volte successive potete saltare questo passaggio - -### Via! -Adesso siete pronti. Provate a scaricare un album copiando l'URL della pagina del Portale Antenati dopo a `python3 antenati.py`. Supponendo che siate interessati ai nati a Viareggio nel 1808, dovreste eseguire una cosa del genere: - - python3 antenati.py https://antenati.cultura.gov.it/ark:/12657/an_ua19944535/w9DWR8x - -Buon divertimento! - -## Linux -TODO - -## macos -TODO +0. Lanciate l'eseguibile! +1. 
Come URL inserite qualcosa tipo https://antenati.cultura.gov.it/ark:/12657/an_ua19944535/w9DWR8x. +2. Poi selezionate una cartella di destinazione. Il programma scaricherà il contenuto in una sottocartella con un nome tipo *archivio-di-stato-di-lucca-stato-civile-napoleonico-viareggio-1807-nati-19944549*. diff --git a/README.md b/README.md index fb2b7b9..76753b8 100644 --- a/README.md +++ b/README.md @@ -3,18 +3,32 @@ A tool to download data from the *[Portale Antenati](http://antenati.cultura.gov Since the website tends to be pretty slow in the evening, we present a script to help the retrieval of the documents for your family tree. The script allows you to download **all the images of any archive at the same time**, without any human action. Just launch the script and have a coffee while it downloads all the stuff for you. -## Requirements +## GUI version + +Just get the executable from the release artifacts, and have fun! + +#### Example: +In the website, navigate to the archive you want to download. For example, for the people born in Viareggio in 1807 you should find the page: + +[https://antenati.cultura.gov.it/ark:/12657/an_ua19944535/w9DWR8x](https://antenati.cultura.gov.it/ark:/12657/an_ua19944535/w9DWR8x) + +Then, copy the link to the first page, and paste it in the Archive URL field of the window. Then, specify a destination folder: +the results will be placed there, in a new subfolder named *archivio-di-stato-di-lucca-stato-civile-napoleonico-viareggio-1807-nati-19944549*. + +## CLI version + +### Requirements The software is written in Python 3 and tested with Python 3.7. On Windows the version on the Microsoft Store is fine, on Linux use your distribution package manager. -## Usage +### Usage Open your preferite terminal and change directory to where you've extracted the content of this repo. Then execute the following commands. 
-### Install the dependencies +#### Install the dependencies The first time you will have to install the dependencies: pip install -r requirements.txt -### Run +#### Run To download the images of a gallery, execute the script passing the URL of a collection you want to download as argument: python3 antenati.py @@ -23,7 +37,7 @@ The files will be downloaded to a new folder named as *ARCHIVE-PLACE-YEAR-TYPE-I python3 antenati.py -h -### Example: +#### Example: In the website, navigate to the archive you want to download. For example, for the people born in Viareggio in 1807 you should find the page: [https://antenati.cultura.gov.it/ark:/12657/an_ua19944535/w9DWR8x](https://antenati.cultura.gov.it/ark:/12657/an_ua19944535/w9DWR8x) diff --git a/antenati.py b/antenati.py index 7ac4b07..dc582d1 100755 --- a/antenati.py +++ b/antenati.py @@ -6,25 +6,40 @@ __author__ = 'Giovanni Cerretani' __copyright__ = 'Copyright (c) 2022, Giovanni Cerretani' __license__ = 'MIT License' -__version__ = '2.5' +__version__ = '3.0' +__contact__ = 'https://gcerretani.github.io/antenati/' from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass from email.message import EmailMessage from json import loads from mimetypes import guess_extension from os import chdir, mkdir, path +from pathlib import Path from re import findall, search -from typing import Any, Dict, List +from typing import Any, Callable, Dict, List, Optional from certifi import where -from urllib3 import HTTPResponse, HTTPSConnectionPool, PoolManager, make_headers +from urllib3 import HTTPSConnectionPool, PoolManager, make_headers from click import echo, confirm from slugify import slugify from humanize import naturalsize from tqdm import tqdm +_UpdaterType = Optional[Callable[[], None]] + +@dataclass +class ProgressBar: + set_total: Callable[[int], None] + update: Callable[[], None] + + +DEFAULT_N_THREADS: int = 8 
+DEFAULT_N_CONNECTIONS: int = 4 + + class AntenatiDownloader: """Downloader class""" @@ -32,9 +47,8 @@ class AntenatiDownloader: archive_id: str manifest: Dict[str, Any] canvases: List[Dict[str, Any]] - dirname: str + dirname: Path gallery_length: int - gallery_size: int def __init__(self, url: str, first: int, last: int): self.url = url @@ -43,7 +57,6 @@ def __init__(self, url: str, first: int, last: int): self.canvases = self.manifest['sequences'][0]['canvases'][first:last] self.dirname = self.__generate_dirname() self.gallery_length = len(self.canvases) - self.gallery_size = 0 @staticmethod def __http_headers() -> Dict[str, Any]: @@ -87,7 +100,7 @@ def __get_iiif_manifest(url: str) -> Dict[str, Any]: cert_reqs='CERT_REQUIRED', ca_certs=where() ) - http_reply: HTTPResponse = pool.request('GET', url) + http_reply = pool.request('GET', url) if http_reply.status != 200: raise RuntimeError(f'{url}: HTTP error {http_reply.status}') content_type = AntenatiDownloader.__parse_header(http_reply.headers['Content-Type']) @@ -112,12 +125,12 @@ def __get_metadata_content(self, label: str) -> str: except StopIteration as exc: raise RuntimeError(f'Cannot get {label} from manifest') from exc - def __generate_dirname(self) -> str: + def __generate_dirname(self) -> Path: """Generate directory name from info in IIIF manifest""" archive_context = self.__get_metadata_content('Contesto archivistico') archive_year = self.__get_metadata_content('Titolo') archive_typology = self.__get_metadata_content('Tipologia') - return slugify(f'{archive_context}-{archive_year}-{archive_typology}-{self.archive_id}') + return Path(slugify(f'{archive_context}-{archive_year}-{archive_typology}-{self.archive_id}')) def print_gallery_info(self) -> None: """Print IIIF gallery info""" @@ -127,11 +140,16 @@ def print_gallery_info(self) -> None: print(f'{label:<25}{value}') print(f'{self.gallery_length} images found.') - def check_dir(self) -> None: + def check_dir(self, dirname: Optional[str] = None, 
interactive: bool = True) -> None: """Check if directory already exists and chdir to it""" + if dirname is not None: + self.dirname = Path(dirname) / self.dirname print(f'Output directory: {self.dirname}') if path.exists(self.dirname): - echo(f'Directory {self.dirname} already exists.') + msg = f'Directory {self.dirname} already exists.' + if not interactive: + raise RuntimeError(msg) + echo(msg) confirm('Do you want to proceed?', abort=True) else: mkdir(self.dirname) @@ -140,7 +158,7 @@ def check_dir(self) -> None: @staticmethod def __thread_main(pool: HTTPSConnectionPool, canvas: Dict[str, Any]) -> int: url = canvas['images'][0]['resource']['@id'] - http_reply: HTTPResponse = pool.request('GET', url) + http_reply = pool.request('GET', url) if http_reply.status != 200: raise RuntimeError(f'{url}: HTTP error {http_reply.status}') content_type = AntenatiDownloader.__parse_header(http_reply.headers['Content-Type']) @@ -169,29 +187,23 @@ def __pool(maxsize: int) -> HTTPSConnectionPool: ca_certs=where() ) - @staticmethod - def __progress(total: int) -> tqdm: - return tqdm(total=total, unit='img') + def run_cli(self, n_workers: int, n_connections: int) -> int: + """Main function spanning run function in a thread pool, with tqdm progress bar""" + with tqdm(unit='img') as progress: + progress_bar = ProgressBar(progress.reset, progress.update) + return self.run(n_workers, n_connections, progress_bar) - def run(self, n_workers: int, n_connections: int) -> None: + def run(self, n_workers: int, n_connections: int, progress: ProgressBar) -> int: """Main function spanning run function in a thread pool""" with self.__executor(n_workers) as executor, self.__pool(n_connections) as pool: future_img = {executor.submit(self.__thread_main, pool, i): i for i in self.canvases} - with self.__progress(self.gallery_length) as progress: - for future in as_completed(future_img): - progress.update() - canvas = future_img[future] - label = canvas['label'] - try: - size = future.result() - except 
RuntimeError as exc: - progress.write(f'{label} error ({exc})') - else: - self.gallery_size += size - - def print_summary(self) -> None: - """Print summary""" - print(f'Done. Total size: {naturalsize(self.gallery_size)}') + progress.set_total(self.gallery_length) + gallery_size = 0 + for future in as_completed(future_img): + progress.update() + size = future.result() + gallery_size += size + return gallery_size def main() -> None: @@ -204,8 +216,8 @@ def main() -> None: formatter_class=ArgumentDefaultsHelpFormatter ) parser.add_argument('url', metavar='URL', type=str, help='url of the gallery page') - parser.add_argument('-n', '--nthreads', type=int, help='max n. of threads', default=8) - parser.add_argument('-c', '--nconn', type=int, help='max n. of connections', default=4) + parser.add_argument('-n', '--nthreads', type=int, help='max n. of threads', default=DEFAULT_N_THREADS) + parser.add_argument('-c', '--nconn', type=int, help='max n. of connections', default=DEFAULT_N_CONNECTIONS) parser.add_argument('-f', '--first', type=int, help='first image to download', default=0) parser.add_argument('-l', '--last', type=int, help='first image NOT to download', default=None) parser.add_argument('-v', '--version', action='version', version=__version__) @@ -221,10 +233,10 @@ def main() -> None: downloader.check_dir() # Run - downloader.run(args.nthreads, args.nconn) + gallery_size = downloader.run_cli(args.nthreads, args.nconn) # Print summary - downloader.print_summary() + print(f'Done. 
Total size: {naturalsize(gallery_size)}') if __name__ == '__main__': diff --git a/antenati_gui.py b/antenati_gui.py new file mode 100644 index 0000000..d74f3b8 --- /dev/null +++ b/antenati_gui.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 +""" +antenati_gui.py: a GUI tool to download data from the Portale Antenati +""" + +__author__ = 'Giovanni Cerretani' +__copyright__ = 'Copyright (c) 2022, Giovanni Cerretani' +__license__ = 'MIT License' +__contact__ = 'https://gcerretani.github.io/antenati/' + +from concurrent.futures import ThreadPoolExecutor +from contextlib import contextmanager +from dataclasses import dataclass, field +import tkinter as tk +import tkinter.filedialog as tkfile +import tkinter.messagebox as tkmsg +import tkinter.ttk as ttk +from webbrowser import open as webopen + +from humanize import naturalsize + +import antenati + + +@dataclass +class _ProgressBarSetter: + progress_bar: ttk.Progressbar + total: int = field(default = 0) + n: int = field(default = 0) + + def set_total(self, total: int) -> None: + """Set max value""" + self.total = total + + def __set(self, value: float) -> None: + """Set progress bar value""" + self.progress_bar['value'] = value + + def reset(self) -> None: + """Reset""" + self.total = 0 + self.n = 0 + self.__set(0) + + def update(self) -> None: + """Set progress bar value in main Tk thread""" + self.n += 1 + percent_completed = 100 * self.n / self.total if self.total > 0 else 0.0 + self.progress_bar.master.after(0, self.__set, percent_completed) + + +@dataclass +class _CompletedFlag: + _variable: tk.BooleanVar + + @contextmanager + def set_at_exit(self): + """To be used in a with-statement""" + try: + yield + finally: + self._variable.set(True) + + +class _Window: + def __init__(self, root: tk.Tk, title: str): + self.__root = root + self.__root.minsize(400, 100) + self.__root.title(title.strip()) + + # Create menu + self.__menu = tk.Menu(self.__root) + self.__root.configure(menu=self.__menu) + + # Populate entries + self.__create_menu() + 
self.__create_entries() + self.__create_footer() + + def __create_menu(self): + menu_file = tk.Menu(self.__menu, tearoff=0) + menu_file.add_command(label='Portale Antenati Website', command=lambda: webopen('https://antenati.cultura.gov.it/')) + menu_file.add_command(label='Project Website', command=lambda: webopen(__contact__)) + menu_file.add_separator() + menu_file.add_command(label='About', command=self.__about) + self.__menu.add_cascade(label='File', menu=menu_file) + + def __create_entries(self): + entry_frame = ttk.Frame(self.__root) + entry_frame.pack(side=tk.TOP, fill=tk.X) + url_label = tk.Label(entry_frame, text='Archive URL') + url_label.grid(row=0, column=0, padx=10, pady=5, sticky=tk.W) + self.__url_textvariable = tk.StringVar() + url_entry = ttk.Entry(entry_frame, textvariable=self.__url_textvariable, width=100) + url_entry.grid(row=0, column=1, padx=10, pady=5, columnspan=2, sticky=tk.EW) + self.__path_textvariable = tk.StringVar() + path_label = tk.Label(entry_frame, text='Destination folder') + path_label.grid(row=1, column=0, padx=10, pady=5, sticky=tk.EW) + path_entry = ttk.Entry(entry_frame, textvariable=self.__path_textvariable, width=100) + path_entry.grid(row=1, column=1, padx=10, pady=5, sticky=tk.EW) + browse_button = ttk.Button(entry_frame, text='Browse', command=self.__browse_path) + browse_button.grid(row=1, column=2, padx=10, pady=5, sticky=tk.EW) + self.__download_button = ttk.Button(entry_frame, text='Download', command=self.__download) + self.__download_button.grid(row=2, column=1, padx=5, pady=5) + download_button = ttk.Button(entry_frame, text='Support this project', command=lambda: webopen('https://ko-fi.com/gcerretani')) + download_button.grid(row=3, column=1, padx=5, pady=5) + + def __create_footer(self): + footer_frame = ttk.Frame(self.__root) + footer_frame.pack(side=tk.BOTTOM, fill=tk.X) + self.__footer_label = ttk.Label(footer_frame, anchor=tk.W) + self.__footer_label.grid(row=0, column=0, padx=2, pady=2, sticky=tk.EW) + 
self.__footer_led = ttk.Label(footer_frame, anchor=tk.CENTER, width=18) + self.__footer_led.grid(row=0, column=1, padx=2, pady=2, sticky=tk.EW) + footer_frame.columnconfigure(0, weight=1) + self.__progress_bar = ttk.Progressbar(self.__root, mode='determinate', orient=tk.HORIZONTAL) + self.__progress_bar.pack(side=tk.BOTTOM, fill=tk.BOTH, padx=2, pady=2) + + def __about(self) -> None: + """Show about popup""" + msg = f'{__doc__.strip()}' + msg += f'\n{antenati.__version__}' + msg += f'\n{__copyright__}' + tkmsg.showinfo('About', msg) + + def __browse_path(self): + selected_path = tkfile.askdirectory() + if selected_path: + self.__path_textvariable.set(selected_path) + + @contextmanager + def __wait_flag(self): + variable = tk.BooleanVar(value=False) + try: + yield _CompletedFlag(variable) + finally: + self.__root.wait_variable(variable) + + def __download(self): + url = self.__url_textvariable.get().strip() + if not url: + raise RuntimeError('Please enter a valid URL.') + path_value = self.__path_textvariable.get() + if not path_value: + raise RuntimeError('Please enter a valid destination folder.') + downloader = antenati.AntenatiDownloader(url, 0, None) + downloader.check_dir(path_value, False) + with ThreadPoolExecutor(max_workers=1) as exc, self.__progress_bar_setter() as pb, self.__in_progress(), self.__wait_flag() as flag: + def cmd(): + with flag.set_at_exit(): + progressbar = antenati.ProgressBar(pb.set_total, pb.update) + return downloader.run(antenati.DEFAULT_N_THREADS, antenati.DEFAULT_N_CONNECTIONS, progressbar) + future = exc.submit(cmd) + gallery_size = future.result() + tkmsg.showinfo('Success', f'Operation completed successfully. 
Total size: {naturalsize(gallery_size)}') + + def __set_footer_message(self, text: str) -> None: + """Set footer message""" + self.__footer_label.configure(text=text) + + @contextmanager + def __in_progress(self): + """Context manager to disable buttons""" + self.__download_button.configure(state=tk.DISABLED) + self.__footer_label.configure(text='Operation in progress...') + try: + yield + finally: + self.__set_footer_message('') + self.__footer_label.configure(text='') + self.__download_button.configure(state=tk.NORMAL) + + @contextmanager + def __progress_bar_setter(self): + """Context manager for progress bar""" + setter = _ProgressBarSetter(self.__progress_bar) + try: + yield setter + finally: + setter.reset() # Reset value + + +if __name__ == '__main__': + tk_root = tk.Tk() + def __callback_exception(_type, ex: BaseException, _traceback): + tkmsg.showerror('Error', f'{ex}') + tk_root.report_callback_exception = __callback_exception + app = _Window(tk_root, __doc__) + tk_root.mainloop()