Skip to content

Commit

Permalink
Switch to libzim instead of scraperlib and move code outside of commo…
Browse files Browse the repository at this point in the history
…n since only taskmanager has proper dependencies
  • Loading branch information
benoit74 committed Feb 29, 2024
1 parent 28306d6 commit ff10326
Show file tree
Hide file tree
Showing 5 changed files with 123 additions and 32 deletions.
26 changes: 0 additions & 26 deletions workers/app/common/zim.py

This file was deleted.

2 changes: 1 addition & 1 deletion workers/app/task/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
)
from common.utils import format_key, format_size
from common.worker import BaseWorker
from common.zim import get_zim_info
from task.zim import get_zim_info

SLEEP_INTERVAL = 60 # nb of seconds to sleep before watching
PENDING = "pending"
Expand Down
121 changes: 121 additions & 0 deletions workers/app/task/zim.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
from __future__ import annotations

import base64
import io
import pathlib
from collections import namedtuple
from typing import Any, Dict, Optional

from libzim import Archive


def get_zim_info(fpath: pathlib.Path) -> Dict[str, Any]:
    """Collect identifying and descriptive information about a ZIM file.

    Opens ``fpath`` as a libzim ``Archive`` and returns a dict with:

    - ``id``: the archive UUID, as a string
    - ``counter``: the parsed ``Counter`` metadata (see ``counters``)
    - ``article_count`` / ``media_count``: as reported by the archive
    - ``size``: size of the file on disk, in bytes
    - ``metadata``: every text metadata decoded to ``str``, plus every
      illustration as a base64-encoded ASCII string stored under
      ``Illustration_{size}x{size}``
    """
    zim = Archive(fpath)
    payload: Dict[str, Any] = {
        "id": str(zim.uuid),
        "counter": counters(zim),
        "article_count": zim.article_count,
        "media_count": zim.media_count,
        "size": fpath.stat().st_size,
    }

    # Text metadata first; illustration entries are binary, so they are
    # skipped here and added from the illustration API below.
    metadata: Dict[str, Any] = {}
    for key in zim.metadata_keys:
        if key.startswith("Illustration_"):
            continue
        metadata[key] = get_text_metadata(zim, key)
    payload["metadata"] = metadata

    for size in zim.get_illustration_sizes():
        raw_image = zim.get_illustration_item(size).content
        encoded = base64.standard_b64encode(raw_image).decode("ASCII")
        metadata[f"Illustration_{size}x{size}"] = encoded

    return payload


# Code below is duplicated from python-scraperlib, in order to depend only on
# python-libzim in the task manager, and not the whole python-scraperlib and all its
# dependencies

# A single (MIME type, number of entries) pair parsed from the Counter metadata.
MimetypeAndCounter = namedtuple("MimetypeAndCounter", ["mimetype", "value"])

# Mapping of MIME type -> number of entries.
# `type(MimetypeAndCounter.mimetype)` is the type of the namedtuple field
# descriptor, not `str`/`int`, so the key and value types are spelled out.
CounterMap = Dict[str, int]


def get_text_metadata(zim: Archive, name: str) -> str:
    """Return the metadata entry ``name`` of ``zim`` decoded as UTF-8 text."""
    raw_value = zim.get_metadata(name)
    return raw_value.decode("UTF-8")


def getline(src: io.StringIO, delim: Optional[str] = None) -> tuple[bool, str]:
    """C++ stdlib getline() ~clone

    Reads `src` one character at a time until it finds `delim`, which is
    consumed but not included in the result.

    When `delim` is empty or None, the whole remainder of `src` is returned
    and EOF is reported.

    Returns whether `src` is EOF and the extracted string (delim excluded)"""
    if not delim:
        return True, src.read()

    chars = []
    char = src.read(1)
    # An empty read means the stream is exhausted.
    while char and char != delim:
        chars.append(char)
        char = src.read(1)
    # EOF only if the loop stopped on exhaustion rather than on the delimiter.
    return char == "", "".join(chars)


def counters(zim: Archive) -> dict[str, int]:
    """Return the MIME type -> count mapping from the ZIM ``Counter`` metadata.

    An empty dict is returned when reading or parsing the metadata raises
    ``RuntimeError`` (presumably because the archive has no ``Counter``
    entry; verify against libzim).
    """
    try:
        raw_counter = get_text_metadata(zim, "Counter")
        parsed = parseMimetypeCounter(raw_counter)
    except RuntimeError:  # pragma: no cover (no ZIM available to test it)
        return {}  # pragma: no cover
    return parsed


def readFullMimetypeAndCounterString(
    src: io.StringIO,
) -> tuple[bool, str]:
    """Read a single mimetype-and-counter string from source.

    Entries are separated by ";". When the first chunk contains no "=",
    further chunks are read until one containing exactly two "=" is found
    (it is then appended, with its ";", to the first chunk) or the source
    is exhausted.

    Returns whether the source is EOF and the extracted string (or empty one)"""
    eof, entry = getline(src, ";")  # pyright: ignore
    if "=" in entry:
        return eof, entry

    while True:
        eof, chunk = getline(src, ";")  # pyright: ignore
        if chunk.count("=") == 2:  # noqa: PLR2004
            # Only a chunk with exactly two "=" completes the entry.
            entry = entry + ";" + chunk
            break
        if eof:
            break
    return eof, entry


def parseASingleMimetypeCounter(string: str) -> MimetypeAndCounter:
    """MimetypeAndCounter from a single mimetype-and-counter string

    The string is split on its last "=": what precedes it is the MIME type,
    what follows is the count.

    Returns ``MimetypeAndCounter("", 0)`` when the string has no "=", when
    nothing follows the last "=", or when the count is not castable to int.
    """
    mime_type, separator, counter = string.rpartition("=")
    # Require a separator: without this check a bare number such as "123"
    # was sliced around index -1 and parsed as mimetype "12" with count 123.
    if separator and counter:
        try:
            return MimetypeAndCounter(mime_type, int(counter))
        except ValueError:
            pass  # value is not castable to int
    return MimetypeAndCounter("", 0)


def parseMimetypeCounter(
    counterData: str,
) -> CounterMap:
    """Mapping of MIME types with count for each from ZIM Counter metadata string

    Entries are read one at a time from ``counterData``; entries that parse
    to an empty MIME type are skipped. If a MIME type appears more than once,
    the last value wins.
    """
    mapping = {}
    with io.StringIO(counterData) as stream:
        exhausted = False
        while not exhausted:
            exhausted, entry = readFullMimetypeAndCounterString(stream)
            parsed = parseASingleMimetypeCounter(entry)
            if parsed.mimetype:
                mapping[parsed.mimetype] = parsed.value
    return mapping
4 changes: 0 additions & 4 deletions workers/task-Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,6 @@ FROM python:3.12-slim-bookworm
LABEL zimfarm=true
LABEL org.opencontainers.image.source https://github.com/openzim/zimfarm

# install libmagic for zimscraperlib
RUN apt-get update -y \
&& apt-get install -y --no-install-recommends libmagic1

WORKDIR /usr/src

COPY task-requirements.txt requirements.txt
Expand Down
2 changes: 1 addition & 1 deletion workers/task-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@ humanfriendly==10.0
PyJWT==2.8.0
kiwixstorage==0.6
ujson==5.9.0
zimscraperlib==3.3.0
libzim==3.4.0
paramiko==2.11.0

0 comments on commit ff10326

Please sign in to comment.