Skip to content

Commit

Permalink
Merge pull request #279 from xxyzz/bz2
Browse files (browse the repository at this point in the history)
Use bz2 Python library if `lbzcat` and `bzcat` are not installed
  • Loading branch information
xxyzz authored Apr 29, 2024
2 parents 10bfffb + ae9d945 commit c9440ce
Showing 1 changed file with 9 additions and 3 deletions.
12 changes: 9 additions & 3 deletions src/wikitextprocessor/dumpparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#
# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org

import bz2
import hashlib
import json
import os
Expand All @@ -10,7 +11,7 @@
import sys
import unicodedata
from pathlib import Path
from typing import TYPE_CHECKING, Optional
from typing import TYPE_CHECKING, Optional, Union

if TYPE_CHECKING:
from .core import Wtp
Expand All @@ -19,8 +20,13 @@
from .logging_utils import logger


def decompress_dump_file(dump_path: str) -> subprocess.Popen:
def decompress_dump_file(
dump_path: str,
) -> Union[subprocess.Popen, bz2.BZ2File]:
if dump_path.endswith(".bz2"):
if shutil.which("lbzcat") is None and shutil.which("bzcat") is None:
return bz2.open(dump_path, "rb")

decompress_command = (
"lbzcat" if shutil.which("lbzcat") is not None else "bzcat"
)
Expand All @@ -43,7 +49,7 @@ def parse_dump_xml(wtp: "Wtp", dump_path: str, namespace_ids: set[int]) -> None:
namespaces = {None: namespace_str}
page_nums = 0
for _, page_element in etree.iterparse(
p.stdout, # type: ignore
p.stdout if isinstance(p, subprocess.Popen) else p, # type: ignore
tag=f"{{{namespace_str}}}page",
):
title = page_element.findtext("title", "", namespaces)
Expand Down

0 comments on commit c9440ce

Please sign in to comment.