Skip to content

Commit

Permalink
Make it possible to disable the progress bar all the way up
Browse files Browse the repository at this point in the history
  • Loading branch information
rlskoeser committed Mar 28, 2024
1 parent 7dfd911 commit 94aedc8
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 7 deletions.
29 changes: 25 additions & 4 deletions src/corppa/utils/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Utility for filtering PPA full-text corpus to work with a subset of
pages. Currently supports filtering by a list of PPA source ids.
Currently, there is no way to filter to a specific excerpt when
there are multiple.
there are multiple excerpts from a single source.
Can be run via command-line or python code. Takes jsonl file (compressed or
not) as input, a filename for output, and a file with a list of
Expand All @@ -18,6 +18,7 @@
"""

import argparse
import os.path

import orjsonl
from tqdm import tqdm
Expand Down Expand Up @@ -53,7 +54,9 @@ def filter_pages(input_filename, source_ids, disable_progress=False):
# based on HathiTrust page tags like UNTYPICAL_PAGE or text content


def save_filtered_corpus(input_filename, output_filename, idfile):
def save_filtered_corpus(
input_filename, output_filename, idfile, disable_progress=False
):
"""Takes a filename for input PPA full-text corpus in a format
orjsonl supports, filename where filtered corpus should be saved,
and a filename with a list of source ids, one id per line.
Expand All @@ -63,7 +66,10 @@ def save_filtered_corpus(input_filename, output_filename, idfile):
source_ids = [line.strip() for line in idfile_content]

# use orjsonl to stream filtered pages to specified output file
orjsonl.save(output_filename, filter_pages(input_filename, source_ids))
orjsonl.save(
output_filename,
filter_pages(input_filename, source_ids, disable_progress=disable_progress),
)


def main():
Expand All @@ -80,9 +86,24 @@ def main():
"output", help="filename where the filtered corpus should be saved"
)
parser.add_argument("idfile", help="filename with list of source ids, one per line")
parser.add_argument(
"--progress",
help="Show progress",
action=argparse.BooleanOptionalAction,
default=True,
)

args = parser.parse_args()
save_filtered_corpus(args.input, args.output, args.idfile)
# progress bar is enabled by default; disable if requested
disable_progress = not args.progress

output_filename = args.output
# if requested output filename has no extension, add jsonl
if os.path.splitext(output_filename)[1] == "":
output_filename = f"{output_filename}.jsonl"
save_filtered_corpus(
args.input, output_filename, args.idfile, disable_progress=disable_progress
)


if __name__ == "__main__":
Expand Down
61 changes: 58 additions & 3 deletions test/test_utils/test_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import pytest

from corppa.utils.filter import filter_pages, save_filtered_corpus
from corppa.utils.filter import filter_pages, save_filtered_corpus, main

# minimal/mock page data fixture for testing
fixture_page_data = [
Expand Down Expand Up @@ -44,12 +44,29 @@ def test_filter_pages_progressbar(mock_orjsonl, mock_tqdm, corpus_file):
mock_tqdm.assert_called_with(
mock_orjsonl.stream.return_value,
desc="Filtering",
bar_format="{desc}: {n:,} pages{postfix} | elapsed: {elapsed}",
bar_format="{desc}: checked {n:,} pages{postfix} | elapsed: {elapsed}",
disable=False,
)
mock_tqdm.return_value.set_postfix_str.assert_any_call("selected 1")


@patch("corppa.utils.filter.tqdm")
@patch("corppa.utils.filter.orjsonl")
def test_filter_pages_noprogressbar(mock_orjsonl, mock_tqdm, corpus_file):
    """Check that disable_progress=True is passed through to tqdm as disable=True."""
    corpus_path = str(corpus_file)
    # iterating the mocked tqdm object yields the fixture page data
    mock_tqdm.return_value.__iter__.return_value = fixture_page_data

    # filter_pages is consumed as a generator; exhaust it so that the
    # mocked calls checked below are actually made
    for _ in filter_pages(corpus_path, ["foo"], disable_progress=True):
        pass

    # the corpus should be streamed from the requested input file
    mock_orjsonl.stream.assert_called_with(corpus_path)

    # tqdm should wrap the orjsonl stream with the usual settings,
    # except that the progress bar is disabled
    expected_tqdm_options = {
        "desc": "Filtering",
        "bar_format": "{desc}: checked {n:,} pages{postfix} | elapsed: {elapsed}",
        "disable": True,
    }
    mock_tqdm.assert_called_with(
        mock_orjsonl.stream.return_value, **expected_tqdm_options
    )


@patch("corppa.utils.filter.filter_pages")
@patch("corppa.utils.filter.orjsonl")
def test_save_filtered_corpus(mock_orjsonl, mock_filter_pages, tmpdir):
Expand All @@ -61,8 +78,46 @@ def test_save_filtered_corpus(mock_orjsonl, mock_filter_pages, tmpdir):

save_filtered_corpus(input_filename, output_filename, str(idfile))
# filter should be called with input file and list of ids from text file
mock_filter_pages.assert_called_with(input_filename, ids)
mock_filter_pages.assert_called_with(input_filename, ids, disable_progress=False)
# should save result to specified output filename
mock_orjsonl.save.assert_called_with(
output_filename, mock_filter_pages.return_value
)


@pytest.mark.parametrize(
    "cli_args, call_params",
    [
        # case 1: only the required arguments; progress bar stays enabled
        (
            ["filter.py", "pages.json", "subset.jsonl", "id.txt"],
            (("pages.json", "subset.jsonl", "id.txt"), {"disable_progress": False}),
        ),
        # case 2: --no-progress turns the progress bar off
        (
            [
                "filter.py",
                "pages.json.bz2",
                "subset.jsonl.gz",
                "id.txt",
                "--no-progress",
            ],
            (
                ("pages.json.bz2", "subset.jsonl.gz", "id.txt"),
                {"disable_progress": True},
            ),
        ),
        # case 3: output filename without an extension gets .jsonl appended
        (
            ["filter.py", "pages.json", "subset", "id.txt"],
            (("pages.json", "subset.jsonl", "id.txt"), {"disable_progress": False}),
        ),
    ],
)
@patch("corppa.utils.filter.save_filtered_corpus")
def test_main(mock_save_filtered_corpus, cli_args, call_params):
    """Check that main() turns command-line arguments into the expected
    save_filtered_corpus call.

    Each case pairs a simulated ``sys.argv`` with the positional and
    keyword arguments that save_filtered_corpus should receive.
    """
    expected_args, expected_kwargs = call_params

    # swap in the simulated command line for argparse to parse
    with patch("sys.argv", cli_args):
        main()

    mock_save_filtered_corpus.assert_called_with(*expected_args, **expected_kwargs)

0 comments on commit 94aedc8

Please sign in to comment.