Skip to content

Commit

Permalink
More error handling and tests for files that should / should not exist
Browse files Browse the repository at this point in the history
- idfile should exist and not be empty
- output file should not exist
  • Loading branch information
rlskoeser committed Apr 2, 2024
1 parent 7914742 commit 95c62e9
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 2 deletions.
16 changes: 15 additions & 1 deletion src/corppa/utils/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,10 +123,24 @@ def main():
# progress bar is enabled by default; disable if requested
disable_progress = not args.progress

output_filename = args.output
if not os.path.exists(args.idfile):
print(f"Error: idfile {args.idfile} does not exist")
exit(-1)
elif os.path.getsize(args.idfile) == 0:
print(f"Error: idfile {args.idfile} is empty")
exit(-1)

# if requested output filename has no extension, add jsonl
output_filename = args.output
if os.path.splitext(output_filename)[1] == "":
output_filename = f"{output_filename}.jsonl"

if os.path.exists(output_filename):
print(
f"Error: requested output file {args.output} already exists; not overwriting"
)
exit(-1)

try:
save_filtered_corpus(
args.input, args.idfile, output_filename, disable_progress=disable_progress
Expand Down
44 changes: 43 additions & 1 deletion test/test_utils/test_filter.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import os
from unittest.mock import patch

import pytest
Expand Down Expand Up @@ -118,9 +119,50 @@ def test_save_filtered_corpus(mock_orjsonl, mock_filter_pages, tmpdir):
],
)
@patch("corppa.utils.filter.save_filtered_corpus")
def test_main(mock_save_filtered_corpus, cli_args, call_params):
def test_main(mock_save_filtered_corpus, cli_args, call_params, tmp_path):
# change to temp directory, make sure id file exists and is non-zero
os.chdir(tmp_path)
idfile = tmp_path / cli_args[2]
idfile.write_text("id1\nid2")

# patch in test args for argparse to parse
with patch("sys.argv", cli_args):
main()
args, kwargs = call_params
mock_save_filtered_corpus.assert_called_with(*args, **kwargs)


@patch("corppa.utils.filter.save_filtered_corpus")
@patch("corppa.utils.filter.exit")
def test_main_idfile_nonexistent(mock_exit, mock_save_filtered_corpus, capsys):
    """main() prints an error and exits with -1 when the id file is missing."""
    # exit is patched, so main() does not actually terminate the test run;
    # save_filtered_corpus is patched too, so the real filter is never invoked
    argv = ["f.py", "foo.jsonl", "/not/a/real/id.txt", "out.jsonl"]
    with patch("sys.argv", argv):
        main()

    mock_exit.assert_called_with(-1)
    stdout = capsys.readouterr().out
    assert "does not exist" in stdout


@patch("corppa.utils.filter.save_filtered_corpus")
@patch("corppa.utils.filter.exit")
def test_main_idfile_empty(mock_exit, mock_save_filtered_corpus, capsys, tmp_path):
    """main() prints an error and exits with -1 when the id file is empty."""
    # create a zero-byte id file inside the pytest temporary directory
    empty_idfile = tmp_path / "id.txt"
    empty_idfile.touch()

    # exit and save_filtered_corpus are both patched for the duration of the test
    argv = ["f.py", "foo.jsonl", str(empty_idfile), "out.jsonl"]
    with patch("sys.argv", argv):
        main()

    mock_exit.assert_called_with(-1)
    stdout = capsys.readouterr().out
    assert "is empty" in stdout


@patch("corppa.utils.filter.save_filtered_corpus")
@patch("corppa.utils.filter.exit")
def test_main_outfile_exists(mock_exit, mock_save_filtered_corpus, capsys, tmp_path):
    """main() prints an error and exits with -1 when the output file exists."""
    # a non-empty id file, so the id file checks are not what triggers the exit
    ids_path = tmp_path / "id.txt"
    ids_path.write_text("id1\nid2")

    # pre-create the requested output file
    existing_output = tmp_path / "subset.jsonl"
    existing_output.touch()

    argv = ["f.py", "foo.jsonl", str(ids_path), str(existing_output)]
    with patch("sys.argv", argv):
        main()

    mock_exit.assert_called_with(-1)
    stdout = capsys.readouterr().out
    assert "already exists" in stdout

0 comments on commit 95c62e9

Please sign in to comment.