Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
awdeorio committed Feb 15, 2022
2 parents 3270caf + e7f6205 commit 627d1c6
Show file tree
Hide file tree
Showing 12 changed files with 29 additions and 17 deletions.
9 changes: 1 addition & 8 deletions madoop/mapreduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def prepare_input_files(input_dir, output_dir):
st_size = inpath.stat().st_size
total_size += st_size
n_splits = math.ceil(st_size / MAX_INPUT_SPLIT_SIZE)
assert n_splits > 0
n_splits = 1 if not n_splits else n_splits # Handle empty input file
LOGGER.debug(
"input %s size=%sB partitions=%s", inpath, st_size, n_splits
)
Expand Down Expand Up @@ -278,13 +278,6 @@ def group_stage(input_dir, output_dir):
last_two(inpath), outparent.name, ",".join(outnames),
)

# Remove empty output files. We won't always use the maximum number of
# reducers because some MapReduce programs have fewer intermediate keys.
for path in sorted(output_dir.iterdir()):
if path.stat().st_size == 0:
LOGGER.debug("empty partition: rm %s", last_two(path))
path.unlink()

# Sort output files
for path in sorted(output_dir.iterdir()):
sort_file(path)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
description="A light weight MapReduce framework for education.",
long_description=LONG_DESCRIPTION,
long_description_content_type="text/markdown",
version="0.3.0",
version="0.4.0",
author="Andrew DeOrio",
author_email="[email protected]",
url="https://github.com/eecs485staff/madoop/",
Expand Down
15 changes: 15 additions & 0 deletions tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,18 @@ def test_missing_shebang(tmpdir):
map_exe=TESTDATA_DIR/"word_count/map.py",
reduce_exe=TESTDATA_DIR/"word_count/reduce_invalid.py",
)


def test_empty_inputs(tmpdir):
    """Check that a job whose inputs include an empty file does not fail.

    Runs the word count map and reduce executables over the
    ``word_count/input_empty`` test data, then hands the expected and
    actual output directories to ``utils.assert_dirs_eq``.
    """
    expected_dir = TESTDATA_DIR/"word_count/correct/output"
    actual_dir = tmpdir/"output"
    job_args = {
        "input_dir": TESTDATA_DIR/"word_count/input_empty",
        "output_dir": "output",
        "map_exe": TESTDATA_DIR/"word_count/map.py",
        "reduce_exe": TESTDATA_DIR/"word_count/reduce.py",
    }

    # "output" is a relative path, so run the job with tmpdir as the
    # working directory.
    with tmpdir.as_cwd():
        madoop.mapreduce(**job_args)

    utils.assert_dirs_eq(expected_dir, actual_dir)
Empty file.
3 changes: 0 additions & 3 deletions tests/testdata/word_count/correct/output/part-00001
Original file line number Diff line number Diff line change
@@ -1,3 +0,0 @@
Bye 1
Hadoop 2
World 2
4 changes: 3 additions & 1 deletion tests/testdata/word_count/correct/output/part-00002
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
Hello 2
Bye 1
Hadoop 2
World 2
1 change: 1 addition & 0 deletions tests/testdata/word_count/correct/output/part-00003
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hello 2
3 changes: 0 additions & 3 deletions tests/testdata/word_count/correct/reducer-output/part-00001
Original file line number Diff line number Diff line change
@@ -1,3 +0,0 @@
Bye 1
Hadoop 2
World 2
4 changes: 3 additions & 1 deletion tests/testdata/word_count/correct/reducer-output/part-00002
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
Hello 2
Bye 1
Hadoop 2
World 2
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hello 2
4 changes: 4 additions & 0 deletions tests/testdata/word_count/input_empty/input01.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Hello World
Bye World
Hello Hadoop
Goodbye Hadoop
Empty file.

0 comments on commit 627d1c6

Please sign in to comment.