Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
awdeorio committed Feb 10, 2022
2 parents b8cd5a2 + 6c6d7fe commit 3270caf
Show file tree
Hide file tree
Showing 26 changed files with 97 additions and 118 deletions.
3 changes: 1 addition & 2 deletions madoop/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,7 @@ def __call__(self, parser, *args, **kwargs):
src = madoop_dir/"example"
dst = pathlib.Path("example")
if dst.exists():
print(f"Error: directory already exists: {dst}")
parser.exit(1)
parser.error(f"directory already exists: {dst}")
shutil.copytree(src, dst)
print(textwrap.dedent(f"""\
Created {dst}, try:
Expand Down
89 changes: 45 additions & 44 deletions madoop/mapreduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""
import contextlib
import collections
import hashlib
import logging
import math
Expand Down Expand Up @@ -84,15 +85,15 @@ def mapreduce(input_dir, output_dir, map_exe, reduce_exe):
)

# Move files from temporary output dir to user-specified output dir
for filename in reduce_output_dir.glob("*"):
total_size = 0
for filename in sorted(reduce_output_dir.glob("*")):
st_size = filename.stat().st_size
total_size += st_size
shutil.copy(filename, output_dir)
output_path = output_dir.parent/last_two(filename)
LOGGER.debug("%s size=%sB", output_path, st_size)

# Remind user where to find output
total_size = 0
for outpath in sorted(output_dir.iterdir()):
st_size = outpath.stat().st_size
total_size += st_size
LOGGER.debug("%s size=%sB", outpath, st_size)
LOGGER.debug("total output size=%sB", total_size)
LOGGER.info("Output directory: %s", output_dir)

Expand Down Expand Up @@ -220,37 +221,25 @@ def keyhash(key):
return int(hexdigest, base=16)


def partition_keys(inpath, outpaths):
"""Allocate lines of inpath among outpaths using hash of key."""
def partition_keys(inpath, outpaths, input_keys_stats, output_keys_stats):
"""Allocate lines of inpath among outpaths using hash of key.
Update the data structures provided by the caller, input_keys_stats and
output_keys_stats. Both map a filename to a set of keys.
"""
assert len(outpaths) == MAX_NUM_REDUCE
outparent = outpaths[0].parent
assert all(i.parent == outparent for i in outpaths)
outnames = [i.name for i in outpaths]
LOGGER.debug(
"partition %s >> %s/{%s}",
last_two(inpath), outparent.name, ",".join(outnames),
)
with contextlib.ExitStack() as stack:
outfiles = [stack.enter_context(p.open("a")) for p in outpaths]
for line in stack.enter_context(inpath.open()):
key = line.partition('\t')[0]
input_keys_stats[inpath].add(key)
reducer_idx = keyhash(key) % MAX_NUM_REDUCE
outfiles[reducer_idx].write(line)


def keyspace(path):
"""Return the number of unique keys in {path}.
WARNING: This is a terribly slow implementation. It would be faster to
record this information while grouping.
"""
keys = set()
with path.open() as infile:
for line in infile:
key = line.partition('\t')[0]
keys.add(key)
return keys
outpath = outpaths[reducer_idx]
output_keys_stats[outpath].add(key)


def group_stage(input_dir, output_dir):
Expand All @@ -260,22 +249,34 @@ def group_stage(input_dir, output_dir):
using the hash and modulo of the key.
"""
# Detailed keyspace debug output THIS IS SLOW
all_keys = set()
for inpath in sorted(input_dir.iterdir()):
keys = keyspace(inpath)
all_keys.update(keys)
LOGGER.debug("%s unique_keys=%s", last_two(inpath), len(keys))
LOGGER.debug("%s all_unique_keys=%s", input_dir.name, len(all_keys))

# Compute output filenames
outpaths = []
for i in range(MAX_NUM_REDUCE):
outpaths.append(output_dir/part_filename(i))

# Partition input, appending to output files
# Track keyspace stats, map filename -> set of keys
input_keys_stats = collections.defaultdict(set)
output_keys_stats = collections.defaultdict(set)

# Partition input, appending to output files
for inpath in sorted(input_dir.iterdir()):
partition_keys(inpath, outpaths)
partition_keys(inpath, outpaths, input_keys_stats, output_keys_stats)

# Log input keyspace stats
all_input_keys = set()
for inpath, keys in sorted(input_keys_stats.items()):
all_input_keys.update(keys)
LOGGER.debug("%s unique_keys=%s", last_two(inpath), len(keys))
LOGGER.debug("%s all_unique_keys=%s", input_dir.name, len(all_input_keys))

# Log partition input and output filenames
outnames = [i.name for i in outpaths]
outparent = outpaths[0].parent
for inpath in sorted(input_keys_stats.keys()):
LOGGER.debug(
"partition %s >> %s/{%s}",
last_two(inpath), outparent.name, ",".join(outnames),
)

# Remove empty output files. We won't always use the maximum number of
# reducers because some MapReduce programs have fewer intermediate keys.
Expand All @@ -288,13 +289,13 @@ def group_stage(input_dir, output_dir):
for path in sorted(output_dir.iterdir()):
sort_file(path)

# Detailed keyspace debug output THIS IS SLOW
all_keys = set()
for outpath in sorted(output_dir.iterdir()):
keys = keyspace(outpath)
all_keys.update(keys)
# Log output keyspace stats
all_output_keys = set()
for outpath, keys in sorted(output_keys_stats.items()):
all_output_keys.update(keys)
LOGGER.debug("%s unique_keys=%s", last_two(outpath), len(keys))
LOGGER.debug("%s all_unique_keys=%s", output_dir.name, len(all_keys))
LOGGER.debug("%s all_unique_keys=%s", output_dir.name,
len(all_output_keys))


def reduce_stage(exe, input_dir, output_dir):
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
description="A light weight MapReduce framework for education.",
long_description=LONG_DESCRIPTION,
long_description_content_type="text/markdown",
version="0.2.0",
version="0.3.0",
author="Andrew DeOrio",
author_email="[email protected]",
url="https://github.com/eecs485staff/madoop/",
Expand Down
14 changes: 12 additions & 2 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,12 +91,22 @@ def test_hadoop_arguments(tmpdir):
def test_example(tmpdir):
"""Example option should copy files."""
with tmpdir.as_cwd():
subprocess.run(["madoop", "--example"], check=True)
subprocess.run(
["madoop", "--example"],
check=True,
stderr=subprocess.PIPE,
stdout=subprocess.PIPE,
)
assert (tmpdir/"example/input/input01.txt").exists()
assert (tmpdir/"example/input/input02.txt").exists()
assert (tmpdir/"example/map.py").exists()
assert (tmpdir/"example/reduce.py").exists()

# Call it again and it should refuse to clobber
with tmpdir.as_cwd(), pytest.raises(subprocess.CalledProcessError):
subprocess.run(["madoop", "--example"], check=True)
subprocess.run(
["madoop", "--example"],
check=True,
stderr=subprocess.PIPE,
stdout=subprocess.PIPE,
)
2 changes: 1 addition & 1 deletion tests/test_stages.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ def test_map_stage(tmpdir):
"""Test the map stage using word count example."""
map_stage(
exe=TESTDATA_DIR/"word_count/map.py",
input_dir=TESTDATA_DIR/"word_count/correct/input",
input_dir=TESTDATA_DIR/"word_count/input",
output_dir=Path(tmpdir),
)
utils.assert_dirs_eq(
Expand Down
7 changes: 1 addition & 6 deletions tests/testdata/word_count/correct/grouper-output/part-00000
Original file line number Diff line number Diff line change
@@ -1,6 +1 @@
cool 1
file 1
file 1
file 1
file 1
streaming 1
Goodbye 1
6 changes: 0 additions & 6 deletions tests/testdata/word_count/correct/grouper-output/part-00001

This file was deleted.

12 changes: 5 additions & 7 deletions tests/testdata/word_count/correct/grouper-output/part-00002
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
google 1
map 1
map 1
map 1
map 1
system 1
system 1
Bye 1
Hadoop 1
Hadoop 1
World 1
World 1
2 changes: 2 additions & 0 deletions tests/testdata/word_count/correct/grouper-output/part-00003
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Hello 1
Hello 1
2 changes: 0 additions & 2 deletions tests/testdata/word_count/correct/input/part-00000

This file was deleted.

3 changes: 0 additions & 3 deletions tests/testdata/word_count/correct/input/part-00001

This file was deleted.

6 changes: 0 additions & 6 deletions tests/testdata/word_count/correct/mapper-output/part-00000

This file was deleted.

17 changes: 4 additions & 13 deletions tests/testdata/word_count/correct/mapper-output/part-00001
Original file line number Diff line number Diff line change
@@ -1,13 +1,4 @@
hadoop 1
map 1
reduce 1
file 1
map 1
map 1
streaming 1
file 1
reduce 1
map 1
reduce 1
is 1
cool 1
Hello 1
World 1
Bye 1
World 1
4 changes: 4 additions & 0 deletions tests/testdata/word_count/correct/mapper-output/part-00002
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Hello 1
Hadoop 1
Goodbye 1
Hadoop 1
4 changes: 1 addition & 3 deletions tests/testdata/word_count/correct/output/part-00000
Original file line number Diff line number Diff line change
@@ -1,3 +1 @@
cool 1
file 4
streaming 1
Goodbye 1
6 changes: 3 additions & 3 deletions tests/testdata/word_count/correct/output/part-00001
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
hadoop 2
is 1
reduce 3
Bye 1
Hadoop 2
World 2
4 changes: 1 addition & 3 deletions tests/testdata/word_count/correct/output/part-00002
Original file line number Diff line number Diff line change
@@ -1,3 +1 @@
google 1
map 4
system 2
Hello 2
4 changes: 1 addition & 3 deletions tests/testdata/word_count/correct/reducer-output/part-00000
Original file line number Diff line number Diff line change
@@ -1,3 +1 @@
cool 1
file 4
streaming 1
Goodbye 1
6 changes: 3 additions & 3 deletions tests/testdata/word_count/correct/reducer-output/part-00001
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
hadoop 2
is 1
reduce 3
Bye 1
Hadoop 2
World 2
4 changes: 1 addition & 3 deletions tests/testdata/word_count/correct/reducer-output/part-00002
Original file line number Diff line number Diff line change
@@ -1,3 +1 @@
google 1
map 4
system 2
Hello 2
3 changes: 0 additions & 3 deletions tests/testdata/word_count/input/file01

This file was deleted.

2 changes: 0 additions & 2 deletions tests/testdata/word_count/input/file02

This file was deleted.

2 changes: 2 additions & 0 deletions tests/testdata/word_count/input/input01.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Hello World
Bye World
2 changes: 2 additions & 0 deletions tests/testdata/word_count/input/input02.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Hello Hadoop
Goodbye Hadoop
2 changes: 1 addition & 1 deletion tests/testdata/word_count/map.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@
set -Eeuo pipefail

# Map
cat | tr '[ \t]' '\n' | tr '[:upper:]' '[:lower:]' | awk '{print $1"\t1"}'
cat | tr '[ \t]' '\n' | awk '{print $1"\t1"}'
7 changes: 6 additions & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,14 @@ python =
[testenv]
setenv =
PYTHONPATH = {toxinidir}
allowlist_externals = sh
allowlist_externals =
sh
diff
extras = dev
commands =
diff -r madoop/example/input tests/testdata/word_count/input/
diff -r madoop/example/map.py tests/testdata/word_count/map.py
diff -r madoop/example/reduce.py tests/testdata/word_count/reduce.py
pycodestyle madoop tests setup.py
sh -c "pydocstyle madoop tests/* setup.py"
pylint madoop tests setup.py
Expand Down

0 comments on commit 3270caf

Please sign in to comment.