-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* add text file buffering with tests * integrate with main runner
- Loading branch information
1 parent
9c5ee6a
commit f39a0e5
Showing
4 changed files
with
143 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
import pathlib | ||
|
||
|
||
class BufferedTextReader: | ||
def __init__(self, *, file_path: str | pathlib.Path, maximum_ram_usage: int = 10**9): | ||
""" | ||
Lazily read a text file into RAM using buffers of a specified size. | ||
Parameters | ||
---------- | ||
file_path : string or pathlib.Path | ||
The path to the text file to be read. | ||
maximum_ram_usage : int, default: 1 GB | ||
The theoretical maximum amount of RAM to be used by the BufferedTextReader object. | ||
""" | ||
self.file_path = file_path | ||
self.maximum_ram_usage = maximum_ram_usage | ||
|
||
# The actual amount of bytes to read per iteration is 3x less than theoretical maximum usage | ||
# due to decoding and handling | ||
self.buffer_size_in_bytes = int(maximum_ram_usage / 3) | ||
|
||
self.total_file_size = pathlib.Path(file_path).stat().st_size | ||
self.offset = 0 | ||
|
||
def __iter__(self): | ||
return self | ||
|
||
def __next__(self) -> list[str]: | ||
"""Retrieve the next buffer from the file, or raise StopIteration if the file is exhausted.""" | ||
if self.offset >= self.total_file_size: | ||
raise StopIteration | ||
|
||
with open(file=self.file_path, mode="rb", buffering=0) as io: | ||
io.seek(self.offset) | ||
intermediate_bytes = io.read(self.buffer_size_in_bytes) | ||
decoded_intermediate_buffer = intermediate_bytes.decode() | ||
split_intermediate_buffer = decoded_intermediate_buffer.splitlines() | ||
|
||
# Check if we are at the end of the file | ||
if len(intermediate_bytes) < self.buffer_size_in_bytes: | ||
self.offset = self.total_file_size | ||
return split_intermediate_buffer | ||
|
||
buffer = split_intermediate_buffer[:-1] | ||
last_line = split_intermediate_buffer[-1] | ||
|
||
if len(buffer) == 0 and last_line != "": | ||
raise ValueError( | ||
f"BufferedTextReader encountered a line at offset {self.offset} that exceeds the buffer " | ||
"size! Try increasing the `buffer_size_in_bytes` to account for this line." | ||
) | ||
|
||
# The last line split by the intermediate buffer may or may not be incomplete | ||
if decoded_intermediate_buffer.endswith("\n"): | ||
# By chance, this iteration finished on a clean line break | ||
self.offset += self.buffer_size_in_bytes | ||
else: | ||
self.offset += self.buffer_size_in_bytes - len(last_line.encode("utf-8")) | ||
|
||
return buffer |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
import pathlib | ||
import sys | ||
|
||
import pytest | ||
|
||
import dandi_s3_log_parser | ||
|
||
|
||
@pytest.fixture(scope="session")
def large_text_file_path(tmp_path_factory: pytest.TempPathFactory):
    """Session-scoped fixture: write a multi-megabyte text file of identical short lines."""
    base_directory = tmp_path_factory.mktemp("large_text_file")

    # 10^5 lines of 61 bytes each (~6.1 MB total); the content itself is irrelevant
    test_file_path = base_directory / "large_text_file.txt"
    fill_line = "a" * 60 + "\n"
    with open(file=test_file_path, mode="w") as handle:
        handle.writelines(fill_line for _ in range(10**5))

    return test_file_path
|
||
|
||
@pytest.fixture(scope="session")
def single_line_text_file_path(tmp_path_factory: pytest.TempPathFactory):
    """For testing the ValueError case during iteration."""
    tmp_path = tmp_path_factory.mktemp("single_line_text_file")

    # Generate test file ~3 MB in total size, consisting of only a single line.
    # 3 * 10**6 characters, NOT 30**6 (which would be ~729 MB); 3 MB still comfortably
    # exceeds the ~333 KB buffer produced by maximum_ram_usage=10**6 in the ValueError test.
    test_file_path = tmp_path / "single_line_text_file.txt"
    with open(file=test_file_path, mode="w") as test_file:
        test_file.write("a" * 3 * 10**6)

    return test_file_path
|
||
|
||
def test_buffered_text_reader(large_text_file_path: pathlib.Path):
    """Iterate the large fixture file and check buffer type, size bound, and buffer count."""
    maximum_ram_usage = 10**6  # 1 MB
    reader = dandi_s3_log_parser.BufferedTextReader(
        file_path=large_text_file_path, maximum_ram_usage=maximum_ram_usage
    )

    # The reader must be its own iterator
    assert iter(reader) is reader, "BufferedTextReader object is not iterable!"

    buffer_index = -1
    for buffer in reader:
        buffer_index += 1
        assert isinstance(buffer, list), "BufferedTextReader object did not load a buffer as a list!"
        # NOTE(review): sys.getsizeof is shallow (measures the list object, not the line
        # contents), so this is a loose upper-bound check — confirm that is the intent
        size_within_bound = sys.getsizeof(buffer) <= reader.buffer_size_in_bytes
        assert size_within_bound, "BufferedTextReader object loaded a buffer exceeding the threshold!"

    assert buffer_index == 18, "BufferedTextReader object did not load the correct number of buffers!"

    # Once exhausted, the iterator must keep signalling exhaustion
    with pytest.raises(StopIteration):
        next(reader)
|
||
|
||
def test_value_error(single_line_text_file_path: pathlib.Path):
    """A single line larger than the buffer must raise an informative ValueError."""
    reader = dandi_s3_log_parser.BufferedTextReader(
        file_path=single_line_text_file_path, maximum_ram_usage=10**6  # 1 MB
    )

    expected_message = (
        "BufferedTextReader encountered a line at offset 0 that exceeds the buffer size! "
        "Try increasing the `buffer_size_in_bytes` to account for this line."
    )
    with pytest.raises(ValueError) as error_info:
        next(reader)
    assert str(error_info.value) == expected_message