Skip to content

Commit

Permalink
NF: tools/text2filetree.py helper to convert some indented text into …
Browse files Browse the repository at this point in the history
…our filetree example spec

See the tests included within the script.
  • Loading branch information
yarikoptic committed Dec 18, 2024
1 parent 5b4b0fd commit c8c5910
Showing 1 changed file with 287 additions and 0 deletions.
287 changes: 287 additions & 0 deletions tools/text2filetree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,287 @@
#!/usr/bin/env python3
import argparse
import json
import sys


def parse_file_tree(input_data, tab_width=4):
    """
    Parse a file tree hierarchy from indented text and convert it to a nested dictionary.

    Directories become nested dictionaries keyed by their name with a trailing
    "/"; files map to an empty string.  An entry is treated as a directory if
    it ends with "/" or if the next non-blank line is indented more deeply.

    :param input_data: Input string or file-like object (anything providing
        ``readlines()``) representing the file tree
    :param tab_width: Number of spaces to replace tabs with
    :return: A nested dictionary representing the file tree
    """
    if isinstance(input_data, str):
        raw_lines = input_data.splitlines()
    else:
        raw_lines = input_data.readlines()

    # Normalize every line up front (expand tabs, drop trailing whitespace,
    # skip blank lines) so that the look-ahead below compares indentation
    # computed the same way for the current entry and for the next one.
    entries = []
    for raw_line in raw_lines:
        line = raw_line.replace("\t", " " * tab_width).rstrip()
        if not line:
            continue
        name = line.lstrip()
        entries.append((len(line) - len(name), name))

    tree = {}

    # Stack of (directory dict, indentation) pairs for the currently open
    # directories.  The root uses indentation -1, which is lower than any
    # real indentation, so it is never popped and the stack is never empty.
    dict_stack = [(tree, -1)]

    for index, (indent_level, current_name) in enumerate(entries):
        # Close directories that are not ancestors of this entry
        while dict_stack[-1][1] >= indent_level:
            dict_stack.pop()
        parent_dict = dict_stack[-1][0]

        # A deeper-indented following entry means this one has children
        has_children = (
            index + 1 < len(entries) and entries[index + 1][0] > indent_level
        )

        if current_name.endswith("/") or has_children:
            # Normalize directory name to always carry the trailing slash
            if not current_name.endswith("/"):
                current_name += "/"
            new_dict = {}
            parent_dict[current_name] = new_dict
            dict_stack.append((new_dict, indent_level))
        else:
            # Files are stored with an empty string value
            parent_dict[current_name] = ""

    return tree


def decorate_output(output_str, decoration_type):
    """
    Wrap the output in the requested decoration, if any

    :param output_str: JSON string to be decorated
    :param decoration_type: Name of the decoration; only "bids-filetree" is
        recognized, any other value leaves the string untouched
    :return: The decorated string, or ``output_str`` itself when no known
        decoration was requested
    """
    # Guard clause: anything but the one known decoration is a pass-through
    if decoration_type != "bids-filetree":
        return output_str

    # Quadruple braces render as literal "{{" / "}}" around the macro call
    return f"{{{{ MACROS___make_filetree_example(\n\n{output_str}\n\n) }}}}"


def main(argv=None):
    """
    Command-line entry point: read an indented file tree, convert it to a
    nested dictionary and emit it as JSON (optionally decorated).

    :param argv: Argument list to parse; None (the default) makes argparse
        use the process command line
    """
    # Set up argument parsing
    parser = argparse.ArgumentParser(
        description="Parse file tree hierarchy into a nested dictionary."
    )
    parser.add_argument(
        "input_file",
        nargs="?",
        type=argparse.FileType("r"),
        default=sys.stdin,
        help="Input file to parse (default: stdin)",
    )
    parser.add_argument(
        "--tab-width",
        type=int,
        default=4,
        help="Number of spaces to replace tabs with (default: 4)",
    )
    parser.add_argument(
        "--output-file",
        type=str,
        default=None,
        help="Output file to write the parsed dictionary (default: stdout)",
    )
    parser.add_argument(
        "--indent", type=int, default=2, help="Indentation for JSON output (default: 2)"
    )
    parser.add_argument(
        "-D",
        "--decorate",
        type=str,
        choices=["bids-filetree"],
        default=None,
        help="Decorate the output with a specific format",
    )

    # Parse arguments
    args = parser.parse_args(argv)

    # Parse the file tree; argparse.FileType opened the input for us, so it is
    # our job to close it again (but never close the process-wide stdin)
    try:
        result = parse_file_tree(args.input_file, args.tab_width)
    finally:
        if args.input_file is not sys.stdin:
            args.input_file.close()

    # Prepare output using json.dumps with specified indent
    output_str = json.dumps(result, indent=args.indent)

    # Decorate output if specified
    if args.decorate:
        output_str = decorate_output(output_str, args.decorate)

    # Determine output destination
    if args.output_file:
        # Write to file
        with open(args.output_file, "w") as f:
            f.write(output_str)
    else:
        # Print to stdout
        print(output_str)


def test_example1():
    """
    Test parsing a file tree with nested directories
    """
    # The indentation inside the input is what encodes the nesting, so it is
    # spelled out line by line to keep it explicit and resistant to
    # whitespace mangling.
    input_tree = "\n".join(
        [
            "file1",
            "a.dat",
            "sub-1",
            "    subsub",
            "        file.dat",
            "    filehere",
            "anotherfile",
        ]
    )

    expected_output = {
        "file1": "",
        "a.dat": "",
        "sub-1/": {"subsub/": {"file.dat": ""}, "filehere": ""},
        "anotherfile": "",
    }

    # Parse the input tree
    result = parse_file_tree(input_tree)

    # Use deep comparison to check the result
    assert result == expected_output, f"Expected {expected_output}, but got {result}"


def test_decorations():
    """
    Check decorate_output both with the bids-filetree wrapper and without any decoration
    """
    payload = '{"test": "value"}'
    wrapped = '{{ MACROS___make_filetree_example(\n\n{"test": "value"}\n\n) }}'

    # bids-filetree wraps the payload in the macro call
    assert decorate_output(payload, "bids-filetree") == wrapped

    # No decoration requested: the payload comes back unchanged
    assert decorate_output(payload, None) == payload


def test_more_complex_tree():
    """
    Test a more complex nested directory structure
    """
    # The indentation inside the input is what encodes the nesting, so it is
    # spelled out line by line to keep it explicit and resistant to
    # whitespace mangling.
    input_tree = "\n".join(
        [
            "root",
            "    subdir1",
            "        file1.txt",
            "        subsubdir",
            "            file2.txt",
            "    subdir2",
            "        file3.txt",
        ]
    )

    expected_output = {
        "root/": {
            "subdir1/": {"file1.txt": "", "subsubdir/": {"file2.txt": ""}},
            "subdir2/": {"file3.txt": ""},
        }
    }

    # Parse the input tree
    result = parse_file_tree(input_tree)

    # Use deep comparison to check the result
    assert result == expected_output, f"Expected {expected_output}, but got {result}"


def test_neuroimaging_dataset():
    """
    Test parsing a complex neuroimaging dataset file structure
    """
    # The indentation inside the input is what encodes the nesting, so it is
    # spelled out line by line to keep it explicit and resistant to
    # whitespace mangling.  Directories are marked explicitly with "/".
    input_tree = "\n".join(
        [
            "dataset_description.json",
            "tasks.tsv",
            "tasks.json",
            "participants.tsv",
            "sub-A/",
            "  ses-20220101/",
            "    ephys/",
            "      sub-A_ses-20220101_task-nosepoke_ephys.nix",
            "      sub-A_ses-20220101_task-nosepoke_ephys.json",
            "      sub-A_ses-20220101_task-nosepoke_events.tsv",
            "      sub-A_ses-20220101_task-rest_ephys.nix",
            "      sub-A_ses-20220101_task-rest_ephys.json",
            "      sub-A_ses-20220101_channels.tsv",
            "      sub-A_ses-20220101_electrodes.tsv",
            "      sub-A_ses-20220101_probes.tsv",
            "  ses-20220102/",
            "    ephys/",
            "      sub-A_ses-20220102_task-rest_ephys.nix",
            "      sub-A_ses-20220102_task-rest_ephys.json",
            "      sub-A_ses-20220102_channels.tsv",
            "      sub-A_ses-20220102_electrodes.tsv",
            "      sub-A_ses-20220102_probes.tsv",
        ]
    )

    expected_output = {
        "dataset_description.json": "",
        "tasks.tsv": "",
        "tasks.json": "",
        "participants.tsv": "",
        "sub-A/": {
            "ses-20220101/": {
                "ephys/": {
                    "sub-A_ses-20220101_task-nosepoke_ephys.nix": "",
                    "sub-A_ses-20220101_task-nosepoke_ephys.json": "",
                    "sub-A_ses-20220101_task-nosepoke_events.tsv": "",
                    "sub-A_ses-20220101_task-rest_ephys.nix": "",
                    "sub-A_ses-20220101_task-rest_ephys.json": "",
                    "sub-A_ses-20220101_channels.tsv": "",
                    "sub-A_ses-20220101_electrodes.tsv": "",
                    "sub-A_ses-20220101_probes.tsv": "",
                }
            },
            "ses-20220102/": {
                "ephys/": {
                    "sub-A_ses-20220102_task-rest_ephys.nix": "",
                    "sub-A_ses-20220102_task-rest_ephys.json": "",
                    "sub-A_ses-20220102_channels.tsv": "",
                    "sub-A_ses-20220102_electrodes.tsv": "",
                    "sub-A_ses-20220102_probes.tsv": "",
                }
            },
        },
    }

    # Parse the input tree
    result = parse_file_tree(input_tree)

    # Use deep comparison to check the result
    assert result == expected_output, f"Expected {expected_output}, but got {result}"


if __name__ == "__main__":
    # Run the command-line interface only when executed as a script, so that
    # importing this module (e.g. to collect the test_* functions) has no
    # side effects
    main()

0 comments on commit c8c5910

Please sign in to comment.