Skip to content

Commit

Permalink
Wrap tsv inputs in types that support line and column numbers
Browse files Browse the repository at this point in the history
  • Loading branch information
HenningTimm committed Feb 22, 2024
1 parent 104e27e commit 0cff720
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 10 deletions.
2 changes: 1 addition & 1 deletion yml2block/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,7 @@ def no_trailing_spaces(list_item, tsv_keyword, level=Level.ERROR):

for entry in entries_to_check[tsv_keyword]:
try:
value = list_item[entry]
value = list_item[entry].value
except KeyError:
# This case occurs, when a typo in one of the required
# keywords is present. They can safely be skipped here,
Expand Down
49 changes: 40 additions & 9 deletions yml2block/tsv_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,31 @@
from yml2block.rules import LintViolation, Level



class MDBlockList(list):
    """A ``list`` that additionally carries the source position it was read from.

    ``line`` and ``column`` are slot attributes used by the linter to report
    where in the TSV file a metadata-block entry originated.  Without defaults
    they would raise ``AttributeError`` when read before being assigned, so
    the constructor initializes both to ``None`` (mirroring ``MDBlockNode``).
    ``column`` is ``None`` for whole rows, which have no single column.
    """

    __slots__ = ("line", "column")

    def __init__(self, *args, line=None, column=None):
        # Forward positional args unchanged so MDBlockList(iterable)
        # behaves exactly like list(iterable).
        super().__init__(*args)
        self.line = line
        self.column = column

class MDBlockDict(dict):
    """A ``dict`` that additionally carries the source position it was read from.

    ``line`` and ``column`` are slot attributes used by the linter to report
    where in the TSV file a metadata-block row originated.  Without defaults
    they would raise ``AttributeError`` when read before being assigned, so
    the constructor initializes both to ``None`` (mirroring ``MDBlockNode``).
    ``column`` is ``None`` for whole rows, which have no single column.
    """

    __slots__ = ("line", "column")

    def __init__(self, *args, line=None, column=None, **kwargs):
        # Forward positional and keyword args unchanged so
        # MDBlockDict(mapping) / MDBlockDict(key=value) behave like dict.
        # NOTE(review): keys literally named "line"/"column" cannot be set
        # via keyword arguments here; use item assignment for those.
        super().__init__(*args, **kwargs)
        self.line = line
        self.column = column

class MDBlockNode:
    """Wrap a single metadata value together with its source position.

    Used by the TSV reader so that lint rules can report the line (and,
    when known, column) a value came from.  Either coordinate may be
    ``None`` when the position is not available.
    """

    __slots__ = ("line", "column", "value")

    def __init__(self, value, line=None, column=None):
        """Store *value* and the optional (line, column) it was read from."""
        self.value = value
        self.line = line
        self.column = column

    def __repr__(self):
        """Render as ``(line, column) value`` for debug output."""
        return "({}, {}) {}".format(self.line, self.column, self.value)



def _identify_break_points(full_file):
"""Identify where to split the metadata block into its three subsections"""
violations = []

# Split slong lines starting with '#'
# Split along lines starting with '#'
break_points = [i for i, line in enumerate(full_file) if line.startswith("#")]
if len(break_points) == 3:
split_blocks = (
Expand All @@ -27,6 +47,7 @@ def _identify_break_points(full_file):
None,
)
else:
# TODO: suggest better fix for this
split_blocks = full_file
violations.append(
LintViolation(
Expand All @@ -41,7 +62,7 @@ def _identify_break_points(full_file):
def read_tsv(tsv_path):
"""Read in a Dataverse TSV metadata block file and convert it into a python dictionary structure."""
violations = []
data = dict()
data = MDBlockDict()

with open(tsv_path, "r") as raw_file:
full_file = raw_file.readlines()
Expand All @@ -61,13 +82,13 @@ def _parse(block):

# Unpack each tsv-chunk of the metadata block into a list
# of dictionaries.
parsed_blocks = [_parse(block) for block in split_blocks]
parsed_blocks = [zip(_parse(block), itertools.repeat(offset)) for offset, block in enumerate(split_blocks, 1)]

for line_no, row in enumerate(itertools.chain(*parsed_blocks)):
# Each row corresponds to a line in the TSV file
for line_no, (row, offset) in enumerate(itertools.chain(*parsed_blocks), 1):
# Each row corresponds to a content line in the TSV file
# unpacked into a dictionary with keys depending
# on the part of the block identified by the top level keyword

# Get the toplevel keyword from the first column of the TSV file
# e.g. #metadataBlock, #datasetField, #controlledVocabulary
toplevel_key_with_prefix = [
Expand All @@ -76,8 +97,14 @@ def _parse(block):

# For consistency with the yaml format
toplevel_key = toplevel_key_with_prefix.lstrip("#")
row_as_dict = dict()

# print(f"{line_no} + {offset} = {line_no + offset}", row)
row_as_dict = MDBlockDict()
offset_line_no = line_no + offset
row_as_dict.line = offset_line_no
row_as_dict.column = None


for key, value in row.items():
if key is None:
# These entries cannot be associated with a column header
Expand All @@ -94,12 +121,16 @@ def _parse(block):
continue
else:
# Copy all other entries into a new data structure for this row
row_as_dict[key] = value
row_as_dict[key] = MDBlockNode(value, line=offset_line_no)

# Initialize the entry for this toplevel keyword with an empty list
if toplevel_key not in data.keys():
data[toplevel_key] = []
block_list = MDBlockList()
block_list.line = line_no
block_list.column = None
data[toplevel_key] = block_list

data[toplevel_key].append(row_as_dict)

print(data)
return data, violations

0 comments on commit 0cff720

Please sign in to comment.