Skip to content

Commit

Permalink
Track doc_id in parse_ether
Browse files: browse the repository at this point in the history
  • Loading branch information
amir-zeldes committed Nov 8, 2018
1 parent f558b7c commit ab8b3ac
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 5 deletions.
21 changes: 17 additions & 4 deletions modules/ether.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,15 +96,18 @@ def read_config(self,config_file):
else:
self.template = "<meta %%all%%>\n%%body%%\n</meta>\n"

def parse_ether(ether):

def parse_ether(ether, doc_id=None):
"""Take in raw socialcalc data and turn it into a dict of Cells. Used in validation."""

class Cell:
    """A single spreadsheet cell parsed out of the raw socialcalc data.

    The constructor stores its arguments unchanged. ``header`` always starts
    as an empty string; the surrounding parser assigns the column's name to
    it after all cells have been collected.
    """

    def __init__(self, col, row, content, span):
        """Store the cell's column, row, content and span as given."""
        self.col = col
        self.row = row
        # Not known at construction time; filled in later by the parser.
        self.header = ""
        self.content = content
        self.span = span

    def __repr__(self):
        """Return a debug string showing (col, row, header, content, span)."""
        return "<Cell (" + repr((self.col, self.row, self.header, self.content, self.span)) + ")>"

Expand All @@ -125,12 +128,12 @@ def __repr__(self):
cell_row = cell_id[1:]
cell_col = cell_id[0]
# We'd need something like this to support more than 26 cols, i.e. columns AA, AB...
#for c in cell_id:
# for c in cell_id:
# if c in ["0","1","2","3","4","5","6","7","8","9"]:
# cell_row += c
# else:
# cell_col += c
cell_content = parts[3].replace("\\c",":")
cell_content = parts[3].replace("\\c", ":")
cell_span = parts[-1] if "rowspan:" in line else "1"

# record col name
Expand All @@ -143,11 +146,17 @@ def __repr__(self):
all_cells.append(cell)

for cell in all_cells:
cell.header = rev_colmap[cell.col]
if cell.col in rev_colmap:
cell.header = rev_colmap[cell.col]
else:
if doc_id is None:
doc_id = "unknown"
raise IOError("Undocumented column: " + cell.col + " in '" + str(cell) + " from doc: " + str(doc_id))

parsed["__colmap__"] = colmap # Save colmap for apply_rule
return parsed


def unescape_xml(text):
# Fix various common compounded XML escapes
text = text.replace("&amp;lt;","<").replace("&amp;gt;",">")
Expand Down Expand Up @@ -676,6 +685,10 @@ def ether_to_sgml(ether, doc_id,config=None):
# Priorities have been supplied, but this column is not in them
continue

# content may not contain straight double quotes in span annotations in SGML export
# Note that " is allowed in tokens and in tab-delimited token annotations!
content = content.replace('"', "&quot;")

if sec_element != "":
#open_tags[row][sec_element].append((attrib, content))
sec_element_checklist.append((element,sec_element,attrib,content,rowspan))
Expand Down
2 changes: 1 addition & 1 deletion validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def validate_doc_ether(doc_id, editor=False, dirty=True):

ether_doc_name = "gd_" + doc_corpus + "_" + doc_name
socialcalc = get_socialcalc(ether_url, ether_doc_name, doc_id=doc_id, dirty=dirty)
parsed_ether = parse_ether(socialcalc)
parsed_ether = parse_ether(socialcalc,doc_id=doc_id)

report = ''
cells = []
Expand Down

0 comments on commit ab8b3ac

Please sign in to comment.