Skip to content

Commit

Permalink
changes
Browse files Browse the repository at this point in the history
  • Loading branch information
Burak Ozyurt committed Nov 25, 2020
1 parent 2021ab4 commit e3681e0
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 8 deletions.
68 changes: 61 additions & 7 deletions hocr2pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from xml.etree.ElementTree import Element, SubElement
import xml.etree.ElementTree as ET
import spacy
import utils

class BBox(object):
Expand Down Expand Up @@ -100,7 +101,7 @@ def is_inside(self, candidate: BBox):
return self.y0min <= ymid and self.y1max >= ymid and self.x0min <= xmid and self.x1max >= xmid


def get_text(self):
def get_text(self, nlp=None):
lines = []
for m in self.members:
collect_text(m.node, lines)
Expand All @@ -114,12 +115,54 @@ def get_text(self):
line[-1] = line[-1][:-1] + lines[i+1][0]
del lines[i+1][0]

if nlp:
lines = clean_figure_text(lines, nlp, from_top=True)
lines = clean_figure_text(lines, nlp, from_top=False)

content = ""
for line in lines:
content += " ".join(line) + "\n"
content = sanitize(content)
return content


def sanitize(content):
    """Replace the Unicode ``fl``/``fi`` ligatures in *content* with ASCII.

    Each ligature is handled in two passes: first the form followed by a
    single space (the space is dropped together with the ligature), then
    the bare ligature.

    Args:
        content: Text that may contain U+FB01 / U+FB02 ligature characters.

    Returns:
        The text with every ligature expanded to its two-letter ASCII form.
    """
    # Order matters: the "ligature + space" variants must run before the
    # bare variants, otherwise the trailing space would be left behind.
    replacements = (
        ("\uFB02 ", 'fl'),
        ("\uFB01 ", 'fi'),
        ("\uFB02", 'fl'),
        # The original repeated the "\uFB01 " (with space) pattern here, so
        # a bare fi-ligature was never converted.
        ("\uFB01", 'fi'),
    )
    for ligature, ascii_form in replacements:
        content = content.replace(ligature, ascii_form)
    return content


def clean_figure_text(lines, nlp, from_top=True):
    """Drop the run of figure-text lines at one end of *lines*.

    Lines are examined starting from the first line (``from_top=True``) or
    from the last line (``from_top=False``). Examination stops at the first
    line for which ``utils.is_figure_text`` is false; only the contiguous run
    seen before that point is removed.

    Args:
        lines: List of token lists, one per text line.
        nlp: Pipeline object forwarded unchanged to ``utils.is_figure_text``.
        from_top: Strip from the start when true, from the end otherwise.

    Returns:
        ``lines`` itself when nothing is removed, otherwise a new list
        without the stripped run.
    """
    if len(lines) == 0:
        return lines

    scan_order = lines if from_top else reversed(lines)
    run_length = 0
    for candidate in scan_order:
        if not utils.is_figure_text(candidate, nlp):
            break
        run_length += 1

    if run_length == 0:
        return lines
    if from_top:
        return lines[run_length:]
    return lines[:len(lines) - run_length]


def remove_figure_captions(content):
p = re.compile(r"^Figure\s+\d+(?:-\d+)\s+[A-Z].+(\n?.+)?\.", re.MULTILINE)
s = re.sub(p, '', content)
Expand Down Expand Up @@ -148,6 +191,13 @@ def is_eligible(node):
return attr != 'figure_caption' and attr != 'header'


def is_figure_caption(node):
    """Return True when *node* has a ``pdftotree`` attribute equal to
    ``'figure_caption'``; False when the attribute is absent or different."""
    return node.attrib.get('pdftotree') == 'figure_caption'


def collect_all_text(node, lines):
if node.tag == 'div':
for child in node:
Expand All @@ -166,6 +216,8 @@ def collect_text(node, lines):
if is_eligible(node):
for child in node:
collect_text(child, lines)
elif is_figure_caption(node):
lines.append(['FIGURE_CAPTION'])
elif node.tag == 'span':
if node.attrib['class'] == 'ocrx_line':
lines.append([])
Expand All @@ -175,7 +227,7 @@ def collect_text(node, lines):
lines[-1].append(node.text)


def cluster_bboxes(bbox_list, top_el=None):
def cluster_bboxes(bbox_list, top_el=None, nlp=None):
clusters = []
for bbox in bbox_list:
closest = None
Expand Down Expand Up @@ -206,11 +258,11 @@ def cluster_bboxes(bbox_list, top_el=None):

content = ""
print("Left Column\n---------\n")
col_text = ct[0].get_text()
col_text = ct[0].get_text(nlp=nlp)
content += col_text
print(col_text)
print("\nRight Column\n---------\n")
col_text = ct[1].get_text()
col_text = ct[1].get_text(nlp=nlp)
content += col_text
print(col_text)
if top_el is not None:
Expand All @@ -234,15 +286,15 @@ def find_columns(clusters):



def handle_page(node, num_cols=2, top_el=None, nlp=None):
    """Build bounding boxes for a page's ``div`` children and cluster them.

    Args:
        node: Page element whose direct children are scanned.
        num_cols: Accepted for interface compatibility; not read here.
        top_el: Forwarded to ``cluster_bboxes`` as ``top_el``.
        nlp: Forwarded to ``cluster_bboxes`` as ``nlp``.
    """
    # Only direct children tagged 'div' become bounding boxes.
    page_boxes = [BBox.from_node(el) for el in node if el.tag == 'div']
    print('# boxes:', len(page_boxes))
    cluster_bboxes(page_boxes, top_el=top_el, nlp=nlp)


def main():
Expand All @@ -254,10 +306,12 @@ def main():

hocr_file = args.i
out_xml_file = args.o
nlp = spacy.load("en_core_web_sm")
print("loaded spacy.")
tree = ET.parse(hocr_file)
top = Element('pdf')
for node in tree.findall('.//body/div'):
handle_page(node, top_el=top)
handle_page(node, top_el=top, nlp=nlp)
utils.indent(top)
out_tree = ET.ElementTree(top)
out_tree.write(out_xml_file, encoding="UTF-8")
Expand Down
33 changes: 32 additions & 1 deletion utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,36 @@ def is_mostly_numbers(line):
return False


def is_figure_text(line_toks, nlp):
    """Heuristically decide whether a short tokenised line is figure text.

    Rules, applied in order:

    1. A line with five or more tokens is never figure text.
    2. A line that ``is_mostly_numbers`` accepts is figure text.
    3. Otherwise the line is tagged with *nlp*; it is figure text when it
       has no token whose tag starts with ``VB``, no ``.`` token, and at
       least half of its tokens carry a tag starting with ``NN``.

    Args:
        line_toks: List of string tokens making up one line.
        nlp: spaCy-style pipeline exposing ``pipe(texts, disable=...)``.

    Returns:
        True when the line matches the figure-text heuristic, else False.
    """
    if len(line_toks) >= 5:
        return False
    line = " ".join(line_toks)
    if is_mostly_numbers(line):
        return True
    for doc in nlp.pipe([line], disable=['ner', 'parser']):
        num_tokens = len(doc)
        if num_tokens == 0:
            # Empty or whitespace-only line: nothing to tag. The original
            # divided by zero here.
            return False
        has_verb = False
        has_period = False
        num_nouns = 0
        for token in doc:
            if token.text == '.':
                has_period = True
            if token.tag_.startswith('VB'):
                has_verb = True
            if token.tag_.startswith('NN'):
                num_nouns += 1
        noun_frac = num_nouns / num_tokens
        if not has_verb and not has_period and noun_frac >= 0.5:
            return True
    return False



def is_heading(line, nlp):
if isempty(line):
return (False, False)
Expand All @@ -79,7 +109,8 @@ def is_heading(line, nlp):
alpha_sec_pat = re.compile(r'(\^[abcdefg]\.\s*)')
headings_set = {"abstract", "introduction", "background", "methods",
"materials and methods", "discussion", "conclusions",
"references", "acknowledgements", "online methods"}
"references", "acknowledgements", "online methods",
"bibliography"}
m = sec_num_pat.match(line)
if m:
prefix = m.group(1)
Expand Down

0 comments on commit e3681e0

Please sign in to comment.