changes

SciCrunch · Nov 25, 2020 · e3681e0 · e3681e0
1 parent 2021ab4
commit e3681e0
Show file tree

Hide file tree

Showing 2 changed files with 93 additions and 8 deletions.
diff --git a/hocr2pages.py b/hocr2pages.py
@@ -3,6 +3,7 @@
 
 from xml.etree.ElementTree import Element, SubElement
 import xml.etree.ElementTree as ET
+import spacy
 import utils
 
 class BBox(object):
@@ -100,7 +101,7 @@ def is_inside(self, candidate: BBox):
         return self.y0min <= ymid and self.y1max >= ymid and self.x0min <= xmid and self.x1max >= xmid
 
 
-    def get_text(self):
+    def get_text(self, nlp=None):
         lines = []
         for m in self.members:
             collect_text(m.node, lines)
@@ -114,12 +115,54 @@ def get_text(self):
                 line[-1] =  line[-1][:-1] + lines[i+1][0]
                 del lines[i+1][0]
 
+        if nlp:
+            lines = clean_figure_text(lines, nlp, from_top=True)
+            lines = clean_figure_text(lines, nlp, from_top=False)
 
         content = ""
         for line in lines:
             content += " ".join(line) + "\n"
+        content = sanitize(content)
         return content
 
+
+def sanitize(content):
+    """Replace the Unicode "fl" and "fi" ligatures in ``content`` with ASCII.
+
+    A ligature that is directly followed by a space is replaced together with
+    that space, so the two letters are joined to the text that follows.  That
+    form is handled first; any remaining bare ligature is then replaced on
+    its own.
+
+    :param content: text to clean
+    :return: the text with U+FB02 and U+FB01 replaced by ``fl`` and ``fi``
+    """
+    for ligature, ascii_form in (("\uFB02", 'fl'), ("\uFB01", 'fi')):
+        # Order matters: the ligature-plus-space form must go first, or the
+        # bare replacement would leave the space behind.
+        content = content.replace(ligature + " ", ascii_form)
+        content = content.replace(ligature, ascii_form)
+    return content
+
+
+def clean_figure_text(lines, nlp, from_top=True):
+    """Strip the run of figure-text lines at one end of ``lines``.
+
+    Starting at the first line (``from_top=True``) or the last line
+    (``from_top=False``), lines are dropped for as long as
+    ``utils.is_figure_text`` accepts them.  Scanning stops at the first line
+    it rejects, so figure-like lines in the interior are kept.
+
+    :param lines: list of lines, each a list of token strings
+    :param nlp: pipeline object handed through to ``utils.is_figure_text``
+    :param from_top: ``True`` trims the start of the list, ``False`` the end
+    :return: ``lines`` itself when nothing was trimmed, otherwise a new list
+    """
+    if not lines:
+        return lines
+    # Walk inwards from the chosen end and count the consecutive hits.
+    ordered = lines if from_top else reversed(lines)
+    num_removed = 0
+    for line in ordered:
+        if not utils.is_figure_text(line, nlp):
+            break
+        num_removed += 1
+    if num_removed == 0:
+        return lines
+    if from_top:
+        return lines[num_removed:]
+    return lines[:len(lines) - num_removed]
+
+
 def remove_figure_captions(content):
     p = re.compile(r"^Figure\s+\d+(?:-\d+)\s+[A-Z].+(\n?.+)?\.", re.MULTILINE)
     s = re.sub(p, '', content)
@@ -148,6 +191,13 @@ def is_eligible(node):
     return attr != 'figure_caption' and attr != 'header'
 
 
+def is_figure_caption(node):
+    """Return True when ``node`` has ``pdftotree`` set to ``figure_caption``.
+
+    A node without a ``pdftotree`` attribute is never a figure caption.
+    """
+    return node.attrib.get('pdftotree') == 'figure_caption'
+
+
 def collect_all_text(node, lines):
     if  node.tag == 'div':
         for child in node:
@@ -166,6 +216,8 @@ def collect_text(node, lines):
        if is_eligible(node):
            for child in node:
                collect_text(child, lines)
+       elif is_figure_caption(node):
+            lines.append(['FIGURE_CAPTION'])
     elif node.tag == 'span':
         if node.attrib['class'] == 'ocrx_line':
             lines.append([])
@@ -175,7 +227,7 @@ def collect_text(node, lines):
             lines[-1].append(node.text)
 
 
-def cluster_bboxes(bbox_list, top_el=None):
+def cluster_bboxes(bbox_list, top_el=None, nlp=None):
     clusters = []
     for bbox in bbox_list:
         closest = None
@@ -206,11 +258,11 @@ def cluster_bboxes(bbox_list, top_el=None):
 
         content = ""
         print("Left Column\n---------\n")
-        col_text = ct[0].get_text()
+        col_text = ct[0].get_text(nlp=nlp)
         content += col_text
         print(col_text)
         print("\nRight Column\n---------\n")
-        col_text = ct[1].get_text()
+        col_text = ct[1].get_text(nlp=nlp)
         content += col_text
         print(col_text)
         if top_el is not None:
@@ -234,15 +286,15 @@ def find_columns(clusters):
 
 
 
-def handle_page(node, num_cols=2, top_el=None):
+def handle_page(node, num_cols=2, top_el=None, nlp=None):
+    """Build a BBox for every child ``div`` of ``node`` and cluster them.
+
+    ``top_el`` and ``nlp`` are passed straight through to ``cluster_bboxes``;
+    ``num_cols`` is not read anywhere in this function body.
+    """
     bbox_list = []
     for child in node:
         if child.tag != 'div':
             continue
         bbox = BBox.from_node(child)
         bbox_list.append(bbox)
     print('# boxes:', len(bbox_list))
-    cluster_bboxes(bbox_list, top_el=top_el)
+    cluster_bboxes(bbox_list, top_el=top_el, nlp=nlp)
 
 
 def main():
@@ -254,10 +306,12 @@ def main():
 
     hocr_file = args.i
     out_xml_file = args.o
+    nlp = spacy.load("en_core_web_sm")
+    print("loaded spacy.")
     tree = ET.parse(hocr_file)
     top = Element('pdf')
     for node in tree.findall('.//body/div'):
-        handle_page(node, top_el=top)
+        handle_page(node, top_el=top, nlp=nlp)
     utils.indent(top)
     out_tree = ET.ElementTree(top)
     out_tree.write(out_xml_file, encoding="UTF-8")

diff --git a/utils.py b/utils.py
@@ -64,6 +64,36 @@ def is_mostly_numbers(line):
     return False
 
 
+def is_figure_text(line_toks, nlp):
+    """Heuristically decide whether a short line is stray figure text.
+
+    Lines with five or more tokens are never treated as figure text.  A
+    shorter line counts as figure text when ``is_mostly_numbers`` accepts it,
+    or when it contains no verb, no period, and at least half of its tokens
+    carry a noun tag.  A line that yields no tokens is not figure text.
+
+    :param line_toks: list of token strings making up one line
+    :param nlp: spaCy pipeline used to tag the line
+    :return: True if the line looks like figure text, False otherwise
+    """
+    if len(line_toks) >= 5:
+        return False
+    line = " ".join(line_toks)
+    if is_mostly_numbers(line):
+        return True
+    num_tokens = 0
+    num_nouns = 0
+    # Only token text and tags are read below, so NER and the parser are
+    # switched off for this call.
+    for doc in nlp.pipe([line], disable=['ner', 'parser']):
+        num_tokens += len(doc)
+        for token in doc:
+            # A verb or a period marks running prose, never figure text.
+            if token.text == '.' or token.tag_.startswith('VB'):
+                return False
+            if token.tag_.startswith('NN'):
+                num_nouns += 1
+    # An empty line produces no tokens; bail out instead of dividing by zero.
+    if num_tokens == 0:
+        return False
+    return num_nouns / float(num_tokens) >= 0.5
+
+
+
 def is_heading(line, nlp):
     if isempty(line):
         return (False, False)
@@ -79,7 +109,8 @@ def is_heading(line, nlp):
     alpha_sec_pat = re.compile(r'(\^[abcdefg]\.\s*)')
     headings_set = {"abstract", "introduction", "background", "methods",
                     "materials and methods", "discussion", "conclusions",
-                    "references", "acknowledgements", "online methods"}
+                    "references", "acknowledgements", "online methods",
+                    "bibliography"}
     m = sec_num_pat.match(line)
     if m:
         prefix = m.group(1)