From 0a66b9be3efdd1a0043ff172f2ed62d6fd30c51e Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Tue, 13 Nov 2018 15:13:33 -0600 Subject: [PATCH 01/19] Bugfixes; few changes for tesseract4 quirks; extract should work in py3 now --- extract.py | 22 +++++++++++----------- helpers.py | 16 ++++++++++++---- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/extract.py b/extract.py index 17a5012..e0bbe4a 100644 --- a/extract.py +++ b/extract.py @@ -3,6 +3,7 @@ from bs4 import BeautifulSoup import psycopg2 import math +import codecs import re import numpy as np import itertools @@ -82,11 +83,11 @@ def find_above_and_below(extract): def expand_extraction(extract_idx, props): # Iterate on above and below areas for each extract - for direction, areas in extract_relations[extract_idx].iteritems(): + for direction, areas in extract_relations[extract_idx].items(): stopped = False for area_idx in extract_relations[extract_idx][direction]: # Iterate on all other extracts, making sure that extending the current one won't run into any of the others - for extract_idx2, props2 in extract_relations.iteritems(): + for extract_idx2, props2 in extract_relations.items(): if extract_idx != extract_idx2: will_intersect = helpers.rectangles_intersect(extracts[extract_idx2], helpers.enlarge_extract(extracts[extract_idx], page['areas'][area_idx])) if will_intersect: @@ -271,7 +272,7 @@ def expand_extraction(extract_idx, props): # Sanity check the caption-area assignments - for caption, areas in caption_areas.iteritems(): + for caption, areas in caption_areas.items(): # Only check if the caption is assigned to more than one area if len(areas) > 1: # draw a line through the middle of the caption that spans the page @@ -335,7 +336,7 @@ def expand_extraction(extract_idx, props): # Extracts are bounding boxes that will be used to actually extract the tables extracts = [] - for caption, areas in caption_areas.iteritems(): + for caption, areas in caption_areas.items(): print(indicator_lines[caption]) area_of_interest_centroid_y_mean = np.mean([ helpers.centroid(page['areas'][area])['y'] for area in areas ]) indicator_line_centroid_y = helpers.centroid(indicator_lines[caption])['y'] @@ -362,7 +363,7 @@ def expand_extraction(extract_idx, props): # Make sure each table was assigned a caption assigned_tables = [] unassigned_tables = [] - for caption_idx, areas in caption_areas.iteritems(): + for caption_idx, areas in caption_areas.items(): assigned_tables = assigned_tables + areas all_tables = [] @@ -400,7 +401,7 @@ def expand_extraction(extract_idx, props): for extract_idx, extract in enumerate(extracts): expand_extraction(extract_idx, find_above_and_below(extract)) - # for extract_idx, props in extract_relations.iteritems(): + # for extract_idx, props in extract_relations.items(): # expand_extraction(extract_idx, props) for extract in orphan_extracts: @@ -439,7 +440,7 @@ def extract_tables(document_path): # Read in each tesseract page with BeautifulSoup so we can look at the document holistically for page_no, page in enumerate(page_paths): - with open(page) as hocr: + with codecs.open(page, "r", "utf-8") as hocr: text = hocr.read() soup = BeautifulSoup(text, 'html.parser') merged_areas = helpers.merge_areas(soup.find_all('div', 'ocr_carea')) @@ -467,11 +468,10 @@ def extract_tables(document_path): # Use the model to assign an area type and probabilty of that area type probabilities = clf.predict_proba([ heuristics.classify_list(area, doc_stats, page['areas']) ]) # Apply a label to each probability - classifications = 
zip(clf.classes_, probabilities) - # Sort by highest probability - classifications.sort(key=lambda x: x[1], reverse=True) + classifications = zip(clf.classes_, probabilities[0]) + classifications = sorted(classifications, key = lambda x: x[1], reverse=True) - area['classification_p'] = classifications[0][0] + area['classification_p'] = classifications[0][1] area['type'] = clf.predict([ heuristics.classify_list(area, doc_stats, page['areas']) ]) diff --git a/helpers.py b/helpers.py index 08d13ab..f9cc25a 100644 --- a/helpers.py +++ b/helpers.py @@ -473,12 +473,21 @@ def area_summary(area): for word_idx, word in enumerate(words): wordbbox = extractbbox(word.get('title')) + word_area = (wordbbox['x2'] - wordbbox['x1']) * (wordbbox['y2'] - wordbbox['y1']) + if word_area > summary['area'] or \ + wordbbox['x2'] > summary['x2'] or \ + wordbbox['x1'] < summary['x1'] or \ + wordbbox['y1'] < summary['y1'] or \ + wordbbox['y2'] > summary['y2']: + print("Word outside of the enclosing area! Tesseract's black box strikes again!") + continue + # Record the x coordinate of the first word of each line if word_idx == 0: summary['first_word_x'] = wordbbox['x1'] summary['word_heights'].append(wordbbox['y2'] - wordbbox['y1']) - summary['word_areas'].append((wordbbox['x2'] - wordbbox['x1']) * (wordbbox['y2'] - wordbbox['y1'])) + summary['word_areas'].append(word_area) for x in range(wordbbox['x1'] - summary['x1'], wordbbox['x2'] - summary['x1']): summary['x_gaps'][x] = 1 @@ -524,9 +533,8 @@ def summarize_document(area_stats): 'word_height_avg': np.nanmean([area['word_height_avg'] for area in area_stats if area['words'] > 0 and area['lines'] > 1]), 'word_height_avg_median': np.nanmedian([area['word_height_avg'] for area in area_stats if area['words'] > 0 and area['lines'] > 1]), 'word_height_avg_std': np.nanstd([area['word_height_avg'] for area in area_stats if area['words'] > 0 and area['lines'] > 1]), - - 'line_height_avg': np.nanmean([a for a in area['line_heights'] for area in area_stats]), - 'line_height_std': np.nanstd([a for a in area['line_heights'] for area in area_stats]), + 'line_height_avg': np.nanmean([height for area in area_stats for height in area["line_heights"]]), + 'line_height_std': np.nanstd([height for area in area_stats for height in area["line_heights"]]), 'max_area': max([ area['area'] for area in area_stats ]), 'max_lines': max([ area['lines'] for area in area_stats ]), 'max_gaps': max([ len(area['gaps']) for area in area_stats ]) From 6be3a1ff139d1ecd2abdf1eb5bd05e25b616f806 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Tue, 13 Nov 2018 16:38:33 -0600 Subject: [PATCH 02/19] visualize areas and classifications --- extract.py | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++- preprocess.sh | 4 +++- 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/extract.py b/extract.py index e0bbe4a..e8b56db 100644 --- a/extract.py +++ b/extract.py @@ -9,6 +9,9 @@ import itertools import glob import logging +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from scipy.misc import imread np.set_printoptions(threshold=np.inf) @@ -422,6 +425,7 @@ def expand_extraction(extract_idx, props): # Entry into table extraction +@profile def extract_tables(document_path): # Connect to Postgres connection = psycopg2.connect( @@ -473,7 +477,7 @@ def extract_tables(document_path): area['classification_p'] = classifications[0][1] - area['type'] = clf.predict([ heuristics.classify_list(area, doc_stats, page['areas']) ]) + area['type'] = clf.predict([ heuristics.classify_list(area, 
doc_stats, page['areas']) ])[0] # Attempt to identify all charts/tables/etc in the paper by looking at the text layer @@ -526,6 +530,54 @@ def extract_tables(document_path): for ttype in figure_idx: print(' ', ttype, figure_idx[ttype]) + colormap = { + 'other' : '#26547C', + 'header / footer': '#EF476F', + 'graphic caption' : '#FFD166', + 'graphic' : '#06D6A0', + 'reference' : '#3E92CC', + 'body' : '#F4FAFF' + } + for page in pages: + fig = plt.figure() + print(document_path + "/png/page_%s.png" % page['page_no']) + img = plt.imread(document_path + "/png/page_%s.png" % page['page_no']) + ax = fig.add_subplot(111, aspect='equal') + for area in page['areas']: + box = { + '_left': int(area['x1']), + '_top': int(area['y1']), + '_right': int(area['x2']), + '_bottom': int(area['y2']), + 'width': int(area['x2']) - int(area['x1']), + 'height': int(area['y2']) - int(area['y1']) + } + ax.add_patch(patches.Rectangle( + (box['_left'], box['_top']), + box['_right'] - box['_left'], + box['_bottom'] - box['_top'], + fill=True, + linewidth=0.5, + facecolor=colormap[area['type']], + label=area['type'], + alpha = 0.2 + ) + ) + plt.ylim(0,pages[0]['page']['y2']) + plt.xlim(0,pages[0]['page']['x2']) + plt.axis("off") + plt.imshow(img, zorder=0) + ax = plt.gca() + ax.invert_yaxis() + patchlist = [ + patches.Patch(color = color, label = label, alpha=0.2) + for label,color in colormap.items() + ] + fig.legend(patchlist, colormap.keys(), loc='lower center', fontsize='x-small', ncol=int(len(colormap)/2), bbox_transform=fig.transFigure) + plt.axis('off') + fig.savefig(document_path + "/annotated/page_%s_with_areatypes.png" % page['page_no'], dpi=400, bbox_inches='tight', pad_inches=0) + plt.close(fig) + for page in pages: page_extracts = process_page(doc_stats, page) diff --git a/preprocess.sh b/preprocess.sh index 6af1905..1ab4b1b 100755 --- a/preprocess.sh +++ b/preprocess.sh @@ -20,7 +20,9 @@ mkdir -p docs/$1/$docname mkdir -p docs/$1/$docname/png mkdir -p docs/$1/$docname/tesseract if [ "$1" == "classified" ] - then mkdir -p docs/$1/$docname/extracts +then + mkdir -p docs/$1/$docname/extracts + mkdir -p docs/$1/$docname/annotated fi gs -dBATCH -dNOPAUSE -sDEVICE=png16m -dGraphicsAlphaBits=4 -dTextAlphaBits=4 -r600 -sOutputFile="./docs/$1/$docname/png/page_%d.png" $2 From 69328a466ad324a3034b9e1ae40f7cdaf4ad1ed9 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Tue, 13 Nov 2018 19:41:47 -0600 Subject: [PATCH 03/19] fix that mem garbage --- extract.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/extract.py b/extract.py index e8b56db..b38ec99 100644 --- a/extract.py +++ b/extract.py @@ -576,7 +576,8 @@ def extract_tables(document_path): fig.legend(patchlist, colormap.keys(), loc='lower center', fontsize='x-small', ncol=int(len(colormap)/2), bbox_transform=fig.transFigure) plt.axis('off') fig.savefig(document_path + "/annotated/page_%s_with_areatypes.png" % page['page_no'], dpi=400, bbox_inches='tight', pad_inches=0) - plt.close(fig) + fig.clf() + plt.close() for page in pages: page_extracts = process_page(doc_stats, page) From e839444a8db1f1bce694d2cdfbc40f765135f6f6 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Thu, 15 Nov 2018 09:26:35 -0600 Subject: [PATCH 04/19] don't require a model to be present already to do training --- annotator/server.py | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/annotator/server.py b/annotator/server.py index 7f47a34..d0db1db 100644 --- a/annotator/server.py +++ b/annotator/server.py @@ -69,9 +69,14 
@@ def random_area(): area = cursor.fetchall()[0] q = list(area[7:]) - estimated_label = clf.predict([q])[0] - - p = zip(clf.classes_, clf.predict_proba([q])[0]) + if clf is not None: + estimated_label = clf.predict([q])[0] + p = list(zip(clf.classes_, clf.predict_proba([q])[0])) + else: + estimated_label = "dummy" + cursor.execute("SELECT name FROM labels") + labels = [i[0] for i in cursor.fetchall()] + p = list(zip(labels, ["-"] * len(labels))) bad = False for each in p: @@ -92,6 +97,7 @@ def random_area(): } for each in p: + # should be label, probability new_area[each[0]] = each[1] new_area['img'] = get_area_image(area[1], area[2], { 'x1': area[3], 'y1': area[4], 'x2': area[5], 'y2': area[6] }) @@ -112,7 +118,7 @@ def random_area(): def get_area_image(doc, page, extract): img_name = random_name() - image = np.array(Image.open('../docs/%s/png/page_%s.png' % (doc, page)), dtype=np.uint8) + image = np.array(Image.open('./docs/training/%s/png/page_%s.png' % (doc, page)), dtype=np.uint8) fig,ax = plt.subplots(1) ax.imshow(image) ax.add_patch(patches.Rectangle( @@ -199,19 +205,21 @@ def learn(): """) data = cursor.fetchall() - -# Omit area_id, doc_id, page_no, and label_name -train = [ list(d[4:]) for d in data ] - -label = np.array([ d[3] for d in data ]) -index = [ d[0:3] for d in data ] - -# gamma - influence of a single training example. low = far, high = close -# C - low = less freedom, high = more freedom -#clf = svm.SVC(gamma=0.001, C=100., probability=True, cache_size=500) -clf = svm.SVC(gamma=1, C=100, probability=True, cache_size=500, kernel='rbf') - -clf.fit(train, label) +if data == []: + print("Unable to initialize model! You'll still be able to train a new one, but you will not see classification probabilities as you do so.") + clf = None +else: + # Omit area_id, doc_id, page_no, and label_name + train = [ list(d[4:]) for d in data ] + + label = np.array([ d[3] for d in data ]) + index = [ d[0:3] for d in data ] + + # gamma - influence of a single training example. low = far, high = close + # C - low = less freedom, high = more freedom + #clf = svm.SVC(gamma=0.001, C=100., probability=True, cache_size=500) + clf = svm.SVC(gamma=1, C=100, probability=True, cache_size=500, kernel='rbf') + clf.fit(train, label) # print clf.classes_ From aaa02be74fa22a720be39106b9b8ba293bf23c4c Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Thu, 15 Nov 2018 09:27:05 -0600 Subject: [PATCH 05/19] rm function profiler --- extract.py | 1 - 1 file changed, 1 deletion(-) diff --git a/extract.py b/extract.py index b38ec99..7a3469d 100644 --- a/extract.py +++ b/extract.py @@ -425,7 +425,6 @@ def expand_extraction(extract_idx, props): # Entry into table extraction -@profile def extract_tables(document_path): # Connect to Postgres connection = psycopg2.connect( From 5dfbbe4c981c2a7c6c55038943ce41721e0d01d3 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Thu, 15 Nov 2018 09:27:38 -0600 Subject: [PATCH 06/19] rm extraneous (?) 
classifier creation --- helpers.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/helpers.py b/helpers.py index f9cc25a..f46d5d3 100644 --- a/helpers.py +++ b/helpers.py @@ -11,10 +11,6 @@ from difflib import SequenceMatcher from bs4 import BeautifulSoup -import classifier - -clf = classifier.create() - def similar(a, b): return SequenceMatcher(None, a, b).ratio() From ad49509f334214b57058de286921d60922c2dc2b Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Thu, 15 Nov 2018 09:28:34 -0600 Subject: [PATCH 07/19] fix path, utf encoding --- summarize.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/summarize.py b/summarize.py index 7a89cc3..2af7087 100644 --- a/summarize.py +++ b/summarize.py @@ -3,6 +3,7 @@ import heuristics import helpers from bs4 import BeautifulSoup +import codecs import psycopg2 from psycopg2.extensions import AsIs @@ -23,12 +24,12 @@ doc_id = sys.argv[1] -page_paths = glob.glob('./docs/' + doc_id + '/tesseract/*.html') +page_paths = glob.glob('./docs/training/' + doc_id + '/tesseract/*.html') pages = [] for page_no, page in enumerate(page_paths): # Read in each tesseract page with BeautifulSoup so we can look at the document holistically - with open(page) as hocr: + with codecs.open(page, "r", "utf-8") as hocr: text = hocr.read() soup = BeautifulSoup(text, 'html.parser') merged_areas = helpers.merge_areas(soup.find_all('div', 'ocr_carea')) From ae0e8bf9eb2752db33b7f3ee5aedc0a461ccaff0 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Thu, 15 Nov 2018 10:24:02 -0600 Subject: [PATCH 08/19] Add rudimentary wrapper + envvar toggle --- Dockerfile | 7 +++---- blackstack_wrapper.sh | 24 ++++++++++++++++++++++++ docker-compose.yml | 1 + 3 files changed, 28 insertions(+), 4 deletions(-) create mode 100755 blackstack_wrapper.sh diff --git a/Dockerfile b/Dockerfile index 10dc9e7..86c1819 100644 --- a/Dockerfile +++ b/Dockerfile @@ -23,13 +23,12 @@ COPY *py $PDF/ COPY annotator $PDF/ COPY config.py.env $PDF/config.py -COPY test/WH897R_29453_000452.pdf $PDF/test/ +COPY 1-s2.0-0031018280900164-main.pdf $PDF/test/ +ARG BLACKSTACK_MODE RUN mkdir out WORKDIR $PDF EXPOSE 5555 -CMD bash -c "sleep 10; $PDF/preprocess.sh training test/WH897R_29453_000452.pdf; python3 $PDF/server.py" -#CMD ["./preprocess.sh", "training", "test/WH897R_29453_000452.pdf"] - +CMD ["./blackstack_wrapper.sh"] diff --git a/blackstack_wrapper.sh b/blackstack_wrapper.sh new file mode 100755 index 0000000..8904870 --- /dev/null +++ b/blackstack_wrapper.sh @@ -0,0 +1,24 @@ +#!/bin/sh + + +if [ -z "${BLACKSTACK_MODE}" ] +then + echo "Please specify BLACKSTACK_MODE as an envvar" + exit 1 +else + BLACKSTACK_MODE=${BLACKSTACK_MODE} + echo Running blackstack in $BLACKSTACK_MODE mode. + if [ "$BLACKSTACK_MODE" = "classified" ] + then + echo classified + ./preprocess.sh classified test/1-s2.0-0031018280900164-main.pdf ; + python3 extract.py ./docs/classified/1-s2*/ + elif [ "$BLACKSTACK_MODE" = "training" ] + then + echo training + ./preprocess.sh training test/1-s2.0-0031018280900164-main.pdf + python3 server.py + else + echo "Unknown blackstack mode specified. Please choose classified or training." 
+ fi +fi diff --git a/docker-compose.yml b/docker-compose.yml index de5a777..3728827 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,6 +6,7 @@ services: environment: - PG_PASSWORD=blackstack - PG_USERNAME=postgres + - BLACKSTACK_MODE=${BLACKSTACK_MODE} ports: - 5555:5555 From 3bb790e57b900d442ef895e2a030ee6bec95a189 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Thu, 15 Nov 2018 11:19:50 -0600 Subject: [PATCH 09/19] run classification/extraction mode by default --- blackstack_wrapper.sh | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/blackstack_wrapper.sh b/blackstack_wrapper.sh index 8904870..5d290b9 100755 --- a/blackstack_wrapper.sh +++ b/blackstack_wrapper.sh @@ -3,22 +3,34 @@ if [ -z "${BLACKSTACK_MODE}" ] then - echo "Please specify BLACKSTACK_MODE as an envvar" - exit 1 + echo "No BLACKSTACK_MODE specified -- assuming classification mode on prebuilt model." + BLACKSTACK_MODE='classified' else BLACKSTACK_MODE=${BLACKSTACK_MODE} - echo Running blackstack in $BLACKSTACK_MODE mode. - if [ "$BLACKSTACK_MODE" = "classified" ] - then - echo classified - ./preprocess.sh classified test/1-s2.0-0031018280900164-main.pdf ; - python3 extract.py ./docs/classified/1-s2*/ - elif [ "$BLACKSTACK_MODE" = "training" ] - then - echo training - ./preprocess.sh training test/1-s2.0-0031018280900164-main.pdf +fi + +echo Running blackstack in $BLACKSTACK_MODE mode. +if [ "$BLACKSTACK_MODE" = "classified" ] +then + for doc in input/*.pdf; + do + filename=$(basename "$doc") + docname="${filename%.*}" + echo ./preprocess.sh classified input/$filename + ./preprocess.sh classified input/$filename + echo python3 extract.py ./docs/classified/$docname/ + python3 extract.py ./docs/classified/$docname/ + done +elif [ "$BLACKSTACK_MODE" = "training" ] +then + for doc in input/*.pdf; + do + filename=$(basename "$doc") + docname="${filename%.*}" + ./preprocess.sh training input/$filename python3 server.py - else - echo "Unknown blackstack mode specified. Please choose classified or training." - fi + done +else + echo "Unknown blackstack mode specified. Please choose classified or training." 
+ exit 1 fi From 89cadd420cd54daeba8bdc37059a7a893efa1d07 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Thu, 15 Nov 2018 11:21:26 -0600 Subject: [PATCH 10/19] Use input/ and docs/ to manage input and output --- docker-compose.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index 3728827..4bcccf7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -9,6 +9,8 @@ services: - BLACKSTACK_MODE=${BLACKSTACK_MODE} ports: - 5555:5555 + volumes: + - ./docs/:/app/pdf/docs/ postgres: image: postgres:10.5-alpine From 830926122ab67647fe0e6d94e675f401fb3a7664 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Fri, 16 Nov 2018 09:38:17 -0600 Subject: [PATCH 11/19] Use input/ --- Dockerfile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 86c1819..9dad3f9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,9 +22,7 @@ COPY *sh $PDF/ COPY *py $PDF/ COPY annotator $PDF/ COPY config.py.env $PDF/config.py - -COPY 1-s2.0-0031018280900164-main.pdf $PDF/test/ -ARG BLACKSTACK_MODE +COPY input/ $PDF/input/ RUN mkdir out WORKDIR $PDF From cd8e029d4d0b7af80c3bf913ec3d5c46b7f66f35 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Fri, 16 Nov 2018 09:46:35 -0600 Subject: [PATCH 12/19] fix - move server start to a sensible place --- blackstack_wrapper.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blackstack_wrapper.sh b/blackstack_wrapper.sh index 5d290b9..43779bb 100755 --- a/blackstack_wrapper.sh +++ b/blackstack_wrapper.sh @@ -28,8 +28,8 @@ then filename=$(basename "$doc") docname="${filename%.*}" ./preprocess.sh training input/$filename - python3 server.py done + python3 server.py else echo "Unknown blackstack mode specified. Please choose classified or training." 
exit 1 From 098a02ed6f8a529c621dae8a9c63dc7b5478d7b7 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Fri, 16 Nov 2018 11:33:02 -0600 Subject: [PATCH 13/19] use defaultdict -- fixes crash that happens if server is restarted before all labels have been seen in training --- annotator/server.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/annotator/server.py b/annotator/server.py index d0db1db..0aadaf6 100644 --- a/annotator/server.py +++ b/annotator/server.py @@ -9,6 +9,7 @@ import matplotlib.pyplot as plt import matplotlib.patches as patches import numpy as np +from collections import defaultdict from sklearn import svm # Import database credentials @@ -86,15 +87,14 @@ def random_area(): if bad: return random_area() - new_area = { - 'area_id': area[0], - 'doc_id': area[1], - 'page_no': area[2], - 'x1': area[3], - 'y1': area[4], - 'x2': area[5], - 'y2': area[6] - } + new_area = defaultdict(float) + new_area['area_id'] = area[0] + new_area['doc_id'] = area[1] + new_area['page_no'] = area[2] + new_area['x1'] = area[3] + new_area['y1'] = area[4] + new_area['x2'] = area[5] + new_area['y2'] = area[6] for each in p: # should be label, probability From ffceeed9b34d271f5458bab274d03e3e8d12953b Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Fri, 16 Nov 2018 11:35:00 -0600 Subject: [PATCH 14/19] make tmp dir for training images if it doesn't exist --- annotator/server.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/annotator/server.py b/annotator/server.py index 0aadaf6..9443df7 100644 --- a/annotator/server.py +++ b/annotator/server.py @@ -224,4 +224,6 @@ def learn(): # print clf.classes_ if __name__ == '__main__': + if not os.path.exists("./tmp"): + os.mkdir("tmp") app.run(host='0.0.0.0', port=5555) From 5dcda56ad872dfb0adfa2fa7e9c1537074dc2047 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Tue, 4 Dec 2018 11:45:00 -0600 Subject: [PATCH 15/19] make things work in python3 (filter is a generator, so this silently mucked up the works) --- helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helpers.py b/helpers.py index f46d5d3..cd2d826 100644 --- a/helpers.py +++ b/helpers.py @@ -438,7 +438,7 @@ def area_summary(area): # Number of words try: - summary['words'] = len(filter(None, summary['soup'].getText().strip().replace('\n', ' ').replace(' ', ' ').split(' '))) + summary['words'] = len(list(filter(None, summary['soup'].getText().strip().replace('\n', ' ').replace(' ', ' ').split(' ')))) except: summary['words'] = 0 From 57ff1e2d68ca010e82a3a9947d524a0a84dacd01 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Tue, 4 Dec 2018 15:00:01 -0600 Subject: [PATCH 16/19] use local dir for persistent pg data --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index 4bcccf7..1271809 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -19,4 +19,4 @@ services: - POSTGRES_DB=blackstack volumes: - ./setup:/docker-entrypoint-initdb.d/ - + - ./postgres-data:/var/lib/postgresql/data From 794c4739408954dc079c5958bb8563e2bd03a58b Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Tue, 4 Dec 2018 15:00:20 -0600 Subject: [PATCH 17/19] readme updates --- README.md | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/README.md b/README.md index 84f48e3..7443987 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,10 @@ A machine learning approach to table and figure extraction. 
Uses SciKit Learn's
 
 Whereas other approaches to table and figure reading depend on content to be well-structured, Blackstack ignores the issue of table and figure _data_ extraction (see [Fonduer](https://github.com/HazyResearch/fonduer)) and instead uses a format-agnostic approach to extracting entities as images that can then be used for future analysis or querying.
 
 ## Installation
+### Recommended
+We recommend using Blackstack via [docker-compose](https://docs.docker.com/compose/install/).
+
+### Local
 Blackstack relies on a few libraries that you probably need to install, including [ghostscript](https://www.ghostscript.com) and [tesseract](https://github.com/tesseract-ocr/tesseract). If you are using MacOS and [Homebrew](https://brew.sh) you can install them as so:
 
@@ -38,6 +42,37 @@ pip install -r requirements.txt
 
 ## Getting started
 
+### Docker
+
+The recommended way of running Blackstack is via the supplied `docker-compose.yml`. It can
+be run in either `training` mode to train a new model or in `classified` mode
+to apply the trained model (or the default included one) to a set of documents. The mode is selected
+with an environment variable when invoking docker-compose:
+
+````
+BLACKSTACK_MODE=classified docker-compose up --no-deps --build --force-recreate
+````
+to apply the existing model.
+
+To train a new model, first move the default example data aside so that it doesn't
+interfere with your training:
+
+````
+mv setup/02_example_data.sql setup/02_example_data.sql_bk
+BLACKSTACK_MODE=training docker-compose up --no-deps --build --force-recreate
+````
+to preprocess your documents and start the annotation server.
+
+On startup, Blackstack will preprocess any documents in the `./input/` directory
+and either serve them up for annotation (in training mode) or apply the model
+(in classified mode).
+
+Preprocessed output will be stored in the `./docs/` directory, and, if running in
+classified mode, extractions will be stored per-document in `./docs/classified/`.
+
+### Standalone
+The tools can also be run individually if the prerequisites are installed locally.
+
 #### Preprocessing
 
 Before a model can be trained, documents to use as training data must be selected. If you are attempting to extract entities from a specific journal or publisher it is recommended that your training data also come from that journal or publisher. If you are trying to create a general classifier you should have a good sample of documents from across disciplines, publishers, etc.
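The training and classification modes described above share one model flow: the annotator (`annotator/server.py`) fits an SVM on labeled area features stored in Postgres, and `extract.py` then asks that classifier for per-class probabilities for each page area and keeps the most likely label. Below is a minimal sketch of that flow, not the project's actual code: the two-feature vectors and label names are made-up placeholders (the real features come from `heuristics.classify_list`), and only the scikit-learn calls (`svm.SVC(probability=True)`, `fit`, `predict_proba`, `classes_`) and the probability-sorting pattern mirror what the patches above do.

```python
import random

from sklearn import svm

# Synthetic stand-ins for per-area layout features (e.g. a line-count proxy and
# a word-height ratio); values are arbitrary and chosen only to form two clusters.
random.seed(0)
train, labels = [], []
for _ in range(20):
    train.append([random.gauss(4.0, 0.3), random.gauss(0.2, 0.05)])
    labels.append("body")
    train.append([random.gauss(1.2, 0.3), random.gauss(0.9, 0.05)])
    labels.append("graphic caption")

# Same estimator settings as the patched annotator/server.py.
clf = svm.SVC(gamma=1, C=100, probability=True, cache_size=500, kernel="rbf")
clf.fit(train, labels)

# extract.py's pattern: score one area, pair each class with its probability,
# sort descending, and keep the best label and its probability.
probabilities = clf.predict_proba([[1.3, 0.85]])
classifications = sorted(zip(clf.classes_, probabilities[0]), key=lambda x: x[1], reverse=True)
best_label, best_p = classifications[0]
print(best_label, best_p)
```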
From c3b885f74045a0e81d45a84646cefa46889377bd Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Wed, 12 Dec 2018 14:58:41 -0600 Subject: [PATCH 18/19] don't create unnecessary dirs; no need to cp orig (everything in input stays untouched --- preprocess.sh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/preprocess.sh b/preprocess.sh index 1ab4b1b..412e0e7 100755 --- a/preprocess.sh +++ b/preprocess.sh @@ -13,10 +13,6 @@ fi filename=$(basename "$2") docname="${filename%.*}" -mkdir -p docs/$docname -mkdir -p docs/$docname/png - -mkdir -p docs/$1/$docname mkdir -p docs/$1/$docname/png mkdir -p docs/$1/$docname/tesseract if [ "$1" == "classified" ] @@ -27,8 +23,6 @@ fi gs -dBATCH -dNOPAUSE -sDEVICE=png16m -dGraphicsAlphaBits=4 -dTextAlphaBits=4 -r600 -sOutputFile="./docs/$1/$docname/png/page_%d.png" $2 -cp $2 ./docs/$docname/orig.pdf - ls ./docs/$1/$docname/png | grep -o '[0-9]\+' | parallel -j 4 "./process.sh $1 $docname {}" if [ "$1" == "training" ] From 0d92fb2f3a9a2008db0a7a17d4c9cf7c47b7b0a4 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Wed, 12 Dec 2018 14:59:06 -0600 Subject: [PATCH 19/19] Add tesseract bbox visualization step --- annotate.py | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++++ process.sh | 1 + 2 files changed, 114 insertions(+) create mode 100644 annotate.py diff --git a/annotate.py b/annotate.py new file mode 100644 index 0000000..3e19f1d --- /dev/null +++ b/annotate.py @@ -0,0 +1,113 @@ +import sys +from bs4 import BeautifulSoup +from PIL import Image +import matplotlib as mpl +from matplotlib import pyplot +import codecs +mpl.use('TkAgg') + +def makeBox(bbox): + return { + '_left': int(bbox[0]), + '_top': int(bbox[1]), + '_right': int(bbox[2]), + '_bottom': int(bbox[3]), + 'width': int(bbox[2]) - int(bbox[0]), + 'height': int(bbox[3]) - int(bbox[1]) + } + +def getbbox(title): + title_parts = title.split(';') + for part in title_parts: + if part.strip()[0:4] == 'bbox': + return part.replace('bbox', '').strip().split() + + return + +def tess(infile, outfile): + with codecs.open(infile, "r", "utf-8") as hocr: + text = hocr.read() + + soup = BeautifulSoup(text, "html.parser") + pages = soup.find_all('div', 'ocr_page') + careas = soup.find_all('div', 'ocr_carea') + pars = soup.find_all('p', 'ocr_par') + lines = soup.find_all('span', 'ocr_line') + words = soup.find_all('span', 'ocrx_word') + + page_boxes = [makeBox(getbbox(page.get('title'))) for page in pages] + carea_boxes = [makeBox(getbbox(carea.get('title'))) for carea in careas] + par_boxes = [makeBox(getbbox(par.get('title'))) for par in pars] + line_boxes = [makeBox(getbbox(line.get('title'))) for line in lines] + word_boxes = [makeBox(getbbox(word.get('title'))) for word in words] + + fig = pyplot.figure() + ax = fig.add_subplot(111, aspect='equal') + + for box in page_boxes: + ax.add_patch(mpl.patches.Rectangle( + (box['_left'], box['_top']), + box['_right'] - box['_left'], + box['_bottom'] - box['_top'], + fill=False, + linewidth=0.5, + edgecolor="#FF00FF" + ) + ) + + for box in carea_boxes: + ax.add_patch(mpl.patches.Rectangle( + (box['_left'], box['_top']), + box['_right'] - box['_left'], + box['_bottom'] - box['_top'], + fill=False, + linewidth=0.5, + edgecolor="#0000FF" + ) + ) + + for box in par_boxes: + ax.add_patch(mpl.patches.Rectangle( + (box['_left'], box['_top']), + box['_right'] - box['_left'], + box['_bottom'] - box['_top'], + fill=False, + linewidth=0.1, + edgecolor="#F0F0F0" + ) + ) + + for box in line_boxes: + ax.add_patch(mpl.patches.Rectangle( + (box['_left'], box['_top']), + 
box['_right'] - box['_left'], + box['_bottom'] - box['_top'], + fill=False, + linewidth=0.1, + edgecolor="#FF0000" + ) + ) + for box in word_boxes: + ax.add_patch(mpl.patches.Rectangle( + (box['_left'], box['_top']), + box['_right'] - box['_left'], + box['_bottom'] - box['_top'], + fill=False, + linewidth=0.1, + edgecolor="#000000" + ) + ) + + pyplot.ylim(0,page_boxes[0]['_bottom']) + pyplot.xlim(0,page_boxes[0]['_right']) + pyplot.axis("off") + ax = pyplot.gca() + ax.invert_yaxis() + pyplot.axis('off') + fig.savefig(outfile, dpi=400, bbox_inches='tight', pad_inches=0) + + +if len(sys.argv) == 3: + tess(sys.argv[1], sys.argv[2]) +else: + print('Script requires two parameters: an input Tesseract HOCR file and an output file name and location') diff --git a/process.sh b/process.sh index bee3a86..e8a355e 100755 --- a/process.sh +++ b/process.sh @@ -2,3 +2,4 @@ tesseract ./docs/$1/$2/png/page_$3.png ./docs/$1/$2/tesseract/page_$3.html hocr mv ./docs/$1/$2/tesseract/page_$3.html.hocr ./docs/$1/$2/tesseract/page_$3.html +python3 annotate.py ./docs/$1/$2/tesseract/page_$3.html ./docs/$1/$2/tesseract/page_$3.png
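Both `helpers.extractbbox` and the new `annotate.py` rely on the hOCR convention that every element's `title` attribute carries a `bbox x1 y1 x2 y2` field among `;`-separated properties. The sketch below illustrates that convention in isolation; the inline hOCR fragment and its coordinates are invented for the example, and only the `title` parsing and the BeautifulSoup calls (`find_all('span', 'ocrx_word')`, `.get('title')`) follow what the patched code does.

```python
from bs4 import BeautifulSoup

# A made-up hOCR fragment in the shape tesseract emits; coordinates are arbitrary.
hocr = """
<div class='ocr_page' title='image "page_1.png"; bbox 0 0 4958 7017; ppageno 0'>
  <span class='ocr_line' title='bbox 620 710 980 760; baseline 0 -10'>
    <span class='ocrx_word' title='bbox 620 710 840 760; x_wconf 96'>Table</span>
    <span class='ocrx_word' title='bbox 880 710 980 760; x_wconf 93'>1.</span>
  </span>
</div>
"""

def getbbox(title):
    # Same idea as annotate.py's getbbox(): the title holds ';'-separated
    # properties, one of which is 'bbox x1 y1 x2 y2'.
    for part in title.split(';'):
        if part.strip().startswith('bbox'):
            x1, y1, x2, y2 = (int(v) for v in part.replace('bbox', '').split())
            return {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}

soup = BeautifulSoup(hocr, 'html.parser')
for word in soup.find_all('span', 'ocrx_word'):
    print(word.getText().strip(), getbbox(word.get('title')))
```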