UW-xDD · iross · Dec 12, 2018 · Nov 13, 2018 · Nov 13, 2018 · Nov 14, 2018
diff --git a/Dockerfile b/Dockerfile
@@ -22,14 +22,11 @@ COPY *sh $PDF/
 COPY *py $PDF/
 COPY annotator $PDF/
 COPY config.py.env $PDF/config.py
-
-COPY test/WH897R_29453_000452.pdf $PDF/test/
+COPY input/ $PDF/input/
 
 RUN mkdir out
 WORKDIR $PDF
 
 EXPOSE 5555
 
-CMD bash -c "sleep 10; $PDF/preprocess.sh training test/WH897R_29453_000452.pdf; python3 $PDF/server.py"
-#CMD ["./preprocess.sh", "training", "test/WH897R_29453_000452.pdf"]
-
+CMD ["./blackstack_wrapper.sh"]
diff --git a/README.md b/README.md
@@ -5,6 +5,10 @@ A machine learning approach to table and figure extraction. Uses SciKit Learn's
 Whereas other approaches to table and figure reading depend on content to be well-structured, Blackstack ignores the issue of table and figure _data_ extraction (see [Fonduer](https://github.com/HazyResearch/fonduer)) and instead uses a format-agnostic approach to extracting entities as images that can then be used for future analysis or querying.
 
 ## Installation
+### Recommended
+We recommend running Blackstack via [docker-compose](https://docs.docker.com/compose/install/).
+
+### Local
 
 Blackstack relies on a few libraries that you probably need to install, including [ghostscript](https://www.ghostscript.com) and [tesseract](https://github.com/tesseract-ocr/tesseract). If you are using MacOS and [Homebrew](https://brew.sh) you can install them as so:
 
@@ -38,6 +42,37 @@ pip install -r requirements.txt
 
 ## Getting started
 
+### Docker
+
+The recommended way of running Blackstack is via the supplied `docker-compose.yml`. It can
+be run in either `training` mode to train a new model or in `classified` mode
+to apply the trained model (or the default included one) to a set of documents. The mode
+is selected with the `BLACKSTACK_MODE` environment variable when invoking docker-compose:
+
+````
+BLACKSTACK_MODE=classified docker-compose up --no-deps --build --force-recreate
+````
+to apply a model.
+
+To train a new model, first move the default example data aside so that it doesn't
+interfere with your training.
+
+````
+mv setup/02_example_data.sql setup/02_example_data.sql_bk
+BLACKSTACK_MODE=training docker-compose up --no-deps --build --force-recreate
+````
+to train one. 
+
+On startup, Blackstack will preprocess any documents in the `./input/` directory
+and either serve them up for annotation (in training mode) or apply the model
+(in classified mode).
+
+Preprocessed output will be stored in the `./docs/` directory and, if running in
+classified mode, extractions will be stored per-document in `./docs/classified/`.
+
+### Standalone
+The tools can also be run individually if the prerequisites are installed locally.
+
 #### Preprocessing
 Before a model can be trained, documents to use as training data must be selected. If you are attempting to extract entities from a specific journal or publisher it is recommended that your training data also come from that journal or publisher. If you are trying to create a general classifier you should have a good sample of documents from across disciplines, publishers, etc.
 

diff --git a/annotate.py b/annotate.py
@@ -0,0 +1,113 @@
+import sys
+from bs4 import BeautifulSoup
+from PIL import Image
+import matplotlib as mpl
+from matplotlib import pyplot
+import codecs
+mpl.use('TkAgg')
+
+def makeBox(bbox):
+    return {
+        '_left': int(bbox[0]),
+        '_top': int(bbox[1]),
+        '_right': int(bbox[2]),
+        '_bottom': int(bbox[3]),
+        'width': int(bbox[2]) - int(bbox[0]),
+        'height': int(bbox[3]) - int(bbox[1])
+    }
+
+def getbbox(title):
+    title_parts = title.split(';')
+    for part in title_parts:
+        if part.strip()[0:4] == 'bbox':
+            return part.replace('bbox', '').strip().split()
+
+    return
+
+def tess(infile, outfile):
+    with codecs.open(infile, "r", "utf-8") as hocr:
+        text = hocr.read()
+
+    soup = BeautifulSoup(text, "html.parser")
+    pages = soup.find_all('div', 'ocr_page')
+    careas = soup.find_all('div', 'ocr_carea')
+    pars = soup.find_all('p', 'ocr_par')
+    lines = soup.find_all('span', 'ocr_line')
+    words = soup.find_all('span', 'ocrx_word')
+
+    page_boxes = [makeBox(getbbox(page.get('title'))) for page in pages]
+    carea_boxes = [makeBox(getbbox(carea.get('title'))) for carea in careas]
+    par_boxes = [makeBox(getbbox(par.get('title'))) for par in pars]
+    line_boxes = [makeBox(getbbox(line.get('title'))) for line in lines]
+    word_boxes = [makeBox(getbbox(word.get('title'))) for word in words]
+
+    fig = pyplot.figure()
+    ax = fig.add_subplot(111, aspect='equal')
+
+    for box in page_boxes:
+        ax.add_patch(mpl.patches.Rectangle(
+            (box['_left'], box['_top']),
+            box['_right'] - box['_left'],
+            box['_bottom'] - box['_top'],
+            fill=False,
+            linewidth=0.5,
+            edgecolor="#FF00FF"
+            )
+            )
+
+    for box in carea_boxes:
+        ax.add_patch(mpl.patches.Rectangle(
+            (box['_left'], box['_top']),
+            box['_right'] - box['_left'],
+            box['_bottom'] - box['_top'],
+            fill=False,
+            linewidth=0.5,
+            edgecolor="#0000FF"
+            )
+            )
+
+    for box in par_boxes:
+        ax.add_patch(mpl.patches.Rectangle(
+            (box['_left'], box['_top']),
+            box['_right'] - box['_left'],
+            box['_bottom'] - box['_top'],
+            fill=False,
+            linewidth=0.1,
+            edgecolor="#F0F0F0"
+            )
+            )
+
+    for box in line_boxes:
+        ax.add_patch(mpl.patches.Rectangle(
+            (box['_left'], box['_top']),
+            box['_right'] - box['_left'],
+            box['_bottom'] - box['_top'],
+            fill=False,
+            linewidth=0.1,
+            edgecolor="#FF0000"
+            )
+            )
+    for box in word_boxes:
+        ax.add_patch(mpl.patches.Rectangle(
+            (box['_left'], box['_top']),
+            box['_right'] - box['_left'],
+            box['_bottom'] - box['_top'],
+            fill=False,
+            linewidth=0.1,
+            edgecolor="#000000"
+            )
+            )
+
+    pyplot.ylim(0,page_boxes[0]['_bottom'])
+    pyplot.xlim(0,page_boxes[0]['_right'])
+    pyplot.axis("off")
+    ax = pyplot.gca()
+    ax.invert_yaxis()
+    pyplot.axis('off')
+    fig.savefig(outfile, dpi=400, bbox_inches='tight', pad_inches=0)
+
+
+if len(sys.argv) == 3:
+    tess(sys.argv[1], sys.argv[2])
+else:
+    print('Script requires two parameters: an input Tesseract HOCR file and an output file name and location')
diff --git a/annotator/server.py b/annotator/server.py
@@ -9,6 +9,7 @@
 import matplotlib.pyplot as plt
 import matplotlib.patches as patches
 import numpy as np
+from collections import defaultdict
 from sklearn import svm
 
 # Import database credentials
@@ -69,9 +70,14 @@ def random_area():
     area = cursor.fetchall()[0]
     q = list(area[7:])
 
-    estimated_label = clf.predict([q])[0]
-
-    p = zip(clf.classes_, clf.predict_proba([q])[0])
+    if clf is not None:
+        estimated_label = clf.predict([q])[0]
+        p = list(zip(clf.classes_, clf.predict_proba([q])[0]))
+    else:
+        estimated_label = "dummy"
+        cursor.execute("SELECT name FROM labels")
+        labels = [i[0] for i in cursor.fetchall()]
+        p = list(zip(labels, ["-"] * len(labels)))
 
     bad = False
     for each in p:
@@ -81,17 +87,17 @@ def random_area():
     if bad:
         return random_area()
 
-    new_area = {
-        'area_id': area[0],
-        'doc_id': area[1],
-        'page_no': area[2],
-        'x1': area[3],
-        'y1': area[4],
-        'x2': area[5],
-        'y2': area[6]
-    }
+    new_area = defaultdict(float)
+    new_area['area_id'] = area[0]
+    new_area['doc_id'] = area[1]
+    new_area['page_no'] = area[2]
+    new_area['x1'] = area[3]
+    new_area['y1'] = area[4]
+    new_area['x2'] = area[5]
+    new_area['y2'] = area[6]
 
     for each in p:
+        # should be label, probability
         new_area[each[0]] = each[1]
 
     new_area['img'] = get_area_image(area[1], area[2], { 'x1': area[3], 'y1': area[4], 'x2': area[5], 'y2': area[6] })
@@ -112,7 +118,7 @@ def random_area():
 
 def get_area_image(doc, page, extract):
     img_name = random_name()
-    image = np.array(Image.open('../docs/%s/png/page_%s.png' % (doc, page)), dtype=np.uint8)
+    image = np.array(Image.open('./docs/training/%s/png/page_%s.png' % (doc, page)), dtype=np.uint8)
     fig,ax = plt.subplots(1)
     ax.imshow(image)
     ax.add_patch(patches.Rectangle(
@@ -199,21 +205,25 @@ def learn():
 
 """)
 data = cursor.fetchall()
-
-# Omit area_id, doc_id, page_no, and label_name
-train = [ list(d[4:]) for d in data ]
-
-label = np.array([ d[3] for d in data ])
-index = [ d[0:3] for d in data ]
-
-# gamma - influence of a single training example. low = far, high = close
-# C - low = less freedom, high = more freedom
-#clf = svm.SVC(gamma=0.001, C=100., probability=True, cache_size=500)
-clf = svm.SVC(gamma=1, C=100, probability=True, cache_size=500, kernel='rbf')
-
-clf.fit(train, label)
+if data == []:
+    print("Unable to initialize model! You'll still be able to train a new one, but you will not see classification probabilities as you do so.")
+    clf = None
+else:
+    # Omit area_id, doc_id, page_no, and label_name
+    train = [ list(d[4:]) for d in data ]
+
+    label = np.array([ d[3] for d in data ])
+    index = [ d[0:3] for d in data ]
+
+    # gamma - influence of a single training example. low = far, high = close
+    # C - low = less freedom, high = more freedom
+    #clf = svm.SVC(gamma=0.001, C=100., probability=True, cache_size=500)
+    clf = svm.SVC(gamma=1, C=100, probability=True, cache_size=500, kernel='rbf')
+    clf.fit(train, label)
 
 # print clf.classes_
 
 if __name__ == '__main__':
+    if not os.path.exists("./tmp"):
+        os.mkdir("tmp")
     app.run(host='0.0.0.0', port=5555)
diff --git a/blackstack_wrapper.sh b/blackstack_wrapper.sh
@@ -0,0 +1,36 @@
+#!/bin/sh
+
+
+if [ -z "${BLACKSTACK_MODE}" ]
+then
+    echo "No BLACKSTACK_MODE specified -- assuming classification mode on prebuilt model."
+    BLACKSTACK_MODE='classified'
+else
+    BLACKSTACK_MODE=${BLACKSTACK_MODE}
+fi
+
+echo Running blackstack in $BLACKSTACK_MODE mode.
+if [ "$BLACKSTACK_MODE" = "classified" ] 
+then
+    for doc in input/*.pdf;
+    do
+        filename=$(basename "$doc")
+        docname="${filename%.*}"
+        echo ./preprocess.sh classified input/$filename
+        ./preprocess.sh classified input/$filename
+        echo python3 extract.py ./docs/classified/$docname/
+        python3 extract.py ./docs/classified/$docname/
+    done
+elif [ "$BLACKSTACK_MODE" = "training" ] 
+then
+    for doc in input/*.pdf;
+    do
+        filename=$(basename "$doc")
+        docname="${filename%.*}"
+        ./preprocess.sh training input/$filename
+    done
+    python3 server.py
+else
+    echo "Unknown blackstack mode specified. Please choose classified or training."
+    exit 1
+fi
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -6,8 +6,11 @@ services:
     environment:
       - PG_PASSWORD=blackstack
       - PG_USERNAME=postgres
+      - BLACKSTACK_MODE=${BLACKSTACK_MODE}
     ports:
       - 5555:5555
+    volumes:
+      - ./docs/:/app/pdf/docs/
 
   postgres:
     image: postgres:10.5-alpine
@@ -16,4 +19,4 @@ services:
       - POSTGRES_DB=blackstack
     volumes:
       - ./setup:/docker-entrypoint-initdb.d/
-
+      - ./postgres-data:/var/lib/postgresql/data