From 0a66b9be3efdd1a0043ff172f2ed62d6fd30c51e Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Tue, 13 Nov 2018 15:13:33 -0600 Subject: [PATCH 01/19] Bugfixes; few changes for tesseract4 quirks; extract should work in py3 now --- extract.py | 22 +++++++++++----------- helpers.py | 16 ++++++++++++---- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/extract.py b/extract.py index 17a5012..e0bbe4a 100644 --- a/extract.py +++ b/extract.py @@ -3,6 +3,7 @@ from bs4 import BeautifulSoup import psycopg2 import math +import codecs import re import numpy as np import itertools @@ -82,11 +83,11 @@ def find_above_and_below(extract): def expand_extraction(extract_idx, props): # Iterate on above and below areas for each extract - for direction, areas in extract_relations[extract_idx].iteritems(): + for direction, areas in extract_relations[extract_idx].items(): stopped = False for area_idx in extract_relations[extract_idx][direction]: # Iterate on all other extracts, making sure that extending the current one won't run into any of the others - for extract_idx2, props2 in extract_relations.iteritems(): + for extract_idx2, props2 in extract_relations.items(): if extract_idx != extract_idx2: will_intersect = helpers.rectangles_intersect(extracts[extract_idx2], helpers.enlarge_extract(extracts[extract_idx], page['areas'][area_idx])) if will_intersect: @@ -271,7 +272,7 @@ def expand_extraction(extract_idx, props): # Sanity check the caption-area assignments - for caption, areas in caption_areas.iteritems(): + for caption, areas in caption_areas.items(): # Only check if the caption is assigned to more than one area if len(areas) > 1: # draw a line through the middle of the caption that spans the page @@ -335,7 +336,7 @@ def expand_extraction(extract_idx, props): # Extracts are bounding boxes that will be used to actually extract the tables extracts = [] - for caption, areas in caption_areas.iteritems(): + for caption, areas in caption_areas.items(): print(indicator_lines[caption]) area_of_interest_centroid_y_mean = np.mean([ helpers.centroid(page['areas'][area])['y'] for area in areas ]) indicator_line_centroid_y = helpers.centroid(indicator_lines[caption])['y'] @@ -362,7 +363,7 @@ def expand_extraction(extract_idx, props): # Make sure each table was assigned a caption assigned_tables = [] unassigned_tables = [] - for caption_idx, areas in caption_areas.iteritems(): + for caption_idx, areas in caption_areas.items(): assigned_tables = assigned_tables + areas all_tables = [] @@ -400,7 +401,7 @@ def expand_extraction(extract_idx, props): for extract_idx, extract in enumerate(extracts): expand_extraction(extract_idx, find_above_and_below(extract)) - # for extract_idx, props in extract_relations.iteritems(): + # for extract_idx, props in extract_relations.items(): # expand_extraction(extract_idx, props) for extract in orphan_extracts: @@ -439,7 +440,7 @@ def extract_tables(document_path): # Read in each tesseract page with BeautifulSoup so we can look at the document holistically for page_no, page in enumerate(page_paths): - with open(page) as hocr: + with codecs.open(page, "r", "utf-8") as hocr: text = hocr.read() soup = BeautifulSoup(text, 'html.parser') merged_areas = helpers.merge_areas(soup.find_all('div', 'ocr_carea')) @@ -467,11 +468,10 @@ def extract_tables(document_path): # Use the model to assign an area type and probabilty of that area type probabilities = clf.predict_proba([ heuristics.classify_list(area, doc_stats, page['areas']) ]) # Apply a label to each probability - classifications = 
zip(clf.classes_, probabilities) - # Sort by highest probability - classifications.sort(key=lambda x: x[1], reverse=True) + classifications = zip(clf.classes_, probabilities[0]) + classifications = sorted(classifications, key = lambda x: x[1], reverse=True) - area['classification_p'] = classifications[0][0] + area['classification_p'] = classifications[0][1] area['type'] = clf.predict([ heuristics.classify_list(area, doc_stats, page['areas']) ]) diff --git a/helpers.py b/helpers.py index 08d13ab..f9cc25a 100644 --- a/helpers.py +++ b/helpers.py @@ -473,12 +473,21 @@ def area_summary(area): for word_idx, word in enumerate(words): wordbbox = extractbbox(word.get('title')) + word_area = (wordbbox['x2'] - wordbbox['x1']) * (wordbbox['y2'] - wordbbox['y1']) + if word_area > summary['area'] or \ + wordbbox['x2'] > summary['x2'] or \ + wordbbox['x1'] < summary['x1'] or \ + wordbbox['y1'] < summary['y1'] or \ + wordbbox['y2'] > summary['y2']: + print("Word outside of the enclosing area! Tesseract's black box strikes again!") + continue + # Record the x coordinate of the first word of each line if word_idx == 0: summary['first_word_x'] = wordbbox['x1'] summary['word_heights'].append(wordbbox['y2'] - wordbbox['y1']) - summary['word_areas'].append((wordbbox['x2'] - wordbbox['x1']) * (wordbbox['y2'] - wordbbox['y1'])) + summary['word_areas'].append(word_area) for x in range(wordbbox['x1'] - summary['x1'], wordbbox['x2'] - summary['x1']): summary['x_gaps'][x] = 1 @@ -524,9 +533,8 @@ def summarize_document(area_stats): 'word_height_avg': np.nanmean([area['word_height_avg'] for area in area_stats if area['words'] > 0 and area['lines'] > 1]), 'word_height_avg_median': np.nanmedian([area['word_height_avg'] for area in area_stats if area['words'] > 0 and area['lines'] > 1]), 'word_height_avg_std': np.nanstd([area['word_height_avg'] for area in area_stats if area['words'] > 0 and area['lines'] > 1]), - - 'line_height_avg': np.nanmean([a for a in area['line_heights'] for area in area_stats]), - 'line_height_std': np.nanstd([a for a in area['line_heights'] for area in area_stats]), + 'line_height_avg': np.nanmean([height for area in area_stats for height in area["line_heights"]]), + 'line_height_std': np.nanstd([height for area in area_stats for height in area["line_heights"]]), 'max_area': max([ area['area'] for area in area_stats ]), 'max_lines': max([ area['lines'] for area in area_stats ]), 'max_gaps': max([ len(area['gaps']) for area in area_stats ]) From 6be3a1ff139d1ecd2abdf1eb5bd05e25b616f806 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Tue, 13 Nov 2018 16:38:33 -0600 Subject: [PATCH 02/19] visualize areas and classifications --- extract.py | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++- preprocess.sh | 4 +++- 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/extract.py b/extract.py index e0bbe4a..e8b56db 100644 --- a/extract.py +++ b/extract.py @@ -9,6 +9,9 @@ import itertools import glob import logging +import matplotlib.pyplot as plt +import matplotlib.patches as patches +from scipy.misc import imread np.set_printoptions(threshold=np.inf) @@ -422,6 +425,7 @@ def expand_extraction(extract_idx, props): # Entry into table extraction +@profile def extract_tables(document_path): # Connect to Postgres connection = psycopg2.connect( @@ -473,7 +477,7 @@ def extract_tables(document_path): area['classification_p'] = classifications[0][1] - area['type'] = clf.predict([ heuristics.classify_list(area, doc_stats, page['areas']) ]) + area['type'] = clf.predict([ heuristics.classify_list(area, 
doc_stats, page['areas']) ])[0] # Attempt to identify all charts/tables/etc in the paper by looking at the text layer @@ -526,6 +530,54 @@ def extract_tables(document_path): for ttype in figure_idx: print(' ', ttype, figure_idx[ttype]) + colormap = { + 'other' : '#26547C', + 'header / footer': '#EF476F', + 'graphic caption' : '#FFD166', + 'graphic' : '#06D6A0', + 'reference' : '#3E92CC', + 'body' : '#F4FAFF' + } + for page in pages: + fig = plt.figure() + print(document_path + "/png/page_%s.png" % page['page_no']) + img = plt.imread(document_path + "/png/page_%s.png" % page['page_no']) + ax = fig.add_subplot(111, aspect='equal') + for area in page['areas']: + box = { + '_left': int(area['x1']), + '_top': int(area['y1']), + '_right': int(area['x2']), + '_bottom': int(area['y2']), + 'width': int(area['x2']) - int(area['x1']), + 'height': int(area['y2']) - int(area['y1']) + } + ax.add_patch(patches.Rectangle( + (box['_left'], box['_top']), + box['_right'] - box['_left'], + box['_bottom'] - box['_top'], + fill=True, + linewidth=0.5, + facecolor=colormap[area['type']], + label=area['type'], + alpha = 0.2 + ) + ) + plt.ylim(0,pages[0]['page']['y2']) + plt.xlim(0,pages[0]['page']['x2']) + plt.axis("off") + plt.imshow(img, zorder=0) + ax = plt.gca() + ax.invert_yaxis() + patchlist = [ + patches.Patch(color = color, label = label, alpha=0.2) + for label,color in colormap.items() + ] + fig.legend(patchlist, colormap.keys(), loc='lower center', fontsize='x-small', ncol=int(len(colormap)/2), bbox_transform=fig.transFigure) + plt.axis('off') + fig.savefig(document_path + "/annotated/page_%s_with_areatypes.png" % page['page_no'], dpi=400, bbox_inches='tight', pad_inches=0) + plt.close(fig) + for page in pages: page_extracts = process_page(doc_stats, page) diff --git a/preprocess.sh b/preprocess.sh index 6af1905..1ab4b1b 100755 --- a/preprocess.sh +++ b/preprocess.sh @@ -20,7 +20,9 @@ mkdir -p docs/$1/$docname mkdir -p docs/$1/$docname/png mkdir -p docs/$1/$docname/tesseract if [ "$1" == "classified" ] - then mkdir -p docs/$1/$docname/extracts +then + mkdir -p docs/$1/$docname/extracts + mkdir -p docs/$1/$docname/annotated fi gs -dBATCH -dNOPAUSE -sDEVICE=png16m -dGraphicsAlphaBits=4 -dTextAlphaBits=4 -r600 -sOutputFile="./docs/$1/$docname/png/page_%d.png" $2 From 69328a466ad324a3034b9e1ae40f7cdaf4ad1ed9 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Tue, 13 Nov 2018 19:41:47 -0600 Subject: [PATCH 03/19] fix that mem garbage --- extract.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/extract.py b/extract.py index e8b56db..b38ec99 100644 --- a/extract.py +++ b/extract.py @@ -576,7 +576,8 @@ def extract_tables(document_path): fig.legend(patchlist, colormap.keys(), loc='lower center', fontsize='x-small', ncol=int(len(colormap)/2), bbox_transform=fig.transFigure) plt.axis('off') fig.savefig(document_path + "/annotated/page_%s_with_areatypes.png" % page['page_no'], dpi=400, bbox_inches='tight', pad_inches=0) - plt.close(fig) + fig.clf() + plt.close() for page in pages: page_extracts = process_page(doc_stats, page) From e839444a8db1f1bce694d2cdfbc40f765135f6f6 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Thu, 15 Nov 2018 09:26:35 -0600 Subject: [PATCH 04/19] don't require a model to be present already to do training --- annotator/server.py | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/annotator/server.py b/annotator/server.py index 7f47a34..d0db1db 100644 --- a/annotator/server.py +++ b/annotator/server.py @@ -69,9 +69,14 
@@ def random_area(): area = cursor.fetchall()[0] q = list(area[7:]) - estimated_label = clf.predict([q])[0] - - p = zip(clf.classes_, clf.predict_proba([q])[0]) + if clf is not None: + estimated_label = clf.predict([q])[0] + p = list(zip(clf.classes_, clf.predict_proba([q])[0])) + else: + estimated_label = "dummy" + cursor.execute("SELECT name FROM labels") + labels = [i[0] for i in cursor.fetchall()] + p = list(zip(labels, ["-"] * len(labels))) bad = False for each in p: @@ -92,6 +97,7 @@ def random_area(): } for each in p: + # should be label, probability new_area[each[0]] = each[1] new_area['img'] = get_area_image(area[1], area[2], { 'x1': area[3], 'y1': area[4], 'x2': area[5], 'y2': area[6] }) @@ -112,7 +118,7 @@ def random_area(): def get_area_image(doc, page, extract): img_name = random_name() - image = np.array(Image.open('../docs/%s/png/page_%s.png' % (doc, page)), dtype=np.uint8) + image = np.array(Image.open('./docs/training/%s/png/page_%s.png' % (doc, page)), dtype=np.uint8) fig,ax = plt.subplots(1) ax.imshow(image) ax.add_patch(patches.Rectangle( @@ -199,19 +205,21 @@ def learn(): """) data = cursor.fetchall() - -# Omit area_id, doc_id, page_no, and label_name -train = [ list(d[4:]) for d in data ] - -label = np.array([ d[3] for d in data ]) -index = [ d[0:3] for d in data ] - -# gamma - influence of a single training example. low = far, high = close -# C - low = less freedom, high = more freedom -#clf = svm.SVC(gamma=0.001, C=100., probability=True, cache_size=500) -clf = svm.SVC(gamma=1, C=100, probability=True, cache_size=500, kernel='rbf') - -clf.fit(train, label) +if data == []: + print("Unable to initialize model! You'll still be able to train a new one, but you will not see classification probabilities as you do so.") + clf = None +else: + # Omit area_id, doc_id, page_no, and label_name + train = [ list(d[4:]) for d in data ] + + label = np.array([ d[3] for d in data ]) + index = [ d[0:3] for d in data ] + + # gamma - influence of a single training example. low = far, high = close + # C - low = less freedom, high = more freedom + #clf = svm.SVC(gamma=0.001, C=100., probability=True, cache_size=500) + clf = svm.SVC(gamma=1, C=100, probability=True, cache_size=500, kernel='rbf') + clf.fit(train, label) # print clf.classes_ From aaa02be74fa22a720be39106b9b8ba293bf23c4c Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Thu, 15 Nov 2018 09:27:05 -0600 Subject: [PATCH 05/19] rm function profiler --- extract.py | 1 - 1 file changed, 1 deletion(-) diff --git a/extract.py b/extract.py index b38ec99..7a3469d 100644 --- a/extract.py +++ b/extract.py @@ -425,7 +425,6 @@ def expand_extraction(extract_idx, props): # Entry into table extraction -@profile def extract_tables(document_path): # Connect to Postgres connection = psycopg2.connect( From 5dfbbe4c981c2a7c6c55038943ce41721e0d01d3 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Thu, 15 Nov 2018 09:27:38 -0600 Subject: [PATCH 06/19] rm extraneous (?) 
classifier creation --- helpers.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/helpers.py b/helpers.py index f9cc25a..f46d5d3 100644 --- a/helpers.py +++ b/helpers.py @@ -11,10 +11,6 @@ from difflib import SequenceMatcher from bs4 import BeautifulSoup -import classifier - -clf = classifier.create() - def similar(a, b): return SequenceMatcher(None, a, b).ratio() From ad49509f334214b57058de286921d60922c2dc2b Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Thu, 15 Nov 2018 09:28:34 -0600 Subject: [PATCH 07/19] fix path, utf encoding --- summarize.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/summarize.py b/summarize.py index 7a89cc3..2af7087 100644 --- a/summarize.py +++ b/summarize.py @@ -3,6 +3,7 @@ import heuristics import helpers from bs4 import BeautifulSoup +import codecs import psycopg2 from psycopg2.extensions import AsIs @@ -23,12 +24,12 @@ doc_id = sys.argv[1] -page_paths = glob.glob('./docs/' + doc_id + '/tesseract/*.html') +page_paths = glob.glob('./docs/training/' + doc_id + '/tesseract/*.html') pages = [] for page_no, page in enumerate(page_paths): # Read in each tesseract page with BeautifulSoup so we can look at the document holistically - with open(page) as hocr: + with codecs.open(page, "r", "utf-8") as hocr: text = hocr.read() soup = BeautifulSoup(text, 'html.parser') merged_areas = helpers.merge_areas(soup.find_all('div', 'ocr_carea')) From ae0e8bf9eb2752db33b7f3ee5aedc0a461ccaff0 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Thu, 15 Nov 2018 10:24:02 -0600 Subject: [PATCH 08/19] Add rudimentary wrapper + envvar toggle --- Dockerfile | 7 +++---- blackstack_wrapper.sh | 24 ++++++++++++++++++++++++ docker-compose.yml | 1 + 3 files changed, 28 insertions(+), 4 deletions(-) create mode 100755 blackstack_wrapper.sh diff --git a/Dockerfile b/Dockerfile index 10dc9e7..86c1819 100644 --- a/Dockerfile +++ b/Dockerfile @@ -23,13 +23,12 @@ COPY *py $PDF/ COPY annotator $PDF/ COPY config.py.env $PDF/config.py -COPY test/WH897R_29453_000452.pdf $PDF/test/ +COPY 1-s2.0-0031018280900164-main.pdf $PDF/test/ +ARG BLACKSTACK_MODE RUN mkdir out WORKDIR $PDF EXPOSE 5555 -CMD bash -c "sleep 10; $PDF/preprocess.sh training test/WH897R_29453_000452.pdf; python3 $PDF/server.py" -#CMD ["./preprocess.sh", "training", "test/WH897R_29453_000452.pdf"] - +CMD ["./blackstack_wrapper.sh"] diff --git a/blackstack_wrapper.sh b/blackstack_wrapper.sh new file mode 100755 index 0000000..8904870 --- /dev/null +++ b/blackstack_wrapper.sh @@ -0,0 +1,24 @@ +#!/bin/sh + + +if [ -z "${BLACKSTACK_MODE}" ] +then + echo "Please specify BLACKSTACK_MODE as an envvar" + exit 1 +else + BLACKSTACK_MODE=${BLACKSTACK_MODE} + echo Running blackstack in $BLACKSTACK_MODE mode. + if [ "$BLACKSTACK_MODE" = "classified" ] + then + echo classified + ./preprocess.sh classified test/1-s2.0-0031018280900164-main.pdf ; + python3 extract.py ./docs/classified/1-s2*/ + elif [ "$BLACKSTACK_MODE" = "training" ] + then + echo training + ./preprocess.sh training test/1-s2.0-0031018280900164-main.pdf + python3 server.py + else + echo "Unknown blackstack mode specified. Please choose classified or training." 
+ fi +fi diff --git a/docker-compose.yml b/docker-compose.yml index de5a777..3728827 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,6 +6,7 @@ services: environment: - PG_PASSWORD=blackstack - PG_USERNAME=postgres + - BLACKSTACK_MODE=${BLACKSTACK_MODE} ports: - 5555:5555 From 3bb790e57b900d442ef895e2a030ee6bec95a189 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Thu, 15 Nov 2018 11:19:50 -0600 Subject: [PATCH 09/19] run classification/extraction mode by default --- blackstack_wrapper.sh | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/blackstack_wrapper.sh b/blackstack_wrapper.sh index 8904870..5d290b9 100755 --- a/blackstack_wrapper.sh +++ b/blackstack_wrapper.sh @@ -3,22 +3,34 @@ if [ -z "${BLACKSTACK_MODE}" ] then - echo "Please specify BLACKSTACK_MODE as an envvar" - exit 1 + echo "No BLACKSTACK_MODE specified -- assuming classification mode on prebuilt model." + BLACKSTACK_MODE='classified' else BLACKSTACK_MODE=${BLACKSTACK_MODE} - echo Running blackstack in $BLACKSTACK_MODE mode. - if [ "$BLACKSTACK_MODE" = "classified" ] - then - echo classified - ./preprocess.sh classified test/1-s2.0-0031018280900164-main.pdf ; - python3 extract.py ./docs/classified/1-s2*/ - elif [ "$BLACKSTACK_MODE" = "training" ] - then - echo training - ./preprocess.sh training test/1-s2.0-0031018280900164-main.pdf +fi + +echo Running blackstack in $BLACKSTACK_MODE mode. +if [ "$BLACKSTACK_MODE" = "classified" ] +then + for doc in input/*.pdf; + do + filename=$(basename "$doc") + docname="${filename%.*}" + echo ./preprocess.sh classified input/$filename + ./preprocess.sh classified input/$filename + echo python3 extract.py ./docs/classified/$docname/ + python3 extract.py ./docs/classified/$docname/ + done +elif [ "$BLACKSTACK_MODE" = "training" ] +then + for doc in input/*.pdf; + do + filename=$(basename "$doc") + docname="${filename%.*}" + ./preprocess.sh training input/$filename python3 server.py - else - echo "Unknown blackstack mode specified. Please choose classified or training." - fi + done +else + echo "Unknown blackstack mode specified. Please choose classified or training." 
+ exit 1 fi From 89cadd420cd54daeba8bdc37059a7a893efa1d07 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Thu, 15 Nov 2018 11:21:26 -0600 Subject: [PATCH 10/19] Use input/ and docs/ to manage input and output --- docker-compose.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index 3728827..4bcccf7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -9,6 +9,8 @@ services: - BLACKSTACK_MODE=${BLACKSTACK_MODE} ports: - 5555:5555 + volumes: + - ./docs/:/app/pdf/docs/ postgres: image: postgres:10.5-alpine From 830926122ab67647fe0e6d94e675f401fb3a7664 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Fri, 16 Nov 2018 09:38:17 -0600 Subject: [PATCH 11/19] Use input/ --- Dockerfile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 86c1819..9dad3f9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,9 +22,7 @@ COPY *sh $PDF/ COPY *py $PDF/ COPY annotator $PDF/ COPY config.py.env $PDF/config.py - -COPY 1-s2.0-0031018280900164-main.pdf $PDF/test/ -ARG BLACKSTACK_MODE +COPY input/ $PDF/input/ RUN mkdir out WORKDIR $PDF From cd8e029d4d0b7af80c3bf913ec3d5c46b7f66f35 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Fri, 16 Nov 2018 09:46:35 -0600 Subject: [PATCH 12/19] fix - move server start to a sensible place --- blackstack_wrapper.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blackstack_wrapper.sh b/blackstack_wrapper.sh index 5d290b9..43779bb 100755 --- a/blackstack_wrapper.sh +++ b/blackstack_wrapper.sh @@ -28,8 +28,8 @@ then filename=$(basename "$doc") docname="${filename%.*}" ./preprocess.sh training input/$filename - python3 server.py done + python3 server.py else echo "Unknown blackstack mode specified. Please choose classified or training." 
exit 1 From 098a02ed6f8a529c621dae8a9c63dc7b5478d7b7 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Fri, 16 Nov 2018 11:33:02 -0600 Subject: [PATCH 13/19] use defaultdict -- fixes crash that happens if server is restarted before all labels have been seen in training --- annotator/server.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/annotator/server.py b/annotator/server.py index d0db1db..0aadaf6 100644 --- a/annotator/server.py +++ b/annotator/server.py @@ -9,6 +9,7 @@ import matplotlib.pyplot as plt import matplotlib.patches as patches import numpy as np +from collections import defaultdict from sklearn import svm # Import database credentials @@ -86,15 +87,14 @@ def random_area(): if bad: return random_area() - new_area = { - 'area_id': area[0], - 'doc_id': area[1], - 'page_no': area[2], - 'x1': area[3], - 'y1': area[4], - 'x2': area[5], - 'y2': area[6] - } + new_area = defaultdict(float) + new_area['area_id'] = area[0] + new_area['doc_id'] = area[1] + new_area['page_no'] = area[2] + new_area['x1'] = area[3] + new_area['y1'] = area[4] + new_area['x2'] = area[5] + new_area['y2'] = area[6] for each in p: # should be label, probability From ffceeed9b34d271f5458bab274d03e3e8d12953b Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Fri, 16 Nov 2018 11:35:00 -0600 Subject: [PATCH 14/19] make tmp dir for training images if it doesn't exist --- annotator/server.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/annotator/server.py b/annotator/server.py index 0aadaf6..9443df7 100644 --- a/annotator/server.py +++ b/annotator/server.py @@ -224,4 +224,6 @@ def learn(): # print clf.classes_ if __name__ == '__main__': + if not os.path.exists("./tmp"): + os.mkdir("tmp") app.run(host='0.0.0.0', port=5555) From 5dcda56ad872dfb0adfa2fa7e9c1537074dc2047 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Tue, 4 Dec 2018 11:45:00 -0600 Subject: [PATCH 15/19] make things work in python3 (filter is a generator, so this silently mucked up the works) --- helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helpers.py b/helpers.py index f46d5d3..cd2d826 100644 --- a/helpers.py +++ b/helpers.py @@ -438,7 +438,7 @@ def area_summary(area): # Number of words try: - summary['words'] = len(filter(None, summary['soup'].getText().strip().replace('\n', ' ').replace(' ', ' ').split(' '))) + summary['words'] = len(list(filter(None, summary['soup'].getText().strip().replace('\n', ' ').replace(' ', ' ').split(' ')))) except: summary['words'] = 0 From 57ff1e2d68ca010e82a3a9947d524a0a84dacd01 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Tue, 4 Dec 2018 15:00:01 -0600 Subject: [PATCH 16/19] use local dir for persistent pg data --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index 4bcccf7..1271809 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -19,4 +19,4 @@ services: - POSTGRES_DB=blackstack volumes: - ./setup:/docker-entrypoint-initdb.d/ - + - ./postgres-data:/var/lib/postgresql/data From 794c4739408954dc079c5958bb8563e2bd03a58b Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Tue, 4 Dec 2018 15:00:20 -0600 Subject: [PATCH 17/19] readme updates --- README.md | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/README.md b/README.md index 84f48e3..7443987 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,10 @@ A machine learning approach to table and figure extraction. 
Uses SciKit Learn's
 
 Whereas other approaches to table and figure reading depend on content to be well-structured, Blackstack ignores the issue of table and figure _data_ extraction (see [Fonduer](https://github.com/HazyResearch/fonduer)) and instead uses a format-agnostic approach to extracting entities as images that can then be used for future analysis or querying.
 
 ## Installation
+### Recommended
+We recommend using Blackstack via [docker-compose](https://docs.docker.com/compose/install/).
+
+### Local
 Blackstack relies on a few libraries that you probably need to install, including [ghostscript](https://www.ghostscript.com) and [tesseract](https://github.com/tesseract-ocr/tesseract). If you are using MacOS and [Homebrew](https://brew.sh) you can install them as so:
 
@@ -38,6 +42,37 @@ pip install -r requirements.txt
 
 ## Getting started
 
+### Docker
+
+The recommended way of running Blackstack is via the supplied `docker-compose.yml`. It can
+be run in either `training` mode to train a new model or in `classified` mode
+to apply the trained model (or the default included one) to a set of documents. The mode is selected
+with an environment variable when invoking docker-compose:
+
+````
+BLACKSTACK_MODE=classified docker-compose up --no-deps --build --force-recreate
+````
+to apply the existing model.
+
+To train a new model, first move the default example data aside so that it doesn't
+interfere with your training:
+
+````
+mv setup/02_example_data.sql setup/02_example_data.sql_bk
+BLACKSTACK_MODE=training docker-compose up --no-deps --build --force-recreate
+````
+to preprocess your documents and start the annotation server.
+
+On startup, Blackstack will preprocess any documents in the `./input/` directory
+and either serve them up for annotation (in training mode) or apply the model
+(in classified mode).
+
+Preprocessed output will be stored in the `./docs/` directory, and, if running in
+classified mode, extractions will be stored per-document in `./docs/classified/`.
+
+### Standalone
+The tools can also be run individually if the prerequisites are installed locally.
+
 #### Preprocessing
 
 Before a model can be trained, documents to use as training data must be selected. If you are attempting to extract entities from a specific journal or publisher it is recommended that your training data also come from that journal or publisher. If you are trying to create a general classifier you should have a good sample of documents from across disciplines, publishers, etc.
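The training and classification modes described above share one model flow: the annotator (`annotator/server.py`) fits an SVM on labeled area features stored in Postgres, and `extract.py` then asks that classifier for per-class probabilities for each page area and keeps the most likely label. Below is a minimal sketch of that flow, not the project's actual code: the two-feature vectors and label names are made-up placeholders (the real features come from `heuristics.classify_list`), and only the scikit-learn calls (`svm.SVC(probability=True)`, `fit`, `predict_proba`, `classes_`) and the probability-sorting pattern mirror what the patches above do.

```python
import random

from sklearn import svm

# Synthetic stand-ins for per-area layout features (e.g. a line-count proxy and
# a word-height ratio); values are arbitrary and chosen only to form two clusters.
random.seed(0)
train, labels = [], []
for _ in range(20):
    train.append([random.gauss(4.0, 0.3), random.gauss(0.2, 0.05)])
    labels.append("body")
    train.append([random.gauss(1.2, 0.3), random.gauss(0.9, 0.05)])
    labels.append("graphic caption")

# Same estimator settings as the patched annotator/server.py.
clf = svm.SVC(gamma=1, C=100, probability=True, cache_size=500, kernel="rbf")
clf.fit(train, labels)

# extract.py's pattern: score one area, pair each class with its probability,
# sort descending, and keep the best label and its probability.
probabilities = clf.predict_proba([[1.3, 0.85]])
classifications = sorted(zip(clf.classes_, probabilities[0]), key=lambda x: x[1], reverse=True)
best_label, best_p = classifications[0]
print(best_label, best_p)
```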
From c3b885f74045a0e81d45a84646cefa46889377bd Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Wed, 12 Dec 2018 14:58:41 -0600 Subject: [PATCH 18/19] don't create unnecessary dirs; no need to cp orig (everything in input stays untouched --- preprocess.sh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/preprocess.sh b/preprocess.sh index 1ab4b1b..412e0e7 100755 --- a/preprocess.sh +++ b/preprocess.sh @@ -13,10 +13,6 @@ fi filename=$(basename "$2") docname="${filename%.*}" -mkdir -p docs/$docname -mkdir -p docs/$docname/png - -mkdir -p docs/$1/$docname mkdir -p docs/$1/$docname/png mkdir -p docs/$1/$docname/tesseract if [ "$1" == "classified" ] @@ -27,8 +23,6 @@ fi gs -dBATCH -dNOPAUSE -sDEVICE=png16m -dGraphicsAlphaBits=4 -dTextAlphaBits=4 -r600 -sOutputFile="./docs/$1/$docname/png/page_%d.png" $2 -cp $2 ./docs/$docname/orig.pdf - ls ./docs/$1/$docname/png | grep -o '[0-9]\+' | parallel -j 4 "./process.sh $1 $docname {}" if [ "$1" == "training" ] From 0d92fb2f3a9a2008db0a7a17d4c9cf7c47b7b0a4 Mon Sep 17 00:00:00 2001 From: Ian Ross Date: Wed, 12 Dec 2018 14:59:06 -0600 Subject: [PATCH 19/19] Add tesseract bbox visualization step --- annotate.py | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++++ process.sh | 1 + 2 files changed, 114 insertions(+) create mode 100644 annotate.py diff --git a/annotate.py b/annotate.py new file mode 100644 index 0000000..3e19f1d --- /dev/null +++ b/annotate.py @@ -0,0 +1,113 @@ +import sys +from bs4 import BeautifulSoup +from PIL import Image +import matplotlib as mpl +from matplotlib import pyplot +import codecs +mpl.use('TkAgg') + +def makeBox(bbox): + return { + '_left': int(bbox[0]), + '_top': int(bbox[1]), + '_right': int(bbox[2]), + '_bottom': int(bbox[3]), + 'width': int(bbox[2]) - int(bbox[0]), + 'height': int(bbox[3]) - int(bbox[1]) + } + +def getbbox(title): + title_parts = title.split(';') + for part in title_parts: + if part.strip()[0:4] == 'bbox': + return part.replace('bbox', '').strip().split() + + return + +def tess(infile, outfile): + with codecs.open(infile, "r", "utf-8") as hocr: + text = hocr.read() + + soup = BeautifulSoup(text, "html.parser") + pages = soup.find_all('div', 'ocr_page') + careas = soup.find_all('div', 'ocr_carea') + pars = soup.find_all('p', 'ocr_par') + lines = soup.find_all('span', 'ocr_line') + words = soup.find_all('span', 'ocrx_word') + + page_boxes = [makeBox(getbbox(page.get('title'))) for page in pages] + carea_boxes = [makeBox(getbbox(carea.get('title'))) for carea in careas] + par_boxes = [makeBox(getbbox(par.get('title'))) for par in pars] + line_boxes = [makeBox(getbbox(line.get('title'))) for line in lines] + word_boxes = [makeBox(getbbox(word.get('title'))) for word in words] + + fig = pyplot.figure() + ax = fig.add_subplot(111, aspect='equal') + + for box in page_boxes: + ax.add_patch(mpl.patches.Rectangle( + (box['_left'], box['_top']), + box['_right'] - box['_left'], + box['_bottom'] - box['_top'], + fill=False, + linewidth=0.5, + edgecolor="#FF00FF" + ) + ) + + for box in carea_boxes: + ax.add_patch(mpl.patches.Rectangle( + (box['_left'], box['_top']), + box['_right'] - box['_left'], + box['_bottom'] - box['_top'], + fill=False, + linewidth=0.5, + edgecolor="#0000FF" + ) + ) + + for box in par_boxes: + ax.add_patch(mpl.patches.Rectangle( + (box['_left'], box['_top']), + box['_right'] - box['_left'], + box['_bottom'] - box['_top'], + fill=False, + linewidth=0.1, + edgecolor="#F0F0F0" + ) + ) + + for box in line_boxes: + ax.add_patch(mpl.patches.Rectangle( + (box['_left'], box['_top']), + 
box['_right'] - box['_left'], + box['_bottom'] - box['_top'], + fill=False, + linewidth=0.1, + edgecolor="#FF0000" + ) + ) + for box in word_boxes: + ax.add_patch(mpl.patches.Rectangle( + (box['_left'], box['_top']), + box['_right'] - box['_left'], + box['_bottom'] - box['_top'], + fill=False, + linewidth=0.1, + edgecolor="#000000" + ) + ) + + pyplot.ylim(0,page_boxes[0]['_bottom']) + pyplot.xlim(0,page_boxes[0]['_right']) + pyplot.axis("off") + ax = pyplot.gca() + ax.invert_yaxis() + pyplot.axis('off') + fig.savefig(outfile, dpi=400, bbox_inches='tight', pad_inches=0) + + +if len(sys.argv) == 3: + tess(sys.argv[1], sys.argv[2]) +else: + print('Script requires two parameters: an input Tesseract HOCR file and an output file name and location') diff --git a/process.sh b/process.sh index bee3a86..e8a355e 100755 --- a/process.sh +++ b/process.sh @@ -2,3 +2,4 @@ tesseract ./docs/$1/$2/png/page_$3.png ./docs/$1/$2/tesseract/page_$3.html hocr mv ./docs/$1/$2/tesseract/page_$3.html.hocr ./docs/$1/$2/tesseract/page_$3.html +python3 annotate.py ./docs/$1/$2/tesseract/page_$3.html ./docs/$1/$2/tesseract/page_$3.png
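Both `helpers.extractbbox` and the new `annotate.py` rely on the hOCR convention that every element's `title` attribute carries a `bbox x1 y1 x2 y2` field among `;`-separated properties. The sketch below illustrates that convention in isolation; the inline hOCR fragment and its coordinates are invented for the example, and only the `title` parsing and the BeautifulSoup calls (`find_all('span', 'ocrx_word')`, `.get('title')`) follow what the patched code does.

```python
from bs4 import BeautifulSoup

# A made-up hOCR fragment in the shape tesseract emits; coordinates are arbitrary.
hocr = """
<div class='ocr_page' title='image "page_1.png"; bbox 0 0 4958 7017; ppageno 0'>
  <span class='ocr_line' title='bbox 620 710 980 760; baseline 0 -10'>
    <span class='ocrx_word' title='bbox 620 710 840 760; x_wconf 96'>Table</span>
    <span class='ocrx_word' title='bbox 880 710 980 760; x_wconf 93'>1.</span>
  </span>
</div>
"""

def getbbox(title):
    # Same idea as annotate.py's getbbox(): the title holds ';'-separated
    # properties, one of which is 'bbox x1 y1 x2 y2'.
    for part in title.split(';'):
        if part.strip().startswith('bbox'):
            x1, y1, x2, y2 = (int(v) for v in part.replace('bbox', '').split())
            return {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}

soup = BeautifulSoup(hocr, 'html.parser')
for word in soup.find_all('span', 'ocrx_word'):
    print(word.getText().strip(), getbbox(word.get('title')))
```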