forked from andreasvc/activedop
-
Notifications
You must be signed in to change notification settings - Fork 1
/
app.py
1355 lines (1250 loc) · 46.3 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""Web interface for treebank annotation.
TODO (nschneid):
- validate on accept without entering edit mode
- EVENTUALLY: additional metadata fields e.g. comments, original untokenized sentence
Design notes:
- settings.cfg: The sentences, grammar, user accounts, and other fixed
configuration parameters are read from a file called "settings.cfg".
NB: besides the grammar, the treebank the grammar was based on is
required. Make sure that paths in the grammar parameter file are valid
(including headrules).
- <filename>.rankings.json: The order in which sentences are annotated
(prioritization) is fixed before the web service is started, and created by a
separate command (initpriorities) and stored in a JSON file read at startup
of the web app.
- annotate.db, schema.sql: Annotations are stored in an sqlite database;
initialize with "flask initdb".
- session cookie: login status and the per-sentence user interactions are
stored in a cookie.
- Parsing is done in subprocesses, one for each user, in order to isolate the
parsing from the Flask process and from other users. Since these subprocesses
are stored in a global variable (WORKERS), there should only be a single
Flask process, but multiple threads are needed to get actual parallelism:
$ flask run --with-threads
The use of a global variable is a compromise to avoid the complexity of
running a task queue like celery, or a separate webserver for each user.
"""
import os
import re
import sys
import csv
import json
import sqlite3
import logging
import subprocess
import click
from math import log
from time import time
from datetime import datetime
from functools import wraps
from collections import OrderedDict
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures.process import BrokenProcessPool
from urllib.parse import urlparse, urlencode, urljoin
from flask import (Flask, Markup, Response, jsonify, request, session, g, flash, abort,
redirect, url_for, render_template, send_file, send_from_directory,
stream_with_context)
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from discodop.tree import (Tree, DrawTree, DrawDependencies, ParentedTree,
writediscbrackettree)
from discodop.treebank import writetree, writedependencies, exporttree
from discodop.treetransforms import canonicalize
from discodop.treebanktransforms import reversetransform
from discodop.parser import probstr
from discodop.disambiguation import testconstraints
from discodop.heads import applyheadrules
from discodop.eval import editdistance
import worker
from workerattr import workerattr
from activedoptree import ActivedopTree, LABELRE, is_punct_label, cgel_to_ptree
# Make the local ./cgel checkout importable; CGEL support is optional.
sys.path.append('./cgel')
try:
    import cgel
    from tree2tex import trees2tex
except ImportError:
    # Bind every name the try block would have provided, so that code which
    # runs without CGEL support can test for None instead of hitting a
    # NameError on trees2tex.
    cgel = None
    trees2tex = None
    # The original fallback bound this name; it is retained unchanged.
    load_as_cgel = None
# The Flask application object; the views and CLI commands in this module
# register themselves on it.
app = Flask(__name__) # pylint: disable=invalid-name
# NB: the per-user worker pools are kept in app.config['WORKERS'], which is
# bound to the module-level name WORKERS after the configuration is loaded.
# Sentences to annotate and their prioritized order; both stay None until
# initapp() assigns them.
SENTENCES = None
QUEUE = None
# Default help text; initapp() overwrites it when the ANNOTATIONHELP setting
# names a file.
ANNOTATIONHELP = """
- If altering the tokenization, ensure that tokens are numbered sequentially.
- Use _. as the token for a gap.
"""
# Positions within the 8-item session['actions'] list that records the
# annotator's interactions with the current sentence.
(NBEST, CONSTRAINTS, DECTREE, REATTACH, RELABEL, REPARSE, EDITDIST, TIME
) = range(8)
# NOTE(review): the example below appears to describe LABELRE, which is
# imported from activedoptree rather than defined here -- confirm.
# e.g., "NN-SB/Nom" => ('NN', '-SB', '/Nom')
# Configuration is layered: the defaults below, then settings.cfg, then the
# file named by the FLASK_SETTINGS environment variable. Both files are
# optional (silent=True).
app.config.update(
DATABASE=os.path.join(app.root_path, 'annotate.db'),
SECRET_KEY=None, # set in settings.cfg to protect session cookies
DEBUG=False, # whether to enable Flask debug UI
LIMIT=100, # maximum sentence length
FUNCTIONTAGWHITELIST=(), # optional list of function tags to always accept
GRAMMAR=None, # path to a directory with the initial grammar.
SENTENCES=None, # a filename with sentences to annotate, one per line.
ACCOUNTS=None, # dictionary mapping usernames to passwords
ANNOTATIONHELP=None, # plain text file summarizing the annotation scheme
CGELVALIDATE=None, # whether to run the CGEL validator when editing
PUNCT_TAGS={}, # dictionary mapping idiosyncratic punctuation POS tags to their corresponding tokens
SYMBOL_TAG=None, # pos tag for symbols (and symbol sequences) that don't have an idiosyncratic tag (in PUNCT_TAGS)
AMBIG_SYM={}, # 'ambiguous' symbols that can be punctuation or something else depending on context
INITIAL_PUNCT_LABELS={}, # pos-function labels for punctuation tokens associated with their following non-punctuation tokens
WORKERS={}, # dict mapping username to process pool
)
app.config.from_pyfile('settings.cfg', silent=True)
app.config.from_envvar('FLASK_SETTINGS', silent=True)
# Module-level alias for the configured worker-pool dictionary; the views
# below read and update it through this name.
WORKERS = app.config['WORKERS']
# Root logger: DEBUG level, timestamped messages on the handler installed by
# basicConfig().
logging.basicConfig()
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logger.handlers[0].setFormatter(logging.Formatter(
fmt='%(asctime)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S'))
@app.cli.command('initpriorities')
@click.option('--username', default='JoeAnnotator', help='Username to initialize priorities for.')
def initpriorities(username):
    """Order sentences by entropy of their parse trees probabilities.

    Sentences that ``username`` has already annotated come first, in file
    order, and are not re-parsed. The remaining sentences are parsed in this
    process (no worker subprocess) and sorted by descending entropy of their
    parse probabilities; a sentence that cannot be parsed gets entropy 0.

    The ranking is written to ``<SENTENCES>.rankings.json`` as a list of
    ``[row index, entropy, sentence, id]`` items.

    :param username: annotator whose saved annotations are looked up.
    :raises ValueError: if the SENTENCES setting is not configured.
    """
    sentfilename = app.config['SENTENCES']
    # Fail fast on a missing setting, before opening the database.
    if sentfilename is None:
        raise ValueError('SENTENCES not configured')
    # Only the ids are needed to decide which sentences are already done.
    cur = getdb().execute(
        'SELECT id FROM entries WHERE username = ?', (username, ))
    dbentryids = {row[0] for row in cur}
    sentences = []
    with open(sentfilename, mode='r') as csv_file:
        csv_reader = csv.DictReader(
            csv_file, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in csv_reader:
            # DictReader yields None for missing trailing fields and a list
            # for surplus ones; only real strings count as content.
            if any(isinstance(field, str) and field.strip()
                    for field in row.values()):
                sentences.append(row)
    # NB: here we do not use a subprocess to do the parsing
    worker.loadgrammar(app.config['GRAMMAR'], app.config['LIMIT'])
    queue = []
    already_annotated = []
    for n, entry in enumerate(sentences):
        sent = entry['sentence']
        sentid = entry['id']
        if sentid in dbentryids:
            app.logger.info('%d. [already annotated] %s', n + 1, sent)
            already_annotated.append((n, 0, sent, sentid))
            continue
        try:
            _senttok, parsetrees, _messages, _elapsed = worker.getparses(sent)
        except ValueError:
            parsetrees = []
        app.logger.info('%d. [parse trees=%d] %s',
                n + 1, len(parsetrees), sent)
        ent = 0
        if parsetrees:
            probs = [prob for prob, _tree, _treestr, _deriv in parsetrees]
            try:
                ent = entropy(probs)  # / log(len(parsetrees), 2)
            except (ValueError, ZeroDivisionError):
                # keep the default entropy of 0 for degenerate distributions
                pass
        queue.append((n, ent, sent, sentid))
    # Most uncertain sentences first; the sort is stable for equal entropy.
    queue.sort(key=lambda x: x[1], reverse=True)
    queue = already_annotated + queue
    rankingfilename = '%s.rankings.json' % sentfilename
    with open(rankingfilename, 'w') as rankingfile:
        json.dump(queue, rankingfile, indent=4)
@app.before_first_request
def initapp():
    """Load sentences, check config.

    Fills the module globals SENTENCES (the ``sentence`` column of every
    non-blank row of the SENTENCES file), QUEUE (the contents of
    ``<SENTENCES>.rankings.json``) and, when configured, ANNOTATIONHELP.

    :raises ValueError: if SENTENCES, GRAMMAR or ACCOUNTS is not configured,
        or if the rankings file is missing or not newer than the sentences
        file.
    """
    global SENTENCES, QUEUE, ANNOTATIONHELP
    sentfilename = app.config['SENTENCES']
    if sentfilename is None:
        raise ValueError('SENTENCES not configured')
    if app.config['GRAMMAR'] is None:
        raise ValueError('GRAMMAR not configured')
    if app.config['ACCOUNTS'] is None:
        raise ValueError('ACCOUNTS not configured')
    # read sentences to annotate
    sentences = []
    with open(sentfilename, mode='r') as csv_file:
        csv_reader = csv.DictReader(
            csv_file, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in csv_reader:
            # DictReader yields None for missing trailing fields and a list
            # for surplus ones; only real strings count as content.
            if any(isinstance(field, str) and field.strip()
                    for field in row.values()):
                sentences.append(row['sentence'])
    SENTENCES = sentences
    rankingfilename = '%s.rankings.json' % sentfilename
    # The rankings are only trusted when they were written after the last
    # change to the sentences file.
    rankings_current = (
        os.path.exists(rankingfilename)
        and os.stat(rankingfilename).st_mtime
        > os.stat(sentfilename).st_mtime)
    if not rankings_current:
        raise ValueError('no rankings for sentences, or sentences have\n'
                'been modified; run "flask initpriorities"')
    with open(rankingfilename) as rankingfile:
        QUEUE = json.load(rankingfile)
    if app.config['ANNOTATIONHELP'] is not None:
        with open(app.config['ANNOTATIONHELP']) as inp:
            ANNOTATIONHELP = inp.read()
# Database functions
@app.cli.command('initdb')
def initdb():
    """Create the database tables by executing schema.sql."""
    connection = getdb()
    with app.open_resource('schema.sql', mode='r') as schema:
        script = schema.read()
        connection.cursor().executescript(script)
    connection.commit()
    app.logger.info('Initialized the database.')
def connectdb():
    """Open and return a new sqlite3 connection to the configured DATABASE."""
    # Rows are returned as plain tuples (no row_factory is installed).
    return sqlite3.connect(app.config['DATABASE'])
def getdb():
    """Return the database connection of the current application context,
    opening it on first use."""
    if 'sqlitedb' not in g:
        g.sqlitedb = connectdb()
    return g.sqlitedb
@app.teardown_appcontext
def closedb(error):
    """Close the context's database connection, if one was opened.

    :param error: passed by Flask's teardown hook; not used here.
    """
    if 'sqlitedb' in g:
        g.sqlitedb.close()
@app.route('/annotate/get_data_psv')
def get_data_psv():
    """Download the logged-in user's database entries as pipe-separated text.

    The first line holds the column names of the ``entries`` table; every
    following line is one entry, ordered by ``sentno``. The file is built in
    memory so that concurrent downloads by different users cannot overwrite
    each other's data, and nothing is written relative to the working
    directory.

    :returns: a ``text/csv`` attachment named ``data.csv``, or a redirect to
        the login page when nobody is logged in.
    """
    import io  # local import: io is not imported at file level

    if 'username' not in session:
        flash('please log in.')
        return redirect(url_for('login', next=request.url))
    cur = getdb().execute(
        'SELECT * FROM entries WHERE username = ? ORDER BY sentno ASC',
        (session['username'], ))
    # newline='' keeps the csv module's own line terminators untouched.
    buf = io.StringIO(newline='')
    csv_writer = csv.writer(buf, delimiter='|')
    csv_writer.writerow([description[0] for description in cur.description])
    csv_writer.writerows(cur.fetchall())
    return Response(
        buf.getvalue(),
        mimetype='text/csv',
        headers={'Content-Disposition': 'attachment; filename=data.csv'})
def firstunannotated(username):
    """Return the 1-based position in QUEUE of the first sentence that
    ``username`` has not annotated yet; 1 when every sentence is done."""
    rows = getdb().execute(
        'select id from entries where username = ? '
        'order by sentno asc',
        (username, ))
    done = {row[0] for row in rows}
    # QUEUE items are (lineno, entropy, sentence, id); positions count from 1.
    pending = (
        position
        for position, (_lineno, _ent, _sent, sentid) in enumerate(QUEUE, 1)
        if sentid not in done)
    return next(pending, 1)
def numannotated(username):
    """Return the number of database entries (annotated sentences) stored
    for an annotator."""
    cursor = getdb().execute(
        'select count(sentno) from entries where username = ?',
        (username, ))
    (count, *_rest) = cursor.fetchone()
    return count
def getannotation(username, id):
    """Fetch the saved annotation of one sentence.

    Reads the ``tree`` column when CGELVALIDATE is not configured and the
    ``cgel_tree`` column otherwise.

    :returns: ``(tree, nbest)`` from the database, or ``(None, 0)`` when the
        user has no entry for this sentence id.
    """
    if app.config['CGELVALIDATE'] is None:
        query = (
            'select tree, nbest '
            'from entries '
            'where username = ? and id = ? ')
    else:
        query = (
            'select cgel_tree, nbest '
            'from entries '
            'where username = ? and id = ? ')
    found = getdb().execute(query, (username, id)).fetchone()
    if found is None:
        return (None, 0)
    return (found[0], found[1])
def readannotations(username=None):
    """Return an OrderedDict mapping sentno to tree, ordered by sentno.

    :param username: restrict the result to this annotator; with None the
        entries of all annotators are read.
    """
    connection = getdb()
    if username is not None:
        rows = connection.execute(
            'select sentno, tree from entries where username = ? '
            'order by sentno asc',
            (username, ))
    else:
        rows = connection.execute(
            'select sentno, tree from entries '
            'order by sentno asc')
    return OrderedDict(rows.fetchall())
def addentry(id, sentno, tree, cgel_tree, actions):
    """Add an annotation to the database, replacing any existing row.

    The row is stored for the user named in the session and stamped with the
    current local time.

    :param id: identifier of the sentence.
    :param sentno: position of the sentence in the prioritized order.
    :param tree: the annotated tree as text.
    :param cgel_tree: the CGEL rendering of the tree.
    :param actions: the per-sentence interaction counters; together with the
        five leading values and the timestamp they must fill the 14 columns
        of the insert statement (i.e. 8 items).
    """
    # '%Y-%m-%d' spells out what the platform-specific '%F' directive
    # produces, so the timestamp format does not depend on the C library.
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    db = getdb()
    db.execute(
        'insert or replace into entries '
        'values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
        (id, sentno, session['username'], tree, cgel_tree, *actions,
        timestamp))
    db.commit()
def loginrequired(f):
    """Decorator for views that need a logged-in user with a worker pool.

    Without a login the request is redirected to the login page; with a
    login but no entry in WORKERS it is redirected to ``dologin``. In both
    cases the current URL is passed along as ``next``.
    """
    @wraps(f)
    def decorated_function(*args, **kwargs):
        if 'username' in session:
            if session['username'] in WORKERS:
                return f(*args, **kwargs)
            return redirect(url_for('dologin', next=request.url))
        flash('please log in.')
        return redirect(url_for('login', next=request.url))
    return decorated_function
def is_safe_url(target):
    """Tell whether ``target`` resolves to an http(s) URL on this host.

    Based on http://flask.pocoo.org/snippets/62/
    """
    own = urlparse(request.host_url)
    candidate = urlparse(urljoin(request.host_url, target))
    if candidate.scheme not in ('http', 'https'):
        return False
    return own.netloc == candidate.netloc
# View functions
# NB: when using reverse proxy, the base url should match,
# e.g., https://external.com/annotate/... => http://localhost:5000/annotate/...
@app.route('/')
@app.route('/annotate/')
def main():
    """Send visitors of the root URLs to the login page."""
    target = url_for('login')
    return redirect(target)
@app.route('/annotate/login', methods=['GET', 'POST'])
def login():
    """Check authentication.

    A POST with a valid username/password stores the username in the session
    and redirects to ``dologin``; an invalid POST re-renders the login form
    with an error. A GET by an already logged-in user redirects to
    ``dologin`` as well. A ``next`` query argument is passed on unchanged.
    """
    import hmac  # local import: hmac is not imported at file level

    def todologin():
        """Redirect to dologin, forwarding the 'next' argument if given."""
        nexturl = request.args.get('next')
        if nexturl:
            return redirect(url_for('dologin', next=nexturl))
        return redirect(url_for('dologin'))

    error = None
    if request.method == 'POST':
        username = request.form['username']
        accounts = app.config['ACCOUNTS']
        valid = False
        if username in accounts:
            expected = accounts[username]
            supplied = request.form['password']
            # Compare in constant time so the response time does not reveal
            # how much of the password matched. A non-string configured
            # password can never equal the submitted string.
            valid = isinstance(expected, str) and hmac.compare_digest(
                supplied.encode('utf-8'), expected.encode('utf-8'))
        if valid:
            session['username'] = username
            return todologin()
        error = 'Invalid username/password'
    elif 'username' in session:
        return todologin()
    return render_template(
        'login.html', error=error, totalsents=len(SENTENCES))
@app.route('/annotate/dologin')
def dologin():
    """Start worker pool and redirect when done.

    The response is streamed: progress text is sent while the grammar is
    loaded in a fresh single-process pool, the pool is trained on all
    previously saved annotations, and finally a script redirects the browser
    to the ``next`` URL (default: the annotate view). If the user already has
    a working pool, the redirect is sent straight away.

    Responds with 400 when ``next`` is not a URL on this host or when nobody
    is logged in.
    """
    def redirectscript(url):
        """Return a script element that navigates the browser to ``url``."""
        # The URL originates from the 'next' query argument. json.dumps
        # turns it into an escaped JavaScript string literal; '<' is escaped
        # as well so the value cannot terminate the script element.
        return '<script>window.location.replace(%s);</script>' % (
            json.dumps(url).replace('<', '\\u003c'))

    def generate(url):
        """Yield the chunks of the streamed response."""
        yield (
            '<!doctype html>'
            '<title>redirect</title>'
            'You were logged in successfully. ')
        if username in WORKERS:
            try:
                # cheap round trip to find out whether the pool is alive
                WORKERS[username].submit(
                    worker.getprop, 'headrules').result()
            except BrokenProcessPool:
                # discard the dead pool and build a new one below
                stale = WORKERS.pop(username, None)
                if stale is not None:
                    stale.shutdown(wait=False)
            else:
                yield redirectscript(url)
                return
        yield 'Loading grammar; this will take a few seconds ...'
        lang = os.path.basename(app.config['GRAMMAR'])
        app.logger.info('Loading grammar %r', lang)
        pool = ProcessPoolExecutor(max_workers=1)
        try:
            pool.submit(
                worker.loadgrammar,
                app.config['GRAMMAR'], app.config['LIMIT']).result()
            app.logger.info('Grammar %r loaded.', lang)
            # train on annotated sentences
            annotations = readannotations()
            if annotations:
                app.logger.info(
                    'training on %d previously annotated sentences',
                    len(annotations))
                trees, sents = [], []
                headrules = pool.submit(worker.getprop, 'headrules').result()
                for block in annotations.values():
                    # HOTFIX for ROOT error: relabel ROOT on lines that do
                    # not also contain a tab-delimited 0 field
                    blocklns = block.splitlines()
                    for iln, blockln in enumerate(blocklns):
                        if '\tROOT\t' in blockln and '\t0\t' not in blockln:
                            blocklns[iln] = blockln.replace(
                                '\tROOT\t', '\tXXX-XXX\t')
                    block = '\n'.join(blocklns)
                    item = exporttree(block.splitlines())
                    canonicalize(item.tree)
                    if headrules:
                        applyheadrules(item.tree, headrules)
                    trees.append(item.tree)
                    sents.append(item.sent)
                pool.submit(worker.augment, trees, sents).result()
        except Exception:
            # do not leave a half-initialised worker process behind
            pool.shutdown(wait=False)
            raise
        WORKERS[username] = pool
        yield redirectscript(url)

    nexturl = request.args.get('next')
    if not is_safe_url(nexturl) or 'username' not in session:
        return abort(400)
    # read by generate() through its closure
    username = session['username']
    return Response(stream_with_context(generate(
        nexturl or url_for('annotate'))))
# FIXME: add automatic session expiration?
@app.route('/annotate/logout')
def logout():
    """Log out: shut down the user's worker pool, clear the login from the
    session and return to the main page."""
    user = session.get('username') if 'username' in session else None
    if 'username' in session and user in WORKERS:
        # do not wait for running parses to finish
        WORKERS.pop(user).shutdown(wait=False)
    session.pop('username', None)
    flash('You were logged out')
    return redirect(url_for('main'))
@app.route('/annotate/annotate/', defaults={'sentno': -1})
@app.route('/annotate/annotate/<int:sentno>')
@loginrequired
def annotate(sentno):
    """Serve the main annotation page for a sentence.

    :param sentno: 1-based position in the prioritized QUEUE; -1 (the URL
        without a number) redirects to the first unannotated sentence.

    Resets the per-sentence action counters in the session. When a tree for
    the sentence is already saved, redirects to edit mode; otherwise renders
    the page for browsing parser output. Responds with 404 when ``sentno``
    is outside the queue.
    """
    username = session['username']
    if sentno == -1:
        # Actually return the redirect so the browser ends up on the
        # numbered URL of the sentence.
        return redirect(url_for(
            'annotate', sentno=firstunannotated(username)))
    if not 1 <= sentno <= len(QUEUE):
        # e.g. 0 would otherwise wrap around to the last queue item
        abort(404)
    # seven counters plus the start time of this annotation session
    session['actions'] = [0, 0, 0, 0, 0, 0, 0, time()]
    lineno = QUEUE[sentno - 1][0]
    sentid = QUEUE[sentno - 1][3]
    sent = SENTENCES[lineno]
    senttok, _ = worker.postokenize(sent)
    annotation, n = getannotation(username, sentid)
    if annotation is not None:  # a tree is saved in the database
        # go directly to edit mode
        return redirect(url_for(
            'edit', sentno=sentno, annotated=1, n=n))
    # render annotate mode: browsing parser outputs for the sentence
    return render_template(
        'annotate.html',
        prevlink=str(sentno - 1) if sentno > 1 else str(len(SENTENCES)),
        nextlink=str(sentno + 1) if sentno < len(SENTENCES) else str(1),
        sentno=sentno, lineno=lineno + 1,
        totalsents=len(SENTENCES),
        numannotated=numannotated(username),
        annotationhelp=ANNOTATIONHELP,
        sent=' '.join(senttok))  # includes any gaps
@app.route('/undoaccept', methods=['POST'])
@loginrequired
def undoaccept():
    """Delete the logged-in user's saved annotation of one sentence.

    Expects a JSON object with a ``sentid`` member (default 0). Requires a
    login like the other views; responds with 400 when the body is not a
    JSON object.

    :returns: JSON ``{"success": true}``.
    """
    data = request.get_json()
    if not isinstance(data, dict):
        abort(400)
    sentid = data.get('sentid', 0)
    db = getdb()
    db.execute(
        'DELETE FROM entries WHERE username = ? AND id = ?',
        (session['username'], sentid))
    db.commit()
    return jsonify({"success": True})
@app.route('/retokenize', methods=['POST'])
@loginrequired
def retokenize():
    """Replace the in-memory text of a sentence.

    Expects a JSON object with ``sentno`` (1-based position in QUEUE) and
    ``newtext`` (the new sentence text). Only the SENTENCES list of this
    process is changed. Requires a login; responds with 400 when the body is
    not a JSON object, ``sentno`` is not a valid queue position, or
    ``newtext`` is not a string.

    :returns: JSON ``{"success": true}``.
    """
    data = request.get_json()
    if not isinstance(data, dict):
        abort(400)
    try:
        sentno = int(data.get('sentno', 0))
    except (TypeError, ValueError):
        abort(400)
    newtext = data.get('newtext')
    # A missing or zero sentno must not wrap around to the last sentence,
    # and only text may be stored in SENTENCES.
    if not 1 <= sentno <= len(QUEUE) or not isinstance(newtext, str):
        abort(400)
    lineno = QUEUE[sentno - 1][0]
    SENTENCES[lineno] = newtext
    return jsonify({"success": True})
@app.route('/annotate/parse')
@loginrequired
def parse():
    """Display parse. To be invoked by an AJAX call.

    Query arguments: ``sentno`` (1-based queue position) and optionally
    ``require`` / ``block`` constraints. The sentence is parsed in the
    user's worker pool; using constraints is counted in the session's action
    counters. Renders annotatetree.html with the best tree, the decision
    tree, an optional dependency rendering and a listing of the parse trees.
    """
    sentno = int(request.args.get('sentno'))  # 1-indexed
    sent = SENTENCES[QUEUE[sentno - 1][0]]
    username = session['username']
    require = request.args.get('require', '')
    block = request.args.get('block', '')
    # arguments to carry over into the accept/edit/derivation links
    urlprm = dict(sentno=sentno)
    if require:
        urlprm['require'] = require
    if block:
        urlprm['block'] = block
    require, block = parseconstraints(require, block)
    if require or block:
        session['actions'][CONSTRAINTS] += 1
        session.modified = True
    senttok, parsetrees, messages, elapsed = WORKERS[username].submit(
        worker.getparses,
        sent, require, block).result()
    maxdepth = ''
    dep = depsvg = ''
    if not parsetrees:
        result = ('no parse! reload page to clear constraints, '
                'or continue with next sentence.')
        nbest = ''
    else:
        if workerattr('headrules'):
            dep = writedependencies(parsetrees[0][1], senttok, 'conll')
            depsvg = Markup(DrawDependencies.fromconll(dep).svg())
        result = ''
        dectree, maxdepth, _ = decisiontree(parsetrees, senttok, urlprm)
        prob, ptree, _treestr, _fragments = parsetrees[0]
        treeobj = ActivedopTree(ptree=ptree, senttok=senttok)
        besttree = (
            '%(n)d. [%(prob)s] '
            '<a href="/annotate/accept?%(urlprm)s">accept this tree</a>; '
            '<a href="/annotate/edit?%(urlprm)s">edit</a>; '
            '<a href="/annotate/deriv?%(urlprm)s">derivation</a>\n\n'
            '%(tree)s'
            % dict(
                n=1,
                prob=probstr(prob),
                urlprm=urlencode(dict(urlprm, n=1)),
                tree=treeobj.gtree()))
        nbest = Markup('%s\nbest tree: %s' % (dectree, besttree))
    msg = '\n'.join(messages)
    elapsed = 'CPU time elapsed: %s => %gs' % (
        ' '.join('%gs' % a for a in elapsed), sum(elapsed))
    # entries without a tree string are left out of the listing
    listing = ''.join(
        '%d. [%s] %s' % (n + 1, probstr(prob),
            writediscbrackettree(treestr, senttok))
        for n, (prob, _tree, treestr, _deriv) in enumerate(parsetrees)
        if treestr is not None)
    info = '\n'.join((
        'length: %d;' % len(senttok), msg, elapsed,
        'most probable parse trees:',
        listing + '\n'))
    return render_template('annotatetree.html', sent=sent, result=result,
            nbest=nbest, info=info, dep=dep, depsvg=depsvg,
            maxdepth=maxdepth,
            msg='%d parse trees' % len(parsetrees))
@app.route('/annotate/filter')
@loginrequired
def filterparsetrees():
    """Return the n-best parses of a sentence that satisfy extra constraints.

    The sentence is parsed with the ``require`` / ``block`` constraints and
    the result is filtered with ``frequire`` / ``fblock``; entries without a
    tree string are always kept. Each call counts as a constraint action.

    :returns: markup listing the remaining trees with accept/edit/derivation
        links, or a plain message when nothing is left.
    """
    username = session['username']
    session['actions'][CONSTRAINTS] += 1
    session.modified = True
    sentno = int(request.args.get('sentno'))  # 1-indexed
    sent = SENTENCES[QUEUE[sentno - 1][0]]
    urlprm = dict(sentno=sentno)
    require = request.args.get('require', '')
    block = request.args.get('block', '')
    if require:
        urlprm['require'] = require
    if block:
        urlprm['block'] = block
    require, block = parseconstraints(require, block)
    frequire, fblock = parseconstraints(
        request.args.get('frequire', ''),
        request.args.get('fblock', ''))
    senttok, parsetrees, _messages, _elapsed = WORKERS[username].submit(
        worker.getparses,
        sent, require, block).result()
    # keep the original rank of every tree that passes the filter
    kept = []
    for rank, (prob, ptree, treestr, _frags) in enumerate(parsetrees):
        if treestr is None or testconstraints(treestr, frequire, fblock):
            kept.append(
                (rank, prob, ActivedopTree(ptree=ptree, senttok=senttok)))
    if not kept:
        return ('No parse trees after filtering; try pressing Re-parse, '
                'or reload page to clear constraints.\n')
    template = (
        '%(n)d. [%(prob)s] '
        '<a href="/annotate/accept?%(urlprm)s">accept this tree</a>; '
        '<a href="/annotate/edit?%(urlprm)s">edit</a>; '
        '<a href="/annotate/deriv?%(urlprm)s">derivation</a>\n\n'
        '%(tree)s')
    entries = []
    for rank, prob, ad_tree in kept:
        entries.append(template % dict(
            n=rank + 1,
            prob=probstr(prob),
            urlprm=urlencode(dict(urlprm, n=rank + 1)),
            tree=ad_tree.gtree()))
    return Markup('%d parse trees\n%s' % (len(kept), '\n'.join(entries)))
@app.route('/annotate/deriv')
@loginrequired
def showderiv():
    """Render the derivation of the n-th parse tree of a sentence.

    Query arguments: ``n`` and ``sentno`` (both 1-based) plus the optional
    ``require`` / ``block`` constraints the tree was parsed with.

    :returns: preformatted markup with the fragments of the derivation
        followed by a drawing of the tree.
    """
    username = session['username']
    n = int(request.args.get('n'))  # 1-indexed
    sentno = int(request.args.get('sentno'))  # 1-indexed
    sent = SENTENCES[QUEUE[sentno - 1][0]]
    require, block = parseconstraints(
        request.args.get('require', ''),
        request.args.get('block', ''))
    future = WORKERS[username].submit(worker.getparses, sent, require, block)
    senttok, parsetrees, _messages, _elapsed = future.result()
    _prob, tree, _treestr, fragments = parsetrees[n - 1]
    # one "weight / drawing" paragraph per fragment; none if absent
    fragtext = '\n\n'.join(
        '%s\n%s' % (w, DrawTree(frag).text(unicodelines=True, html=True))
        for frag, w in fragments or ())
    treetext = DrawTree(tree, senttok).text(
        unicodelines=True, html=True, funcsep='-', maxwidth=30)
    return Markup(
        '<pre>Fragments used in the highest ranked derivation'
        ' of this parse tree:\n%s\n%s</pre>' % (fragtext, treetext))
@app.route('/annotate/edit')
@loginrequired
def edit():
    """Edit tree manually.

    Query arguments: ``sentno`` (1-based queue position) and either
    ``annotated=1`` to edit the tree saved in the database, or ``n`` (with
    optional ``require`` / ``block``) to edit the n-th automatic parse. An
    optional ``dec`` is added to the decision-tree action counter.

    :returns: the rendered edittree.html page, or an error string when
        neither ``annotated=1`` nor ``n`` is given.
    """
    sentno = int(request.args.get('sentno'))  # 1-indexed
    lineno = QUEUE[sentno - 1][0]
    sentid = QUEUE[sentno - 1][3]
    sent = SENTENCES[lineno]
    senttok, _ = worker.postokenize(sent)
    username = session['username']
    if 'dec' in request.args:
        session['actions'][DECTREE] += int(request.args.get('dec', 0))
        session.modified = True
    msg = ''
    if request.args.get('annotated') == '1':  # there is a saved tree
        msg = Markup('<font color=red>You have already annotated '
                'this sentence.</font><button id="undo" onclick="undoAccept()">Delete tree from database</button>')
        treestr, n = getannotation(username, sentid)  # tree from database
        treeobj = ActivedopTree.from_str(treestr)
    elif 'n' in request.args:  # edit the nth automatic parse
        msg = Markup('<button id="undo" onclick="goback()">Go back</button>')
        n = int(request.args.get('n', 1))
        session['actions'][NBEST] = n
        # In-place changes to the list are not detected by the session, so
        # flag it explicitly or the choice may not be saved.
        session.modified = True
        require = request.args.get('require', '')
        block = request.args.get('block', '')
        require, block = parseconstraints(require, block)
        resp = WORKERS[username].submit(
            worker.getparses,
            sent, require, block).result()
        senttok, parsetrees, _messages, _elapsed = resp
        ptree = parsetrees[n - 1][1]
        treeobj = ActivedopTree(ptree=ptree, senttok=senttok)
    else:
        return 'ERROR: pass n or tree argument.'
    treetext = treeobj.treestr()
    # size the text area to the tree, with a minimum of five rows
    rows = max(5, treetext.count('\n') + 1)
    punct_tags = app.config['PUNCT_TAGS']
    symbol_tag = app.config['SYMBOL_TAG']
    # punctuation and symbol tags and labels containing '@' are not offered
    poslabels = sorted(
        t for t in workerattr('poslabels')
        if ('@' not in t) and (t not in punct_tags)
        and (t not in punct_tags.values()) and (t != symbol_tag))
    phrasallabels = sorted(
        t for t in workerattr('phrasallabels') if '}' not in t)
    functiontags = sorted(
        t for t in (workerattr('functiontags')
            | set(app.config['FUNCTIONTAGWHITELIST']))
        if '}' not in t and '@' not in t and t != "p")
    return render_template('edittree.html',
            prevlink=('/annotate/annotate/%d' % (sentno - 1))
                if sentno > 1 else '/annotate/annotate/%d' % (len(SENTENCES)),
            nextlink=('/annotate/annotate/%d' % (sentno + 1))
                if sentno < len(SENTENCES) else '/annotate/annotate/1',
            unextlink=('/annotate/annotate/%d' % firstunannotated(username))
                if sentno < len(SENTENCES) else '#',
            treestr=treetext, senttok=' '.join(senttok), id=sentid,
            sentno=sentno, lineno=lineno + 1, totalsents=len(SENTENCES),
            numannotated=numannotated(username),
            poslabels=poslabels,
            phrasallabels=phrasallabels,
            functiontags=functiontags,
            morphtags=sorted(workerattr('morphtags')),
            annotationhelp=ANNOTATIONHELP,
            rows=rows, cols=100,
            msg=msg)
@app.route('/annotate/redraw', methods=['POST'])
@loginrequired
def redraw():
    """Validate the submitted tree and draw it again.

    Expects a JSON body with ``sentno`` and ``tree``. When the tree cannot
    be read, the error text is returned with ``has_error`` set. Otherwise
    the edit distance to the ``oldtree`` query argument (if given and
    different) is added to the session's action counters.

    :returns: JSON with ``html`` (message, accept link and drawing) and
        ``has_error``.
    """
    payload = request.get_json()
    sentno = int(payload.get('sentno'))  # 1-indexed
    acceptlink = (
        '<a href="#" onclick="accept()">accept this tree</a>\n'
        '<input type="hidden" id="sentno" value="%d">' % sentno)

    def reply(message, drawing, failed):
        """Build the JSON response shared by the success and error paths."""
        html = Markup('%s\n\n%s\n\n%s' % (message, acceptlink, drawing))
        return jsonify({'html': html, 'has_error': failed})

    try:
        treeobj = ActivedopTree.from_str(payload.get('tree'))
        msg = treeobj.validate()
    except ValueError as err:
        return reply(str(err), '', True)
    # compare the trees with all whitespace runs collapsed
    newtree = re.sub(r'\s+', ' ', str(treeobj.treestr()))
    oldtree = re.sub(r'\s+', ' ', request.args.get('oldtree', ''))
    if oldtree and newtree != oldtree:
        session['actions'][EDITDIST] += editdistance(newtree, oldtree)
        session.modified = True
    return reply(msg, treeobj.gtree(add_editable_attr=True), False)
def graphical_operation_preamble(treestr):
    """Read a tree from text before a graphical edit is applied to it.

    :returns: ``(treeobj, cgel_tree_terminals)``; the terminals (including
        gaps) are taken from the tree's CGEL form, or are None when
        CGELVALIDATE is not configured.
    """
    treeobj = ActivedopTree.from_str(treestr)
    terminals = None
    if app.config['CGELVALIDATE'] is not None:
        terminals = treeobj.cgel_tree.terminals(gaps=True)
    return treeobj, terminals
def graphical_operation_postamble(dt, senttok, cgel_tree_terminals, sentno):
    """Rebuild, validate and link the tree after a graphical edit.

    :param dt: the DrawTree whose root node holds the edited tree.
    :param senttok: the tokens of the sentence.
    :param cgel_tree_terminals: terminals passed through to ActivedopTree.
    :param sentno: sentence number to embed in the accept link.
    :returns: ``(treeobj, link, msg)``: the new tree object, an HTML link
        for accepting it, and the validation message.
    """
    root = canonicalize(dt.nodes[0])
    treeobj = ActivedopTree(
        ptree=ParentedTree.convert(root),
        senttok=senttok,
        cgel_tree_terminals=cgel_tree_terminals)
    msg = treeobj.validate()
    query = urlencode(dict(sentno=sentno, tree=treeobj.treestr()))
    link = '<a href="/annotate/accept?%s">accept this tree</a>' % query
    return treeobj, link, msg
@app.route('/annotate/newlabel', methods=['POST'])
@loginrequired
def newlabel():
    """Re-draw tree with newly picked label.

    Expects a JSON body with ``tree``, ``nodeid``, ``sentno`` and exactly one
    of ``label``, ``function`` or ``morph``. The chosen part of the node's
    label is replaced (an empty ``function`` or ``morph`` removes that part)
    while the other parts matched by LABELRE are kept.

    :returns: markup with the validation message, accept link, drawing and
        tree text; or the error text when the tree cannot be read.
    :raises ValueError: if none of ``label``, ``function``, ``morph`` is
        given.
    """
    payload = request.get_json()
    treestr = payload.get('tree')
    try:
        treeobj, cgel_tree_terminals = graphical_operation_preamble(treestr)
    except ValueError as err:
        return Markup(str(err))
    senttok = treeobj.senttok
    # FIXME: re-factor; check label AFTER replacing it
    # node ids have the form t<treeid>_<nodeid>
    _treeid, nodeid = payload.get('nodeid', '').lstrip('t').split('_')
    nodeid = int(nodeid)
    dt = DrawTree(treeobj.ptree, treeobj.senttok)
    m = LABELRE.match(dt.nodes[nodeid].label)
    error = ""
    if payload.get('label') is not None:
        # new category; keep function and morphology parts
        relabeled = (payload.get('label', '')
                + (m.group(2) or '')
                + (m.group(3) or ''))
    elif payload.get('function') is not None:
        function = payload.get('function', '')
        if function == '':
            relabeled = '%s%s' % (m.group(1), m.group(3) or '')
        else:
            relabeled = '%s-%s%s' % (
                m.group(1), function, m.group(3) or '')
    elif payload.get('morph') is not None:
        morph = payload.get('morph', '')
        if morph == '':
            relabeled = '%s%s' % (m.group(1), m.group(2) or '')
        else:
            relabeled = '%s%s/%s' % (
                m.group(1), m.group(2) or '', morph)
    else:
        raise ValueError('expected label or function argument')
    # now actually replace label at nodeid
    dt.nodes[nodeid].label = relabeled
    treeobj, link, msg = graphical_operation_postamble(
        dt, senttok, cgel_tree_terminals, int(payload.get('sentno')))
    if error == '':
        session['actions'][RELABEL] += 1
        session.modified = True
    return Markup('%s\n\n%s\n\n%s%s\t%s' % (
        msg,
        link, error,
        treeobj.gtree(add_editable_attr=True),
        treeobj.treestr()))
@app.route('/annotate/reattach', methods=['POST'])
@loginrequired
def reattach():
    """Re-draw tree after re-attaching node under new parent.

    Reads a JSON body with the keys ``tree``, ``nodeid``, ``newparent`` and
    ``sentno``. Depending on their values one of four edits is applied:

    - ``newparent == 'deletenode'``: replace node ``nodeid`` by its children;
    - ``nodeid == 'newproj'``: insert a new node, carrying the same label as
      ``newparent``, between ``newparent`` and its children;
    - ``nodeid`` starts with ``'newlabel_'``: the same, but the new node gets
      the label that follows the prefix;
    - otherwise: move node ``nodeid``, together with the punctuation sisters
      directly adjacent to it, under ``newparent``.

    Returns a ``Markup`` string laid out as: validation message, blank line,
    accept link, blank line, error line, then the drawn tree and the tree
    string separated by a tab. If anything raises while editing, the same
    layout is returned for the unmodified tree, with the exception text as
    the error. If ``graphical_operation_preamble`` rejects the submitted
    tree with a ValueError, only that error text is returned.
    """
    data = request.get_json()
    treestr = data.get('tree')
    try:
        treeobj, cgel_tree_terminals = graphical_operation_preamble(treestr)
    except ValueError as err:
        return Markup(str(err))
    # kludge (can't deep copy treeobj): parse the same string a second time
    # to keep an untouched tree to fall back on and to compare tokens with.
    old_treeobj, _ = graphical_operation_preamble(treestr)
    try:
        dt = DrawTree(treeobj.ptree, treeobj.senttok)
        # stays '' unless the requested edit is refused
        error = ''
        senttok = treeobj.senttok
        if data.get('newparent') == 'deletenode':
            # remove nodeid by replacing it with its children
            # node ids split on '_' into (tree id, index into dt.nodes)
            _treeid, nodeid = data.get('nodeid', '').lstrip('t').split('_')
            nodeid = int(nodeid)
            x = dt.nodes[nodeid]
            # an int as first child is taken to mean x is a POS node
            if nodeid == 0 or isinstance(x[0], int):
                error = 'ERROR: cannot remove ROOT or POS node'
            else:
                children = list(x)
                x[:] = []
                # find the parent of x by identity and splice the children
                # of x into the position x occupied
                for y in dt.nodes[0].subtrees():
                    if any(child is x for child in y):
                        i = y.index(x)
                        y[i:i + 1] = children
                        tree = canonicalize(dt.nodes[0])
                        dt = DrawTree(tree, senttok) # kludge..
                        break
        elif data.get('nodeid', '') == 'newproj':
            # splice in a new node under parentid
            _treeid, newparent = data.get('newparent', ''
                    ).lstrip('t').split('_')
            newparent = int(newparent)
            y = dt.nodes[newparent]
            # the new node re-uses the label of the node it is placed under
            label = y.label
            if isinstance(y[0], int):
                error = 'ERROR: cannot add node under POS tag'
            else:
                children = list(y)
                y[:] = []
                y[:] = [Tree(label, children)]
                tree = canonicalize(dt.nodes[0])
                dt = DrawTree(tree, senttok) # kludge..
        elif data.get('nodeid', '').startswith('newlabel_'):
            # splice in a new node under parentid
            _treeid, newparent = data.get('newparent', ''
                    ).lstrip('t').split('_')
            newparent = int(newparent)
            # the label of the new node is whatever follows 'newlabel_'
            label = data.get('nodeid').split('_', 1)[1]
            y = dt.nodes[newparent]
            if isinstance(y[0], int):
                error = 'ERROR: cannot add node under POS tag'
            else:
                children = list(y)
                y[:] = []
                y[:] = [Tree(label, children)]
                tree = canonicalize(dt.nodes[0])
                dt = DrawTree(tree, senttok) # kludge..
        else: # re-attach existing node at existing new parent
            _treeid, nodeid = data.get('nodeid', '').lstrip('t').split('_')
            nodeid = int(nodeid)
            _treeid, newparent = data.get('newparent', ''
                    ).lstrip('t').split('_')
            newparent = int(newparent)
            # remove node from old parent
            # dt.nodes[nodeid].parent.pop(dt.nodes[nodeid].parent_index)
            x = dt.nodes[nodeid]
            y = dt.nodes[newparent]
            def find_self_and_sisters(tree, subtree):
                """Return the children of the node in ``tree`` that has
                ``subtree`` among its children (so ``subtree`` itself is
                included); return [] when no such node is found."""
                parent = None
                sisters = []
                # Helper function to find the parent of the subtree
                def find_parent(node, target):
                    nonlocal parent
                    # NOTE: `in` compares with ==, not by identity
                    if target in node.children:
                        parent = node
                        return True
                    for child in node.children:
                        # an int child ends the search below this node
                        if isinstance(child, int):
                            return False
                        elif find_parent(child, target):
                            return True
                    return False
                # Find the parent of the subtree
                find_parent(tree, subtree)
                if parent:
                    # Collect all children of the parent node
                    sisters = [child for child in parent.children]
                return sisters
            def extract_adjacent_punctuation(arr, target):
                """Return ``target`` together with the unbroken runs of
                items immediately left and right of it in ``arr`` whose
                label satisfies ``is_punct_label``, in their original
                order; return [] when ``target`` is not in ``arr``."""
                # Find the index of the target character
                try:
                    target_index = arr.index(target)
                except ValueError:
                    return [] # If target is not in the list, return an empty list
                # Initialize the result list with the target character
                result = [target]
                # Collect punctuation characters to the left of the target
                left_index = target_index - 1
                while left_index >= 0 and is_punct_label(arr[left_index].label):
                    result.insert(0, arr[left_index])
                    left_index -= 1
                # Collect punctuation characters to the right of the target
                right_index = target_index + 1
                while right_index < len(arr) and is_punct_label(arr[right_index].label):
                    result.append(arr[right_index])
                    right_index += 1
                return result
            # refuse to move x when the new parent is among x.subtrees()
            for node in x.subtrees():
                if node is y:
                    error = ('ERROR: cannot re-attach subtree'
                            ' under (descendant of) itself\n')
                    break
            else:
                # only reached when the loop above did not break;
                # look up the current parent of x by identity
                for node in dt.nodes[0].subtrees():
                    if any(child is x for child in node):
                        if len(node) > 1:
                            self_and_sisters = find_self_and_sisters(dt.nodes[0], x)
                            self_and_nearbypunct = extract_adjacent_punctuation(self_and_sisters, x)
                            for s in self_and_nearbypunct:
                                # iteratively move all sister punctuation to the target.
                                # (prevents problematic crossover movement of non-punctuation nodes over punctuation nodes)
                                # punctuation positions are subsequently re-canonicalized when ActivedopTree is reconstructed
                                node.remove(s)
                                dt.nodes[newparent].append(s)
                            tree = canonicalize(dt.nodes[0])
                            dt = DrawTree(tree, senttok) # kludge..
                        else:
                            error = ('ERROR: re-attaching only child creates'
                                    ' empty node %s; remove manually\n' % node)
                        break
        treeobj, link, msg = graphical_operation_postamble(dt, senttok, cgel_tree_terminals, int(data.get('sentno')))
        # this ValueError is caught by the handler below, so the client
        # gets the unmodified tree back together with the message
        if treeobj.senttok != old_treeobj.senttok:
            raise ValueError('movement would result in reordered tokens')
        # only edits that were actually applied are counted
        if error == '':
            session['actions'][REATTACH] += 1
            session.modified = True
        return Markup('%s\n\n%s\n\n%s%s\t%s' % (
                msg,
                link, error + "\n",
                treeobj.gtree(add_editable_attr=True),
                treeobj.treestr()))
    except Exception as err:
        # any failure during the edit: answer with the tree as submitted,
        # re-validated, and report the exception text as the error
        msg = old_treeobj.validate()
        link = ('<a href="/annotate/accept?%s">accept this tree</a>'
                % urlencode(dict(sentno=int(data.get('sentno')), tree=old_treeobj.treestr())))
        error = "ERROR: " + str(err)
        return Markup('%s\n\n%s\n\n%s%s\t%s' % (
                msg,
                link, error + "\n",
                old_treeobj.gtree(add_editable_attr=True),
                old_treeobj.treestr()))
@app.route('/annotate/reparsesubtree')
@loginrequired
def reparsesubtree():
"""Re-parse selected subtree."""
sentno = int(request.args.get('sentno')) # 1-indexed
username = session['username']
treeobj = ActivedopTree.from_str(request.args.get('tree'))
dt = DrawTree(treeobj.ptree, treeobj.senttok)
_treeid, nodeid = request.args.get('nodeid', '').lstrip('t').split('_')
nodeid = int(nodeid)
subseq = sorted(dt.nodes[nodeid].leaves())
subsent = ' '.join(treeobj.senttok[n] for n in subseq)
# FIXME only works when root label of tree matches label in grammar.
# need a single label that works across all stages.
root = dt.nodes[nodeid].label
# root = grammar.tolabel[next(iter(grammar.tblabelmapping[root]))]