From f86f9cbe7761015a3206b0a3413571432e2db80f Mon Sep 17 00:00:00 2001 From: andersc Date: Mon, 14 Mar 2016 00:38:20 +0800 Subject: [PATCH] check spelling by bigrams --- context_checker/norvig_ngrams2.py | 1 + .../checker/templatetags/checker_extras.py | 2 +- web/spell/checker/urls.py | 1 + web/spell/checker/views.py | 109 +++++++++++++----- web/spell/templates/checker/bigram.html | 49 ++++++++ web/spell/templates/checker/index.html | 4 + 6 files changed, 139 insertions(+), 27 deletions(-) create mode 100644 web/spell/templates/checker/bigram.html diff --git a/context_checker/norvig_ngrams2.py b/context_checker/norvig_ngrams2.py index 9891b31..47c1377 100644 --- a/context_checker/norvig_ngrams2.py +++ b/context_checker/norvig_ngrams2.py @@ -251,6 +251,7 @@ def test_corrections(): # ref: http://textblob.readthedocs.org/en/dev/quickstart.html#spelling-correction # TODO: too poor print(corrections('I havv goood speling!')) + print(correct('Thiss')) # Norvig's sample # 13 of 15 are OK, but acommodations and mispellings are left there. diff --git a/web/spell/checker/templatetags/checker_extras.py b/web/spell/checker/templatetags/checker_extras.py index 4350023..72fe900 100644 --- a/web/spell/checker/templatetags/checker_extras.py +++ b/web/spell/checker/templatetags/checker_extras.py @@ -10,8 +10,8 @@ def cur_time(fmt_str): side_pages = [(u'Spelling Checker', 'checker.index'), + (u'Spelling Bigram Checker', 'checker.bigram'), (u'Ngrams Stats', 'checker.stats'), - #(u'Quick Search', 'checker.index'), ] diff --git a/web/spell/checker/urls.py b/web/spell/checker/urls.py index ae75c50..f596328 100644 --- a/web/spell/checker/urls.py +++ b/web/spell/checker/urls.py @@ -4,5 +4,6 @@ urlpatterns = [ url(r'^$', views.index, name='checker.index'), + url(r'^bigram$', views.check_bigram, name='checker.bigram'), url(r'^stats$', views.stats, name='checker.stats'), ] diff --git a/web/spell/checker/views.py b/web/spell/checker/views.py index 2bac520..c4c6d14 100644 --- a/web/spell/checker/views.py +++ b/web/spell/checker/views.py @@ -1,4 +1,5 @@ # coding=utf-8 +import heapq import os import time @@ -7,6 +8,7 @@ from django.shortcuts import render from django.conf import settings +from math import log10 is_cached = False @@ -18,18 +20,27 @@ def get_obj(): def data_dir(): # return os.path.join(settings.BASE_DIR, '../data/') + # TODO: use relative path return r'D:\andersc\github\spellchecker\data' -print(data_dir()) ## Helpers def product(nums): return reduce(operator.mul, nums, 1) +def missing_func(k, n): + return 1. / n + + +def avoid_long_words(key, N): + """Estimate the probability of an unknown word.""" + return 10. / (N * 10 ** len(key)) + + class Pdist(dict): """A probability distribution estimated from counts in datafile.""" - def __init__(self, data=[], N=None, missingfn=None): + def __init__(self, data=[], N=None, missingfn=missing_func): for key, count in data: self[key] = self.get(key, 0) + int(count) self.N = float(N or sum(self.itervalues())) @@ -48,41 +59,43 @@ def datafile(name, sep='\t'): yield line.split(sep) -def avoid_long_words(key, N): - """Estimate the probability of an unknown word.""" - return 10. 
/ (N * 10 ** len(key))
-
-
 def get_primary_dict():
     return Pdist(datafile('ngrams/primary_dict.txt'), None, avoid_long_words)
 
 
+def get_bigram_dict():
+    return Pdist(datafile('../data/ngrams/bigram_dict_less.txt'), None)
+
+
 def get_valid_prefixes():
     return set(w[:i] for w in Pw for i in xrange(len(w)+1))
 
 
 def get_p1edit():
     """the prob of single edits"""
+    # TODO: use a better missingfn
     return Pdist(datafile('norvig/count_1edit.txt'))
 
 
 Pw = cached(get_primary_dict, 'primary_dict', 600)
+P2w = cached(get_bigram_dict, 'bigram_dict', 600)
 PREFIXES = cached(get_valid_prefixes, 'prefixes', 600)
-# P1edit = cached(get_p1edit, 'p1edit', 600)
-P1edit = get_p1edit()
-
-# def cPw(word, prev):
-#     """
-#     The conditional probability P(word | previous word)
-#     """
-#     try:
-#         return P2w[prev + ' ' + word] / float(Pw[prev])
-#     except KeyError:
-#         return Pw(word)
-#
-# # TODO: cachable
-# # P2w = Pdist(datafile('../data/norvig/count_2w.txt'), N)
-# P2w = Pdist(datafile('../data/ngrams/bigram_dict_less.txt'), None)
+P1edit = cached(get_p1edit, 'p1edit', 600)
+
+
+def cPw(word, prev):
+    """
+    The conditional probability P(word | previous word)
+    """
+    try:
+        return P2w[prev + ' ' + word] / float(Pw[prev])
+    except KeyError:
+        return Pw(word)
+
+
+def sPw(w, prev, nxt):
+    print(' '.join([prev, w, nxt, str(log10(cPw(w, prev)) + log10(cPw(nxt, w)))]))
+    return log10(cPw(w, prev)) + log10(cPw(nxt, w))
 
 
 # common error rate
@@ -147,11 +160,37 @@ def ed(L, R):
     return results
 
 
-def correct(w):
+def correct(w, n=1):
    candicates = edits(w).items()
-    c, edit = max(candicates, key=lambda (c, e): Pedit(e) * Pw(c))
-    return [c]
+    most_likely = heapq.nlargest(n, candicates, key=lambda (c, e): Pedit(e) * Pw(c))
+    # c, edit = max(candicates, key=lambda (c, e): Pedit(e) * Pw(c))
+    return [c for c, edit in most_likely]
+
+
+def correct_bigram(words, n=1):
+    """Correct the first unknown word in words, scored by its bigram context."""
+    print(words)
+    print(len(words))
+    print('')
+
+    idx = -1
+    for i, elem in enumerate(words):
+        if elem not in Pw:
+            idx = i
+            break
+    print('idx: ' + str(idx))
+    if idx < 0:
+        return [' '.join(words)]
+
+    candidates = edits(words[idx]).items()
+    prev = words[idx-1] if idx > 0 else ''
+    nxt = words[idx+1] if idx < (len(words) - 1) else ''
+    most_likely = heapq.nlargest(n, candidates, key=lambda (c, e): log10(Pedit(e)) + sPw(c, prev, nxt))
+    return [' '.join(words[0:idx] + [c] + words[idx+1:]) for c, edit in most_likely]
+
+
+# view functions ###
 
 
 def index(request):
 
@@ -160,8 +199,26 @@ def index(request):
     word = request.GET.get('word', '')
     if not word:
         return render(request, 'checker/index.html', {'page_name': 'checker.index'})
-    c = correct(word)
+    n = request.GET.get('n', '5')
+    grams = request.GET.get('grams', 1)
+
+    in_dict = 'real word' if word in Pw else 'non-word'
+    c = correct(word, int(n))
     return render(request, 'checker/index.html', {'page_name': 'checker.index',
+                                                  'word': word,
+                                                  'in_dict': in_dict,
                                                   'corrections': c})
+
+
+def check_bigram(request):
+
+    word = request.GET.get('word', '')
+    if not word:
+        return render(request, 'checker/bigram.html', {'page_name': 'checker.bigram'})
+
+    n = request.GET.get('n', '5')
+    c = correct_bigram(word.split(' '), int(n))
+    return render(request, 'checker/bigram.html', {'page_name': 'checker.bigram', 'word': word,
+                                                   'corrections': c})
 
 
diff --git a/web/spell/templates/checker/bigram.html b/web/spell/templates/checker/bigram.html
new file mode 100644
index 0000000..27ac93b
--- /dev/null
+++ b/web/spell/templates/checker/bigram.html
@@ -0,0 +1,49 @@
+{% extends 'base.html' %}
+{% load staticfiles %}
+
+{% block title %}Spell Checker{% endblock %}
+
+{% block page_header %}
+{% endblock %}
+
+{% block body_block %}
+
+<div class="row">
+
+    <form method="get" action=".">
+        <div class="form-group">
+            <label for="word">Phrase:</label>
+            <input type="text" id="word" name="word" value="{{ word }}"/>
+        </div>
+        <button type="submit" class="btn btn-default">Check</button>
+    </form>
+
+    <hr/>
+
+    {% if word %}
+    {% if corrections %}
+
+    <div class="results">
+
+        <p>
+            {{ in_dict }}
+        </p>
+
+        <ul class="list-group">
+            {% for p in corrections %}
+            <li class="list-group-item">
+                {{ p }}
+            </li>
+            {% endfor %}
+        </ul>
+
+    </div>
+
+    {% else %}
+    <p>No words found.</p>
+    {% endif %}
+    {% endif %}
+
+</div>
+
+{% endblock %}
diff --git a/web/spell/templates/checker/index.html b/web/spell/templates/checker/index.html
index 63d8d54..bdd2fc0 100644
--- a/web/spell/templates/checker/index.html
+++ b/web/spell/templates/checker/index.html
@@ -30,6 +30,10 @@
+
+        <p>
+            {{ in_dict }}
+        </p>
 
         {% for p in corrections %}
         <li>
             {{ p }}
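
For reviewers: the scoring chain in views.py (Pdist -> cPw -> sPw) can be
exercised outside Django. Below is a minimal stand-alone sketch; the made-up
toy counts stand in for the real primary_dict.txt / bigram_dict_less.txt
data files, and only the function shapes are taken from the patch.

from math import log10


class Pdist(dict):
    """A probability distribution estimated from (word, count) pairs."""
    def __init__(self, data, N=None, missingfn=None):
        for key, count in data:
            self[key] = self.get(key, 0) + int(count)
        self.N = float(N or sum(self.values()))
        self.missingfn = missingfn or (lambda k, n: 1. / n)

    def __call__(self, key):
        if key in self:
            return self[key] / self.N
        return self.missingfn(key, self.N)


# Toy unigram and bigram counts, for illustration only.
Pw = Pdist([('the', 1000), ('quick', 50), ('brown', 40), ('fox', 30)])
P2w = Pdist([('the quick', 20), ('quick brown', 10), ('brown fox', 8)])


def cPw(word, prev):
    """P(word | prev): bigram estimate, falling back to the unigram model."""
    try:
        return P2w[prev + ' ' + word] / float(Pw[prev])
    except KeyError:
        return Pw(word)


def sPw(w, prev, nxt):
    """Log-probability of w given its left and right neighbours."""
    return log10(cPw(w, prev)) + log10(cPw(nxt, w))


print(sPw('brown', 'quick', 'fox'))   # ~ -1.4: both bigrams are known
print(sPw('browse', 'quick', 'fox'))  # ~ -4.6: falls back to unigram mass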
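
A quick smoke test for the new endpoint is sketched below. It assumes the
checker app's urls.py is included in the project URLconf and that the ngram
data files are in place; the test class name and the sample phrase are made
up, while the URL name and GET parameters come from the patch.

from django.core.urlresolvers import reverse  # django.urls.reverse on Django >= 2.0
from django.test import TestCase


class BigramCheckerTest(TestCase):
    def test_bigram_view_responds(self):
        # 'brwn' is the unknown word correct_bigram should repair in context
        url = reverse('checker.bigram')
        resp = self.client.get(url, {'word': 'the quick brwn fox', 'n': 5})
        self.assertEqual(resp.status_code, 200)
        self.assertContains(resp, 'fox')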