Skip to content

Commit

Permalink
check spelling by bigrams
Browse files Browse the repository at this point in the history
  • Loading branch information
anderscui committed Mar 13, 2016
1 parent e3e3468 commit f86f9cb
Show file tree
Hide file tree
Showing 6 changed files with 139 additions and 27 deletions.
1 change: 1 addition & 0 deletions context_checker/norvig_ngrams2.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ def test_corrections():
# ref: http://textblob.readthedocs.org/en/dev/quickstart.html#spelling-correction
# TODO: the corrections produced for this sample sentence are still too poor
print(corrections('I havv goood speling!'))
print(correct('Thiss'))

# Norvig's sample
# 13 of 15 are OK, but acommodations and mispellings are left there.
Expand Down
2 changes: 1 addition & 1 deletion web/spell/checker/templatetags/checker_extras.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ def cur_time(fmt_str):


# Sidebar navigation entries as (label, URL pattern name) pairs.
side_pages = [
    (u'Spelling Checker', 'checker.index'),
    (u'Spelling Bigram Checker', 'checker.bigram'),
    (u'Ngrams Stats', 'checker.stats'),
    # Disabled entry:
    # (u'Quick Search', 'checker.index'),
]


Expand Down
1 change: 1 addition & 0 deletions web/spell/checker/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@

# Routes of the checker app: single-word check, bigram check and stats.
# The `name` values are the identifiers used when reversing URLs.
urlpatterns = [
    url(r'^$', views.index, name='checker.index'),
    url(r'^bigram$', views.check_bigram, name='checker.bigram'),
    url(r'^stats$', views.stats, name='checker.stats'),
]
109 changes: 83 additions & 26 deletions web/spell/checker/views.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# coding=utf-8
import heapq
import os
import time

Expand All @@ -7,6 +8,7 @@
from django.shortcuts import render

from django.conf import settings
from math import log10

is_cached = False

Expand All @@ -18,18 +20,27 @@ def get_obj():

def data_dir():
    """Return the directory that holds the spell-checker data files."""
    # TODO: use relative path, e.g.
    #   os.path.join(settings.BASE_DIR, '../data/')
    location = r'D:\andersc\github\spellchecker\data'
    return location


# Debug aid: prints the configured data directory.
# NOTE(review): this looks like a module-level statement that runs on import,
# but indentation is lost in this view -- confirm.
print(data_dir())
## Helpers
def product(nums):
    """Return the product of all values in nums (1 when nums is empty)."""
    result = 1
    for value in nums:
        result = operator.mul(result, value)
    return result


def missing_func(k, n):
    """Return the uniform estimate 1/n for a missing key.

    k is accepted but not used; n is the divisor.
    """
    uniform_share = 1. / n
    return uniform_share


def avoid_long_words(key, N):
    """Estimate the probability of an unknown word.

    The estimate is 10 / (N * 10 ** len(key)), so every extra character
    in key divides the result by ten.
    """
    length_penalty = 10 ** len(key)
    return 10. / (N * length_penalty)


class Pdist(dict):
"""A probability distribution estimated from counts in datafile."""
def __init__(self, data=[], N=None, missingfn=None):
def __init__(self, data=[], N=None, missingfn=missing_func):
for key, count in data:
self[key] = self.get(key, 0) + int(count)
self.N = float(N or sum(self.itervalues()))
Expand All @@ -48,41 +59,43 @@ def datafile(name, sep='\t'):
yield line.split(sep)


def avoid_long_words(key, N):
    """Estimate the probability of an unknown word.

    Computed as 10 / (N * 10 ** len(key)): longer keys get smaller values.
    """
    denominator = N * 10 ** len(key)
    return 10. / denominator


def get_primary_dict():
    """Build the Pdist for 'ngrams/primary_dict.txt'.

    N is left as None and avoid_long_words is passed as the third
    (missing-key function) argument.
    """
    rows = datafile('ngrams/primary_dict.txt')
    return Pdist(rows, None, avoid_long_words)


def get_bigram_dict():
    """Build the Pdist for '../data/ngrams/bigram_dict_less.txt' with N=None."""
    rows = datafile('../data/ngrams/bigram_dict_less.txt')
    return Pdist(rows, None)


def get_valid_prefixes():
    """Return the set of every prefix of every key in Pw.

    Each key contributes all slices key[:0] .. key[:len(key)], i.e. the
    empty string and the whole key are included.
    """
    prefixes = set()
    for word in Pw:
        for end in xrange(len(word) + 1):
            prefixes.add(word[:end])
    return prefixes


def get_p1edit():
    """Build the Pdist of single-edit counts from 'norvig/count_1edit.txt'."""
    # TODO: use a better missingfn
    edit_rows = datafile('norvig/count_1edit.txt')
    return Pdist(edit_rows)


# Shared lookup tables, each obtained by passing a loader function, a key
# string and 600 to cached().
# NOTE(review): cached() is not visible here; 600 is presumably a timeout in
# seconds -- confirm.
# get_valid_prefixes() iterates over Pw, so Pw has to exist before that
# loader runs.
Pw = cached(get_primary_dict, 'primary_dict', 600)
P2w = cached(get_bigram_dict, 'bigram_dict', 600)
PREFIXES = cached(get_valid_prefixes, 'prefixes', 600)
# P1edit = cached(get_p1edit, 'p1edit', 600)
P1edit = get_p1edit()

# def cPw(word, prev):
# """
# The conditional probability P(word | previous word)
# """
# try:
# return P2w[prev + ' ' + word] / float(Pw[prev])
# except KeyError:
# return Pw(word)
#
# # TODO: cachable
# # P2w = Pdist(datafile('../data/norvig/count_2w.txt'), N)
# P2w = Pdist(datafile('../data/ngrams/bigram_dict_less.txt'), None)
P1edit = cached(get_p1edit, 'p1edit', 600)


def cPw(word, prev):
    """Conditional probability P(word | prev).

    Computed as P2w['<prev> <word>'] / Pw[prev]. When either lookup raises
    KeyError the result is Pw(word) instead.
    """
    bigram = prev + ' ' + word
    try:
        pair_value = P2w[bigram]
        prev_value = float(Pw[prev])
    except KeyError:
        return Pw(word)
    return pair_value / prev_value


def sPw(w, prev, next):
    """Return the log10 context score of w between prev and next.

    The score is log10(cPw(w, prev)) + log10(cPw(next, w)). It is computed
    once and reused for both the debug line and the return value, instead
    of evaluating cPw/log10 a second time just for printing.

    Parameters:
        w:    the candidate word.
        prev: the token before w.
        next: the token after w (name kept for caller compatibility,
              although it shadows the builtin inside this function).
    """
    score = log10(cPw(w, prev)) + log10(cPw(next, w))
    # Debug trace: "<prev> <w> <next> <score>".
    print(' '.join([prev, w, next, str(score)]))
    return score


# common error rate
Expand Down Expand Up @@ -147,11 +160,37 @@ def ed(L, R):
return results


def correct(w, n=1):
    """Return up to n candidate corrections for w, best first.

    Candidates are the (candidate, edit) pairs from edits(w).items(); each
    pair is scored as Pedit(edit) * Pw(candidate) and the n highest-scoring
    candidates are returned as a list of strings. With the default n=1 the
    result is a one-element list, as with the earlier max()-based version.
    """
    candidates = edits(w).items()

    def score(pair):
        # Unpack inside the function: tuple parameters in a lambda are
        # Python-2-only syntax.
        candidate, edit = pair
        return Pedit(edit) * Pw(candidate)

    most_likely = heapq.nlargest(n, candidates, key=score)
    return [candidate for candidate, _edit in most_likely]


def correct_bigram(words, n=1):
    """Correct the first token of words that is not a key of Pw.

    Returns a list of up to n sentences (tokens joined by single spaces) in
    which that token is replaced by a candidate from edits(). Candidates are
    ranked by log10(Pedit(edit)) + sPw(candidate, prev, next), where prev and
    next are the neighbouring tokens, or '<S>' / '</S>' at the sentence
    boundaries. If every token is in Pw the input is returned unchanged as a
    one-element list.
    """
    # Debug traces.
    # NOTE(review): under Python 2 without print_function, `print()` emits
    # "()" rather than a blank line -- confirm which is intended.
    print(words)
    print(len(words))
    print()

    # Locate the first token missing from Pw; -1 means none is missing.
    unknown_at = -1
    for position, token in enumerate(words):
        if token not in Pw:
            unknown_at = position
            break
    print('idx: ' + str(unknown_at))
    if unknown_at < 0:
        return [' '.join(words)]

    candidates = edits(words[unknown_at]).items()
    left = words[unknown_at - 1] if unknown_at > 0 else '<S>'
    right = words[unknown_at + 1] if unknown_at < (len(words) - 1) else '</S>'

    def context_score(pair):
        candidate, edit = pair
        return log10(Pedit(edit)) + sPw(candidate, left, right)

    ranked = heapq.nlargest(n, candidates, key=context_score)

    head = words[0:unknown_at]
    tail = words[unknown_at + 1:]
    return [' '.join(head + [candidate] + tail) for candidate, _edit in ranked]


# view functions ###


def index(request):
Expand All @@ -160,8 +199,26 @@ def index(request):
if not word:
return render(request, 'checker/index.html', {'page_name': 'checker.index'})

c = correct(word)
n = request.GET.get('n', '5')
grams = request.GET.get('grams', 1)

in_dict = 'real word' if word in Pw else 'non-word'
c = correct(word, int(n))
return render(request, 'checker/index.html', {'page_name': 'checker.index',
'word': word,
'in_dict': in_dict,
'corrections': c})


def check_bigram(request):
    """Render the bigram checker page.

    Query parameters:
        word: the text to check. Split on any run of whitespace so that
              repeated, leading or trailing spaces do not produce empty
              tokens (which would be treated as misspelled words).
        n:    maximum number of corrections to show; defaults to 5, and a
              non-numeric value falls back to 5 instead of raising.

    When `word` is missing or contains only whitespace the empty form is
    rendered; otherwise the template receives 'word' (as submitted) and
    'corrections' (the list returned by correct_bigram).
    """
    word = request.GET.get('word', '')
    words = word.split()
    if not words:
        return render(request, 'checker/bigram.html', {'page_name': 'checker.bigram'})

    # `n` comes straight from the query string, so it may not be a number.
    try:
        n = int(request.GET.get('n', '5'))
    except ValueError:
        n = 5

    c = correct_bigram(words, n)
    return render(request, 'checker/bigram.html', {'page_name': 'checker.bigram',
                                                   'word': word,
                                                   'corrections': c})

Expand Down
49 changes: 49 additions & 0 deletions web/spell/templates/checker/bigram.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
{% extends 'base.html' %}
{% load staticfiles %}
{# Bigram spell-checker page: a search form plus the list of suggested corrections. #}

{% block title %}Spell Checker{% endblock %}

{% block page_header %}
{% endblock %}

{% block body_block %}

    <form id="search_form" role="form" method="get" action="{% url 'checker.bigram' %}">

        <div class="input-group">
            <input type="text" id="word" name="word" class="form-control"
                   placeholder="Please input your words" value="{{ word }}" autocomplete="on"
                   maxlength="64" tabindex="1" width="570px"/>

            <span class="input-group-btn">
                <input type="submit" id="check_button" class="btn btn-default" value="Check"/>
            </span>
        </div>
        <br/>

    </form>

    <div id="search_result">

        {% if word %}
            {% if corrections %}

                <div class="list-group">

                    {# The bigram view does not pass in_dict; only show the row when it is set. #}
                    {% if in_dict %}
                        <span class="list-group-item">
                            <strong><p class="list-group-item-text">{{ in_dict }}</p></strong>
                        </span>
                    {% endif %}

                    {% for p in corrections %}
                        <span class="list-group-item">
                            <p class="list-group-item-text">{{ p }}</p>
                        </span>
                    {% endfor %}
                </div>
            {% else %}
                <div>No words found.</div>
            {% endif %}
        {% endif %}
    </div>

{% endblock %}
4 changes: 4 additions & 0 deletions web/spell/templates/checker/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@

<div class="list-group">

<span class="list-group-item">
<strong><p class="list-group-item-text">{{ in_dict }}</p></strong>
</span>

{% for p in corrections %}
<span class="list-group-item">
<p class="list-group-item-text">{{ p }}</p>
Expand Down

0 comments on commit f86f9cb

Please sign in to comment.