Skip to content

Commit

Permalink
add web interface for spell checker.
Browse files Browse the repository at this point in the history
  • Loading branch information
anderscui committed Mar 13, 2016
1 parent b674dc2 commit e3e3468
Show file tree
Hide file tree
Showing 3 changed files with 180 additions and 57 deletions.
164 changes: 160 additions & 4 deletions web/spell/checker/views.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,169 @@
# coding=utf-8
import os
import datetime
import re
import time

import operator
from caching.base import cached
from django.shortcuts import render

from django.conf import settings

# Create your views here.
is_cached = False


def get_obj():
time.sleep(2)
return 'hello'


def data_dir():
# return os.path.join(settings.BASE_DIR, '../data/')
return r'D:\andersc\github\spellchecker\data'


print(data_dir())
## Helpers
def product(nums):
return reduce(operator.mul, nums, 1)


class Pdist(dict):
"""A probability distribution estimated from counts in datafile."""
def __init__(self, data=[], N=None, missingfn=None):
for key, count in data:
self[key] = self.get(key, 0) + int(count)
self.N = float(N or sum(self.itervalues()))
self.missingfn = missingfn or (lambda k, N: 1. / N)

def __call__(self, key):
if key in self:
return self[key] / self.N
else:
return self.missingfn(key, self.N)


def datafile(name, sep='\t'):
"""Read key, vlaue pairs from file."""
for line in file(os.path.join(data_dir(), name)):
yield line.split(sep)


def avoid_long_words(key, N):
"""Estimate the probability of an unknown word."""
return 10. / (N * 10 ** len(key))


def get_primary_dict():
return Pdist(datafile('ngrams/primary_dict.txt'), None, avoid_long_words)


def get_valid_prefixes():
return set(w[:i] for w in Pw for i in xrange(len(w)+1))


def get_p1edit():
"""the prob of single edits"""
return Pdist(datafile('norvig/count_1edit.txt'))


Pw = cached(get_primary_dict, 'primary_dict', 600)
PREFIXES = cached(get_valid_prefixes, 'prefixes', 600)
# P1edit = cached(get_p1edit, 'p1edit', 600)
P1edit = get_p1edit()

# def cPw(word, prev):
# """
# The conditional probability P(word | previous word)
# """
# try:
# return P2w[prev + ' ' + word] / float(Pw[prev])
# except KeyError:
# return Pw(word)
#
# # TODO: cachable
# # P2w = Pdist(datafile('../data/norvig/count_2w.txt'), N)
# P2w = Pdist(datafile('../data/ngrams/bigram_dict_less.txt'), None)


# common error rate
p_spell_error = 1./20

def Pedit(edit):
# prob of 'right is right'
if edit == '':
return 1. - p_spell_error

return p_spell_error * product(P1edit(e) for e in edit.split('+'))


alphabet = 'abcdefghijklmnopqrstuvwxyz'


def edits(word, d=2):
results = {}

def editsR(hd, tl, d, edits):
def ed(L, R):
return edits + [R + '|' + L]
C = hd+tl
# if C is in the dict
if C in Pw:
e = '+'.join(edits)
if C not in results:
results[C] = e
else:
results[C] = max(results[C], e, key=Pedit)

if d <= 0:
return

extensions = [hd+c for c in alphabet if hd+c in PREFIXES]
# previous char
p = (hd[-1] if hd else '<')

# insertion
for h in extensions:
editsR(h, tl, d-1, ed(p+h[-1], p))
if not tl:
return

# deletion
editsR(hd, tl[1:], d-1, ed(p, p+tl[0]))

# replacement
for h in extensions:
# match
if h[-1] == tl[0]:
editsR(h, tl[1:], d, edits)
else:
editsR(h, tl[1:], d-1, ed(h[-1], tl[0]))

# transpose
if len(tl) >= 2 and tl[0] != tl[1] and hd+tl[1] in PREFIXES:
editsR(hd+tl[1], tl[0]+tl[2:], d-1, ed(tl[1]+tl[0], tl[0:2]))

# body of edits
editsR('', word, d, [])
return results


def correct(w):

candicates = edits(w).items()
c, edit = max(candicates, key=lambda (c, e): Pedit(e) * Pw(c))
return [c]


def index(request):
return render(request, 'checker/index.html', {'page_name': 'checker.index'})

word = request.GET.get('word', '')
if not word:
return render(request, 'checker/index.html', {'page_name': 'checker.index'})

c = correct(word)
return render(request, 'checker/index.html', {'page_name': 'checker.index',
'word': word,
'corrections': c})


def stats(request):
Expand Down
12 changes: 12 additions & 0 deletions web/spell/spell/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,18 @@
}
}

CACHES = {
'default': {
'BACKEND': 'django.core.cache.backends.locmem.LocMemCache',
'LOCATION': 'unique-snowflake',
},
'cache_machine': {
'BACKEND': 'django.core.cache.backends.locmem.LocMemCache',
'LOCATION': 'unique-machine',
'KEY_PREFIX': 'spell:',
}
}


# Password validation
# https://docs.djangoproject.com/en/1.9/ref/settings/#auth-password-validators
Expand Down
61 changes: 8 additions & 53 deletions web/spell/templates/checker/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
<form id="search_form" role="form" method="get" action="{% url 'checker.index' %}">

<div class="input-group">
<input type="text" id="query" name="q" class="form-control"
placeholder="Please input your words" value="{{ query }}" autocomplete="on"
<input type="text" id="word" name="word" class="form-control"
placeholder="Please input your words" value="{{ word }}" autocomplete="on"
maxlength="64" tabindex="1" width="570px"/>

<span class="input-group-btn">
Expand All @@ -25,62 +25,17 @@

<div id="search_result">

{% if query %}
{% if pos_list %}
<nav>
<ul class="pagination">
<li class="{% if page <= 1 %}disabled{% endif %}">
<a href="{% url 'search.index' %}?q={{ query }}&pn={{ prev }}&pl={{ plen }}" aria-label="Previous">
<span aria-hidden="true">&laquo;</span>
</a>
</li>
{% for p in pages %}
<li class="{% if p == page %}active{% endif %}">
<a href="{% url 'search.index' %}?q={{ query }}&pn={{ p }}&pl={{ plen }}">{{ p }}</a> </li>
{% endfor %}
<li class="{% if page >= total_pages %}disabled{% endif %}">
<a href="{% url 'search.index' %}?q={{ query }}&pn={{ next }}&pl={{ plen }}" aria-label="Next">
<span aria-hidden="true">&raquo;</span>
</a>
</li>
<li>
<span>{{ page }} / {{ total_pages }}</span>
</li>
</ul>
</nav>
{% if word %}
{% if corrections %}

<div class="list-group">

{% for p in pos_list %}
<a href="http://www.lagou.com/jobs/{{ p.id }}.html?source=search" class="list-group-item">
<h4 class="list-group-item-heading">{{ p.name }} ({{ p.salary }})</h4>
<p class="list-group-item-text">{{ p.com_name }}</p>
<p class="list-group-item-text">{{ p.advantage }}</p>
</a>
{% for p in corrections %}
<span class="list-group-item">
<p class="list-group-item-text">{{ p }}</p>
</span>
{% endfor %}
</div>

<nav>
<ul class="pagination">
<li class="{% if page <= 1 %}disabled{% endif %}">
<a href="{% url 'search.index' %}?q={{ query }}&pn={{ prev }}&pl={{ plen }}" aria-label="Previous">
<span aria-hidden="true">&laquo;</span>
</a>
</li>
{% for p in pages %}
<li class="{% if p == page %}active{% endif %}">
<a href="{% url 'search.index' %}?q={{ query }}&pn={{ p }}&pl={{ plen }}">{{ p }}</a> </li>
{% endfor %}
<li class="{% if page >= total_pages %}disabled{% endif %}">
<a href="{% url 'search.index' %}?q={{ query }}&pn={{ next }}&pl={{ plen }}" aria-label="Next">
<span aria-hidden="true">&raquo;</span>
</a>
</li>
<li>
<span>{{ page }} / {{ total_pages }}</span>
</li>
</ul>
</nav>
{% else %}
<div>No words found.</div>
{% endif %}
Expand Down

0 comments on commit e3e3468

Please sign in to comment.