From f86f9cbe7761015a3206b0a3413571432e2db80f Mon Sep 17 00:00:00 2001 From: andersc Date: Mon, 14 Mar 2016 00:38:20 +0800 Subject: [PATCH] check spelling by bigrams --- context_checker/norvig_ngrams2.py | 1 + .../checker/templatetags/checker_extras.py | 2 +- web/spell/checker/urls.py | 1 + web/spell/checker/views.py | 109 +++++++++++++----- web/spell/templates/checker/bigram.html | 49 ++++++++ web/spell/templates/checker/index.html | 4 + 6 files changed, 139 insertions(+), 27 deletions(-) create mode 100644 web/spell/templates/checker/bigram.html diff --git a/context_checker/norvig_ngrams2.py b/context_checker/norvig_ngrams2.py index 9891b31..47c1377 100644 --- a/context_checker/norvig_ngrams2.py +++ b/context_checker/norvig_ngrams2.py @@ -251,6 +251,7 @@ def test_corrections(): # ref: http://textblob.readthedocs.org/en/dev/quickstart.html#spelling-correction # TODO: too poor print(corrections('I havv goood speling!')) + print(correct('Thiss')) # Norvig's sample # 13 of 15 are OK, but acommodations and mispellings are left there. diff --git a/web/spell/checker/templatetags/checker_extras.py b/web/spell/checker/templatetags/checker_extras.py index 4350023..72fe900 100644 --- a/web/spell/checker/templatetags/checker_extras.py +++ b/web/spell/checker/templatetags/checker_extras.py @@ -10,8 +10,8 @@ def cur_time(fmt_str): side_pages = [(u'Spelling Checker', 'checker.index'), + (u'Spelling Bigram Checker', 'checker.bigram'), (u'Ngrams Stats', 'checker.stats'), - #(u'Quick Search', 'checker.index'), ] diff --git a/web/spell/checker/urls.py b/web/spell/checker/urls.py index ae75c50..f596328 100644 --- a/web/spell/checker/urls.py +++ b/web/spell/checker/urls.py @@ -4,5 +4,6 @@ urlpatterns = [ url(r'^$', views.index, name='checker.index'), + url(r'^bigram$', views.check_bigram, name='checker.bigram'), url(r'^stats$', views.stats, name='checker.stats'), ] diff --git a/web/spell/checker/views.py b/web/spell/checker/views.py index 2bac520..c4c6d14 100644 --- a/web/spell/checker/views.py +++ b/web/spell/checker/views.py @@ -1,4 +1,5 @@ # coding=utf-8 +import heapq import os import time @@ -7,6 +8,7 @@ from django.shortcuts import render from django.conf import settings +from math import log10 is_cached = False @@ -18,18 +20,27 @@ def get_obj(): def data_dir(): # return os.path.join(settings.BASE_DIR, '../data/') + # TODO: use relative path return r'D:\andersc\github\spellchecker\data' -print(data_dir()) ## Helpers def product(nums): return reduce(operator.mul, nums, 1) +def missing_func(k, n): + return 1. / n + + +def avoid_long_words(key, N): + """Estimate the probability of an unknown word.""" + return 10. / (N * 10 ** len(key)) + + class Pdist(dict): """A probability distribution estimated from counts in datafile.""" - def __init__(self, data=[], N=None, missingfn=None): + def __init__(self, data=[], N=None, missingfn=missing_func): for key, count in data: self[key] = self.get(key, 0) + int(count) self.N = float(N or sum(self.itervalues())) @@ -48,41 +59,43 @@ def datafile(name, sep='\t'): yield line.split(sep) -def avoid_long_words(key, N): - """Estimate the probability of an unknown word.""" - return 10. 
/ (N * 10 ** len(key))
-
-
 def get_primary_dict():
     return Pdist(datafile('ngrams/primary_dict.txt'), None, avoid_long_words)
 
 
+def get_bigram_dict():
+    return Pdist(datafile('../data/ngrams/bigram_dict_less.txt'), None)
+
+
 def get_valid_prefixes():
     return set(w[:i] for w in Pw for i in xrange(len(w)+1))
 
 
 def get_p1edit():
     """the prob of single edits"""
+    # TODO: use a better missingfn
     return Pdist(datafile('norvig/count_1edit.txt'))
 
 
 Pw = cached(get_primary_dict, 'primary_dict', 600)
+P2w = cached(get_bigram_dict, 'bigram_dict', 600)
 PREFIXES = cached(get_valid_prefixes, 'prefixes', 600)
-# P1edit = cached(get_p1edit, 'p1edit', 600)
-P1edit = get_p1edit()
-
-# def cPw(word, prev):
-#     """
-#     The conditional probability P(word | previous word)
-#     """
-#     try:
-#         return P2w[prev + ' ' + word] / float(Pw[prev])
-#     except KeyError:
-#         return Pw(word)
-#
-# # TODO: cachable
-# # P2w = Pdist(datafile('../data/norvig/count_2w.txt'), N)
-# P2w = Pdist(datafile('../data/ngrams/bigram_dict_less.txt'), None)
+P1edit = cached(get_p1edit, 'p1edit', 600)
+
+
+def cPw(word, prev):
+    """
+    The conditional probability P(word | previous word)
+    """
+    try:
+        return P2w[prev + ' ' + word] / float(Pw[prev])
+    except KeyError:
+        return Pw(word)
+
+
+def sPw(w, prev, nxt):
+    print(' '.join([prev, w, nxt, str(log10(cPw(w, prev)) + log10(cPw(nxt, w)))]))
+    return log10(cPw(w, prev)) + log10(cPw(nxt, w))
 
 
 # common error rate
@@ -147,11 +160,37 @@ def ed(L, R):
     return results
 
 
-def correct(w):
+def correct(w, n=1):
    candicates = edits(w).items()
-    c, edit = max(candicates, key=lambda (c, e): Pedit(e) * Pw(c))
-    return [c]
+    most_likely = heapq.nlargest(n, candicates, key=lambda (c, e): Pedit(e) * Pw(c))
+    # c, edit = max(candicates, key=lambda (c, e): Pedit(e) * Pw(c))
+    return [c for c, edit in most_likely]
+
+
+def correct_bigram(words, n=1):
+    """Correct the first unknown word in words, scored by its bigram context."""
+    print(words)
+    print(len(words))
+    print('')
+
+    idx = -1
+    for i, elem in enumerate(words):
+        if elem not in Pw:
+            idx = i
+            break
+    print('idx: ' + str(idx))
+    if idx < 0:
+        return [' '.join(words)]
+
+    candidates = edits(words[idx]).items()
+    prev = words[idx-1] if idx > 0 else ''
+    nxt = words[idx+1] if idx < (len(words) - 1) else ''
+    most_likely = heapq.nlargest(n, candidates, key=lambda (c, e): log10(Pedit(e)) + sPw(c, prev, nxt))
+    return [' '.join(words[0:idx] + [c] + words[idx+1:]) for c, edit in most_likely]
+
+
+# view functions ###
 
 
 def index(request):
 
@@ -160,8 +199,26 @@ def index(request):
     word = request.GET.get('word', '')
     if not word:
         return render(request, 'checker/index.html', {'page_name': 'checker.index'})
-    c = correct(word)
+    n = request.GET.get('n', '5')
+    grams = request.GET.get('grams', 1)
+
+    in_dict = 'real word' if word in Pw else 'non-word'
+    c = correct(word, int(n))
     return render(request, 'checker/index.html', {'page_name': 'checker.index',
+                                                  'word': word,
+                                                  'in_dict': in_dict,
                                                   'corrections': c})
+
+
+def check_bigram(request):
+
+    word = request.GET.get('word', '')
+    if not word:
+        return render(request, 'checker/bigram.html', {'page_name': 'checker.bigram'})
+
+    n = request.GET.get('n', '5')
+    c = correct_bigram(word.split(' '), int(n))
+    return render(request, 'checker/bigram.html', {'page_name': 'checker.bigram', 'word': word,
+                                                   'corrections': c})
 
 
diff --git a/web/spell/templates/checker/bigram.html b/web/spell/templates/checker/bigram.html
new file mode 100644
index 0000000..27ac93b
--- /dev/null
+++ b/web/spell/templates/checker/bigram.html
@@ -0,0 +1,49 @@
+{% extends 'base.html' %}
+{% load staticfiles %}
+
+{% block title %}Spell Checker{% endblock %}
+
+{% block page_header %}
+{% endblock %}
+
+{% block body_block %}
+
+<div class="row">
+
+    <form method="get" action=".">
+        <div class="form-group">
+            <label for="word">Phrase:</label>
+            <input type="text" id="word" name="word" value="{{ word }}"/>
+        </div>
+        <button type="submit" class="btn btn-default">Check</button>
+    </form>
+
+    <hr/>
+
+    {% if word %}
+    {% if corrections %}
+
+    <div class="results">
+
+        <p>
+            {{ in_dict }}
+        </p>
+
+        <ul class="list-group">
+            {% for p in corrections %}
+            <li class="list-group-item">
+                {{ p }}
+            </li>
+            {% endfor %}
+        </ul>
+
+    </div>
+
+    {% else %}
+    <p>No words found.</p>
+    {% endif %}
+    {% endif %}
+
+</div>
+
+{% endblock %}
diff --git a/web/spell/templates/checker/index.html b/web/spell/templates/checker/index.html
index 63d8d54..bdd2fc0 100644
--- a/web/spell/templates/checker/index.html
+++ b/web/spell/templates/checker/index.html
@@ -30,6 +30,10 @@
+
+        <p>
+            {{ in_dict }}
+        </p>
 
         {% for p in corrections %}
         <li>
             {{ p }}
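
For reviewers: the scoring chain in views.py (Pdist -> cPw -> sPw) can be
exercised outside Django. Below is a minimal stand-alone sketch; the made-up
toy counts stand in for the real primary_dict.txt / bigram_dict_less.txt
data files, and only the function shapes are taken from the patch.

from math import log10


class Pdist(dict):
    """A probability distribution estimated from (word, count) pairs."""
    def __init__(self, data, N=None, missingfn=None):
        for key, count in data:
            self[key] = self.get(key, 0) + int(count)
        self.N = float(N or sum(self.values()))
        self.missingfn = missingfn or (lambda k, n: 1. / n)

    def __call__(self, key):
        if key in self:
            return self[key] / self.N
        return self.missingfn(key, self.N)


# Toy unigram and bigram counts, for illustration only.
Pw = Pdist([('the', 1000), ('quick', 50), ('brown', 40), ('fox', 30)])
P2w = Pdist([('the quick', 20), ('quick brown', 10), ('brown fox', 8)])


def cPw(word, prev):
    """P(word | prev): bigram estimate, falling back to the unigram model."""
    try:
        return P2w[prev + ' ' + word] / float(Pw[prev])
    except KeyError:
        return Pw(word)


def sPw(w, prev, nxt):
    """Log-probability of w given its left and right neighbours."""
    return log10(cPw(w, prev)) + log10(cPw(nxt, w))


print(sPw('brown', 'quick', 'fox'))   # ~ -1.4: both bigrams are known
print(sPw('browse', 'quick', 'fox'))  # ~ -4.6: falls back to unigram mass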
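
A quick smoke test for the new endpoint is sketched below. It assumes the
checker app's urls.py is included in the project URLconf and that the ngram
data files are in place; the test class name and the sample phrase are made
up, while the URL name and GET parameters come from the patch.

from django.core.urlresolvers import reverse  # django.urls.reverse on Django >= 2.0
from django.test import TestCase


class BigramCheckerTest(TestCase):
    def test_bigram_view_responds(self):
        # 'brwn' is the unknown word correct_bigram should repair in context
        url = reverse('checker.bigram')
        resp = self.client.get(url, {'word': 'the quick brwn fox', 'n': 5})
        self.assertEqual(resp.status_code, 200)
        self.assertContains(resp, 'fox')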