New release v6.5

1. Better SymSpell.LookupCompound correction quality with existing single term dictionary by using Naive Bayes probability for selecting best word splitting. 2. Even better SymSpell.LookupCompound correction quality, when using the optional bigram dictionary in order to use sentence level context information for selecting best spelling correction. 3. English bigram frequency dictionary included
wolfgarbe · Sep 11, 2019 · 736034a · 736034a
1 parent fa703f3
commit 736034a
Show file tree

Hide file tree

Showing 11 changed files with 242,500 additions and 39 deletions.
diff --git a/README.md b/README.md
@@ -21,7 +21,7 @@ but SymSpell needs to generate **only 25 deletes** to cover them all, both at pr
 
 ```
 Copyright (c) 2019 Wolf Garbe
-Version: 6.4
+Version: 6.5
 Author: Wolf Garbe <[email protected]>
 Maintainer: Wolf Garbe <[email protected]>
 URL: https://github.com/wolfgarbe/symspell
@@ -214,6 +214,18 @@ foreach (var suggestion in suggestions)
 }
 
 
+//load bigram dictionary
+string dictionaryPath= baseDirectory + "../../../../SymSpell/frequency_bigramdictionary_en_243_342.txt";
+int termIndex = 0; //column of the term in the dictionary text file
+int countIndex = 2; //column of the term frequency in the dictionary text file
+if (!symSpell.LoadBigramDictionary(dictionaryPath, termIndex, countIndex))
+{
+  Console.WriteLine("File not found!");
+  //press any key to exit program
+  Console.ReadKey();
+  return;
+}
+
 //lookup suggestions for multi-word input strings (supports compound splitting & merging)
 inputTerm="whereis th elove hehad dated forImuch of thepast who couqdn'tread in sixtgrade and ins pired him";
 maxEditDistanceLookup = 2; //max edit distance per lookup (per single word, not per whole input string)
@@ -348,6 +360,12 @@ https://github.com/Archivus/SymSpell
 2. Option to preserve case (upper/lower case) of input term.
 3. Open source the code for creating custom frequency dictionaries in any language and size as intersection between Google Books Ngram data (Provides representative word frequencies) and SCOWL Spell Checker Oriented Word Lists (Ensures genuine English vocabulary).
 
+#### Changes in v6.5
+
+1. IMPROVEMENT: Better SymSpell.LookupCompound correction quality with the existing single-term dictionary, by using Naive Bayes probability to select the best word splitting.<br>
+2. IMPROVEMENT: Even better SymSpell.LookupCompound correction quality when the optional bigram dictionary is used, as it provides sentence-level context information for selecting the best spelling correction.<br>
+3. IMPROVEMENT: English bigram frequency dictionary included.
+
 #### Changes in v6.4
 
 1.	LoadDictionary(Stream, ...) and CreateDictionary(Stream) methods added (contribution by [ccady](https://github.com/ccady))<br>

diff --git a/SymSpell.CommandLine/bin/Release/netcoreapp2.0/SymSpell.CommandLine.deps.json b/SymSpell.CommandLine/bin/Release/netcoreapp2.0/SymSpell.CommandLine.deps.json
@@ -1,20 +1,20 @@
 {
   "runtimeTarget": {
     "name": ".NETCoreApp,Version=v2.0",
-    "signature": "da39a3ee5e6b4b0d3255bfef95601890afd80709"
+    "signature": ""
   },
   "compilationOptions": {},
   "targets": {
     ".NETCoreApp,Version=v2.0": {
       "SymSpell.CommandLine/1.0.0": {
         "dependencies": {
-          "SymSpell": "6.3.0"
+          "SymSpell": "6.5.0"
         },
         "runtime": {
           "SymSpell.CommandLine.dll": {}
         }
       },
-      "SymSpell/6.3.0": {
+      "SymSpell/6.5.0": {
         "runtime": {
           "SymSpell.dll": {}
         }
@@ -27,7 +27,7 @@
       "serviceable": false,
       "sha512": ""
     },
-    "SymSpell/6.3.0": {
+    "SymSpell/6.5.0": {
       "type": "project",
       "serviceable": false,
       "sha512": ""

diff --git a/SymSpell.CommandLine/bin/Release/netcoreapp2.0/SymSpell.CommandLine.dll b/SymSpell.CommandLine/bin/Release/netcoreapp2.0/SymSpell.CommandLine.dll
diff --git a/SymSpell.CommandLine/bin/Release/netcoreapp2.0/SymSpell.CommandLine.pdb b/SymSpell.CommandLine/bin/Release/netcoreapp2.0/SymSpell.CommandLine.pdb
diff --git a/SymSpell.CommandLine/bin/Release/netcoreapp2.0/SymSpell.CommandLine.runtimeconfig.dev.json b/SymSpell.CommandLine/bin/Release/netcoreapp2.0/SymSpell.CommandLine.runtimeconfig.dev.json
@@ -1,8 +1,8 @@
 {
   "runtimeOptions": {
     "additionalProbingPaths": [
-      "C:\\Users\\Wolf\\.dotnet\\store\\|arch|\\|tfm|",
-      "C:\\Users\\Wolf\\.nuget\\packages",
+      "C:\\Users\\wolfg\\.dotnet\\store\\|arch|\\|tfm|",
+      "C:\\Users\\wolfg\\.nuget\\packages",
       "C:\\Program Files\\dotnet\\sdk\\NuGetFallbackFolder"
     ]
   }

diff --git a/SymSpell.CommandLine/bin/Release/netcoreapp2.0/SymSpell.dll b/SymSpell.CommandLine/bin/Release/netcoreapp2.0/SymSpell.dll
diff --git a/SymSpell.CommandLine/bin/Release/netcoreapp2.0/SymSpell.pdb b/SymSpell.CommandLine/bin/Release/netcoreapp2.0/SymSpell.pdb
diff --git a/SymSpell.CompoundDemo/SymSpell.CompoundDemo.cs b/SymSpell.CompoundDemo/SymSpell.CompoundDemo.cs
@@ -35,6 +35,9 @@ public static void Main(string[] args)
             //string path = "../../frequency_dictionary_en_82_765.txt";  //path when using symspell nuget package (frequency_dictionary_en_82_765.txt is included in nuget package)
             if (!symSpell.LoadDictionary(path, 0, 1)) { Console.Error.WriteLine("\rFile not found: " + Path.GetFullPath(path)); Console.ReadKey();return; }
 
+            string pathBigram = AppDomain.CurrentDomain.BaseDirectory + "frequency_bigramdictionary_en_243_342.txt";
+            if (!symSpell.LoadBigramDictionary(pathBigram, 0, 2)) { Console.Error.WriteLine("\rFile not found: " + Path.GetFullPath(pathBigram)); Console.ReadKey(); return; }
+
             //Alternatively Create the dictionary from a text corpus (e.g. http://norvig.com/big.txt ) 
             //Make sure the corpus does not contain spelling errors, invalid terms and the word frequency is representative to increase the precision of the spelling correction.
             //The dictionary may contain vocabulary from different languages. 

diff --git a/SymSpell/SymSpell.cs b/SymSpell/SymSpell.cs
@@ -12,7 +12,7 @@
 // 3. multiple independent input terms with/without spelling errors
 
 // Copyright (C) 2019 Wolf Garbe
-// Version: 6.4
+// Version: 6.5
 // Author: Wolf Garbe [email protected]
 // Maintainer: Wolf Garbe [email protected]
 // URL: https://github.com/wolfgarbe/symspell
@@ -253,6 +253,60 @@ public bool CreateDictionaryEntry(string key, Int64 count, SuggestionStage stagi
         return true;
     }
 
+    public Dictionary<string, long> bigrams = new Dictionary<string, long>();
+    public long bigramCountMin = long.MaxValue;
+
+    /// <summary>Load bigram entries from a file of word-pair/frequency count lines into the bigram dictionary.</summary>
+    /// <remarks>Opens the file and delegates the parsing to LoadBigrams; nothing is read if the file does not exist.</remarks>
+    /// <param name="corpus">The path+filename of the bigram file.</param>
+    /// <param name="termIndex">The column position of the first word of the bigram (passed through to LoadBigrams).</param>
+    /// <param name="countIndex">The column position of the frequency count (passed through to LoadBigrams).</param>
+    /// <returns>False if the file does not exist; otherwise the result of LoadBigrams.</returns>
+    public bool LoadBigramDictionary(string corpus, int termIndex, int countIndex)
+    {
+        if (!File.Exists(corpus)) return false;
+        using (Stream corpusStream = File.OpenRead(corpus))
+        {
+            return LoadBigrams(corpusStream, termIndex, countIndex);
+        }
+    }
+
+    /// <summary>Load bigram entries from a stream of word-pair/frequency count lines into the bigrams dictionary.</summary>
+    /// <remarks>Adds to the bigrams already loaded; an existing key is overwritten with the new count. Lines with fewer than 3 columns or an unparsable count are skipped.</remarks>
+    /// <param name="corpusStream">The stream containing the bigram data; it is read as UTF-8 text.</param>
+    /// <param name="termIndex">The column position of the first word; the second word is read from the column directly after it.</param>
+    /// <param name="countIndex">The column position of the frequency count.</param>
+    /// <returns>Always true, once the stream has been read to its end.</returns>
+    public bool LoadBigrams(Stream corpusStream, int termIndex, int countIndex)
+    {

+        using (StreamReader sr = new StreamReader(corpusStream, System.Text.Encoding.UTF8, false))
+        {
+            String line;

+            //process a single line at a time only for memory efficiency
+            while ((line = sr.ReadLine()) != null)
+            {
+                string[] lineParts = line.Split(null); //null separator: white-space characters are used as delimiters
+                if (lineParts.Length >= 3) //NOTE(review): only guards the 3-column layout; termIndex + 1 and countIndex are not range-checked
+                {
+                    string key = lineParts[termIndex] + " " + lineParts[termIndex + 1]; //bigram key = the two words joined by a single space
+                    //Int64 count;
+                    if (Int64.TryParse(lineParts[countIndex], out Int64 count))
+                    {
+                        //idea (not implemented here): only allow those combinations whose two words both also exist as single terms
+                        //Console.WriteLine(key+" : "+ count.ToString());
+                        //count = count * 8;//8
+                        bigrams[key] = count; //adds the bigram or overwrites an existing entry
+                        if (count < bigramCountMin) bigramCountMin = count; //track the smallest bigram count seen
+                    }
+                }
+            }

+        }
+        return true;
+    }
+
     /// <summary>Load multiple dictionary entries from a file of word/frequency count pairs</summary>
     /// <remarks>Merges with any dictionary data already loaded.</remarks>
     /// <param name="corpus">The path+filename of the file.</param>
@@ -798,7 +852,6 @@ public List<SuggestItem> LookupCompound(string input, int editDistanceMax)
         //parse input string into single terms
         string[] termList1 = ParseWords(input);
 
-        List<SuggestItem> suggestionsPreviousTerm;                  //suggestions for a single term
         List<SuggestItem> suggestions = new List<SuggestItem>();     //suggestions for a single term
         List<SuggestItem> suggestionParts = new List<SuggestItem>(); //1 line with separate parts
         var distanceComparer = new EditDistance(this.distanceAlgorithm);
@@ -807,10 +860,8 @@ public List<SuggestItem> LookupCompound(string input, int editDistanceMax)
         bool lastCombi = false;
         for (int i = 0; i < termList1.Length; i++)
         {
-            suggestionsPreviousTerm = new List<SuggestItem>(suggestions.Count); for (int k = 0; k < suggestions.Count; k++) suggestionsPreviousTerm.Add(suggestions[k].ShallowCopy());
             suggestions = Lookup(termList1[i], Verbosity.Top, editDistanceMax);
 
-
             //combi check, always before split
             if ((i > 0) && !lastCombi)
             {
@@ -823,17 +874,20 @@ public List<SuggestItem> LookupCompound(string input, int editDistanceMax)
                     if (suggestions.Count > 0)
                     {
                         best2 = suggestions[0];
-
                     }
                     else
                     {
+                        //unknown word
                         best2.term = termList1[i];
+                        //estimated edit distance
                         best2.distance = editDistanceMax + 1;
-                        best2.count = 0;
+                        //estimated word occurrence probability P=10 / (N * 10^word length l)
+                        best2.count = (long)((double)10 / Math.Pow((double)10, (double)best2.term.Length)); // 0;
                     }
-                    //if (suggestionsCombi[0].distance + 1 < DamerauLevenshteinDistance(termList1[i - 1] + " " + termList1[i], best1.term + " " + best2.term))
-                    int distance1 = distanceComparer.Compare(termList1[i - 1] + " " + termList1[i], best1.term + " " + best2.term, editDistanceMax);
-                    if ((distance1>=0)&&(suggestionsCombi[0].distance + 1 < distance1))
+
+                    //distance1 = sum of the edit distances of the 2 split terms to their best corrections: used as comparative value for the combination
+                    int distance1 = best1.distance + best2.distance;
+                    if ((distance1 >= 0) && ((suggestionsCombi[0].distance + 1 < distance1) || ((suggestionsCombi[0].distance + 1 == distance1) && ((double)suggestionsCombi[0].count > (double)best1.count / (double)SymSpell.N * (double)best2.count))))
                     {
                         suggestionsCombi[0].distance++;
                         suggestionParts[suggestionParts.Count - 1] = suggestionsCombi[0];
@@ -853,14 +907,13 @@ public List<SuggestItem> LookupCompound(string input, int editDistanceMax)
             else
             {
                 //if no perfect suggestion, split word into pairs
-                List<SuggestItem> suggestionsSplit = new List<SuggestItem>();
+                SuggestItem suggestionSplitBest = null;
 
-                //add original term
-                if (suggestions.Count > 0) suggestionsSplit.Add(suggestions[0]);
+                //add original term 
+                if (suggestions.Count > 0) suggestionSplitBest = suggestions[0];
 
                 if (termList1[i].Length > 1)
                 {
-
                     for (int j = 1; j < termList1[i].Length; j++)
                     {
                         string part1 = termList1[i].Substring(0, j);
@@ -869,36 +922,73 @@ public List<SuggestItem> LookupCompound(string input, int editDistanceMax)
                         List<SuggestItem> suggestions1 = Lookup(part1, Verbosity.Top, editDistanceMax);
                         if (suggestions1.Count > 0)
                         {
-                            if ((suggestions.Count > 0) && (suggestions[0].term == suggestions1[0].term)) break;//if split correction1 == einzelwort correction
                             List<SuggestItem> suggestions2 = Lookup(part2, Verbosity.Top, editDistanceMax);
                             if (suggestions2.Count > 0)
                             {
-                                if ((suggestions.Count > 0) && (suggestions[0].term == suggestions2[0].term)) break;//if split correction1 == einzelwort correction
                                 //select best suggestion for split pair
                                 suggestionSplit.term = suggestions1[0].term + " " + suggestions2[0].term;
-                                int distance2 = distanceComparer.Compare(termList1[i], suggestions1[0].term + " " + suggestions2[0].term, editDistanceMax);
+
+                                int distance2 = distanceComparer.Compare(termList1[i], suggestionSplit.term, editDistanceMax);
                                 if (distance2 < 0) distance2 = editDistanceMax + 1;
+
+                                if (suggestionSplitBest != null)
+                                {
+                                    if (distance2 > suggestionSplitBest.distance) continue;
+                                    if (distance2 < suggestionSplitBest.distance) suggestionSplitBest = null;
+                                }
+
                                 suggestionSplit.distance = distance2;
-                                suggestionSplit.count = Math.Min(suggestions1[0].count, suggestions2[0].count);
-                                suggestionsSplit.Add(suggestionSplit);
+                                //if bigram exists in bigram dictionary
+                                if (bigrams.TryGetValue(suggestionSplit.term, out long bigramCount))
+                                {
+                                    suggestionSplit.count = bigramCount;
+
+                                    //increase count, if split.corrections are part of or identical to input  
+                                    //single term correction exists
+                                    if (suggestions.Count > 0)
+                                    {
+                                        //alternatively remove the single term from suggestionsSplit, but then other splittings could win
+                                        if ((suggestions1[0].term + suggestions2[0].term == termList1[i]))
+                                        {
+                                            //make count bigger than count of single term correction
+                                            suggestionSplit.count = Math.Max(suggestionSplit.count, suggestions[0].count + 2);
+                                        }
+                                        else if ((suggestions1[0].term == suggestions[0].term) || (suggestions2[0].term == suggestions[0].term))
+                                        {
+                                            //make count bigger than count of single term correction
+                                            suggestionSplit.count = Math.Max(suggestionSplit.count, suggestions[0].count + 1);
+                                        }
+                                    }
+                                    //no single term correction exists
+                                    else if ((suggestions1[0].term + suggestions2[0].term == termList1[i]))
+                                    {
+                                        suggestionSplit.count = Math.Max(suggestionSplit.count, Math.Max(suggestions1[0].count, suggestions2[0].count) + 2);
+                                    }
+
+                                }
+                                else
+                                {
+                                    //The Naive Bayes probability of the word combination is the product of the two word probabilities: P(AB) = P(A) * P(B)
+                                    //use it to estimate the frequency count of the combination, which then is used to rank/select the best splitting variant  
+                                    suggestionSplit.count = Math.Min(bigramCountMin, (long)((double)suggestions1[0].count / (double)SymSpell.N * (double)suggestions2[0].count));
+                                }
 
-                                //early termination of split
-                                if (suggestionSplit.distance == 1) break;
+                                if ((suggestionSplitBest == null) || (suggestionSplit.count > suggestionSplitBest.count)) suggestionSplitBest = suggestionSplit;
                             }
                         }
                     }
 
-                    if (suggestionsSplit.Count > 0)
+                    if (suggestionSplitBest != null)
                     {
                         //select best suggestion for split pair
-                        suggestionsSplit.Sort((x, y) => 2 * x.distance.CompareTo(y.distance) - x.count.CompareTo(y.count));
-                        suggestionParts.Add(suggestionsSplit[0]);
+                        suggestionParts.Add(suggestionSplitBest);
                     }
                     else
                     {
                         SuggestItem si = new SuggestItem();
                         si.term = termList1[i];
-                        si.count = 0;
+                        //estimated word occurrence probability P=10 / (N * 10^word length l)
+                        si.count = (long)((double)10 / Math.Pow((double)10, (double)si.term.Length));
                         si.distance = editDistanceMax + 1;
                         suggestionParts.Add(si);
                     }
@@ -907,26 +997,30 @@ public List<SuggestItem> LookupCompound(string input, int editDistanceMax)
                 {
                     SuggestItem si = new SuggestItem();
                     si.term = termList1[i];
-                    si.count = 0;
+                    //estimated word occurrence probability P=10 / (N * 10^word length l)
+                    si.count = (long)((double)10 / Math.Pow((double)10, (double)si.term.Length));
                     si.distance = editDistanceMax + 1;
                     suggestionParts.Add(si);
                 }
             }
-            nextTerm:;
+        nextTerm:;
         }
 
         SuggestItem suggestion = new SuggestItem();
-        suggestion.count = Int64.MaxValue;
-        string s = ""; foreach (SuggestItem si in suggestionParts) { s += si.term + " "; suggestion.count = Math.Min(suggestion.count, si.count); }//Console.WriteLine(s);
-        suggestion.term = s.TrimEnd();
+
+        double count = SymSpell.N;
+        System.Text.StringBuilder s = new System.Text.StringBuilder();
+        foreach (SuggestItem si in suggestionParts) { s.Append(si.term + " "); count *= (double)si.count / (double)SymSpell.N; }
+        suggestion.count = (long)count;
+
+        suggestion.term = s.ToString().TrimEnd();
         suggestion.distance = distanceComparer.Compare(input, suggestion.term, int.MaxValue);
 
         List<SuggestItem> suggestionsLine = new List<SuggestItem>();
         suggestionsLine.Add(suggestion);
         return suggestionsLine;
     }
 
-
     //######
 
     //WordSegmentation divides a string into words by inserting missing spaces at the appropriate positions