New release v6.5
1. Better SymSpell.LookupCompound correction quality with the existing single-term dictionary, by using the Naive Bayes probability to select the best word splitting.
2. Even better SymSpell.LookupCompound correction quality when the optional bigram dictionary is used, which provides sentence-level context information for selecting the best spelling correction.
3. English bigram frequency dictionary included.
wolfgarbe committed Sep 11, 2019
1 parent fa703f3 commit 736034a
Showing 11 changed files with 242,500 additions and 39 deletions.
20 changes: 19 additions & 1 deletion README.md
@@ -21,7 +21,7 @@ but SymSpell needs to generate **only 25 deletes** to cover them all, both at pr

```
Copyright (c) 2019 Wolf Garbe
Version: 6.4
Version: 6.5
Author: Wolf Garbe <[email protected]>
Maintainer: Wolf Garbe <[email protected]>
URL: https://github.com/wolfgarbe/symspell
@@ -214,6 +214,18 @@ foreach (var suggestion in suggestions)
}


//load bigram dictionary
string dictionaryPath = baseDirectory + "../../../../SymSpell/frequency_bigramdictionary_en_243_342.txt";
int termIndex = 0; //column of the first term of the bigram in the dictionary text file
int countIndex = 2; //column of the bigram frequency in the dictionary text file
if (!symSpell.LoadBigramDictionary(dictionaryPath, termIndex, countIndex))
{
Console.WriteLine("File not found!");
//press any key to exit program
Console.ReadKey();
return;
}
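//note (illustrative): each line of the bigram dictionary is expected to hold "term1 term2 count",
//whitespace separated, which is why termIndex = 0 and countIndex = 2 above,
//e.g. "new york 100000" (count made up for this example)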

//lookup suggestions for multi-word input strings (supports compound splitting & merging)
inputTerm="whereis th elove hehad dated forImuch of thepast who couqdn'tread in sixtgrade and ins pired him";
maxEditDistanceLookup = 2; //max edit distance per lookup (per single word, not per whole input string)
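//illustrative continuation (not part of the original snippet): LookupCompound returns a single
//SuggestItem holding the corrected string, its edit distance to the input and an estimated frequency count
var suggestionsCompound = symSpell.LookupCompound(inputTerm, maxEditDistanceLookup);
foreach (var suggestion in suggestionsCompound)
{
Console.WriteLine(suggestion.term + " " + suggestion.distance + " " + suggestion.count);
}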
@@ -348,6 +360,12 @@ https://github.com/Archivus/SymSpell
2. Option to preserve case (upper/lower case) of input term.
3. Open-source the code for creating custom frequency dictionaries in any language and size, as the intersection of Google Books Ngram data (provides representative word frequencies) and SCOWL Spell Checker Oriented Word Lists (ensures genuine English vocabulary).

#### Changes in v6.5

1. IMPROVEMENT: Better SymSpell.LookupCompound correction quality with the existing single-term dictionary, by using the Naive Bayes probability to select the best word splitting (see the sketch below).<br>
2. IMPROVEMENT: Even better SymSpell.LookupCompound correction quality when the optional bigram dictionary is used, which provides sentence-level context information for selecting the best spelling correction.<br>
3. IMPROVEMENT: English bigram frequency dictionary included
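
For illustration, a minimal sketch of the Naive Bayes estimate that ranks candidate word splittings when no bigram entry is available (the helper name `EstimateSplitCount`, the counts and the corpus size `n` below are made up for this example and are not part of the SymSpell API):

```
//hypothetical helper: estimate the frequency count of a two-word split from its unigram counts
//P(AB) = P(A) * P(B)  =>  count(AB) ≈ count(A) * count(B) / N
static long EstimateSplitCount(long countA, long countB, double n)
{
    return (long)((double)countA / n * (double)countB);
}

//with made-up counts, a plausible split scores far higher than an implausible one:
//EstimateSplitCount(20_000_000_000, 900_000_000, 1e12) > EstimateSplitCount(5, 3, 1e12)
```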

#### Changes in v6.4

1. LoadDictionary(Stream, ...) and CreateDictionary(Stream) methods added (contribution by [ccady](https://github.com/ccady))<br>
@@ -1,20 +1,20 @@
{
"runtimeTarget": {
"name": ".NETCoreApp,Version=v2.0",
"signature": "da39a3ee5e6b4b0d3255bfef95601890afd80709"
"signature": ""
},
"compilationOptions": {},
"targets": {
".NETCoreApp,Version=v2.0": {
"SymSpell.CommandLine/1.0.0": {
"dependencies": {
"SymSpell": "6.3.0"
"SymSpell": "6.5.0"
},
"runtime": {
"SymSpell.CommandLine.dll": {}
}
},
"SymSpell/6.3.0": {
"SymSpell/6.5.0": {
"runtime": {
"SymSpell.dll": {}
}
@@ -27,7 +27,7 @@
"serviceable": false,
"sha512": ""
},
"SymSpell/6.3.0": {
"SymSpell/6.5.0": {
"type": "project",
"serviceable": false,
"sha512": ""
Binary file not shown.
Binary file not shown.
@@ -1,8 +1,8 @@
{
"runtimeOptions": {
"additionalProbingPaths": [
"C:\\Users\\Wolf\\.dotnet\\store\\|arch|\\|tfm|",
"C:\\Users\\Wolf\\.nuget\\packages",
"C:\\Users\\wolfg\\.dotnet\\store\\|arch|\\|tfm|",
"C:\\Users\\wolfg\\.nuget\\packages",
"C:\\Program Files\\dotnet\\sdk\\NuGetFallbackFolder"
]
}
Binary file modified SymSpell.CommandLine/bin/Release/netcoreapp2.0/SymSpell.dll
Binary file not shown.
Binary file modified SymSpell.CommandLine/bin/Release/netcoreapp2.0/SymSpell.pdb
Binary file not shown.
3 changes: 3 additions & 0 deletions SymSpell.CompoundDemo/SymSpell.CompoundDemo.cs
@@ -35,6 +35,9 @@ public static void Main(string[] args)
//string path = "../../frequency_dictionary_en_82_765.txt"; //path when using symspell nuget package (frequency_dictionary_en_82_765.txt is included in nuget package)
if (!symSpell.LoadDictionary(path, 0, 1)) { Console.Error.WriteLine("\rFile not found: " + Path.GetFullPath(path)); Console.ReadKey();return; }

string pathBigram = AppDomain.CurrentDomain.BaseDirectory + "frequency_bigramdictionary_en_243_342.txt";
if (!symSpell.LoadBigramDictionary(pathBigram, 0, 2)) { Console.Error.WriteLine("\rFile not found: " + Path.GetFullPath(pathBigram)); Console.ReadKey(); return; }

//Alternatively, create the dictionary from a text corpus (e.g. http://norvig.com/big.txt )
//Make sure the corpus does not contain spelling errors or invalid terms, and that the word frequencies are representative, to increase the precision of the spelling correction.
//The dictionary may contain vocabulary from different languages.
154 changes: 124 additions & 30 deletions SymSpell/SymSpell.cs
@@ -12,7 +12,7 @@
// 3. multiple independent input terms with/without spelling errors

// Copyright (C) 2019 Wolf Garbe
// Version: 6.4
// Version: 6.5
// Author: Wolf Garbe [email protected]
// Maintainer: Wolf Garbe [email protected]
// URL: https://github.com/wolfgarbe/symspell
@@ -253,6 +253,60 @@ public bool CreateDictionaryEntry(string key, Int64 count, SuggestionStage stagi
return true;
}

public Dictionary<string, long> bigrams = new Dictionary<string, long>();
public long bigramCountMin = long.MaxValue;

/// <summary>Load multiple bigram entries from a file of bigram/frequency count pairs</summary>
/// <remarks>Merges with any bigram data already loaded.</remarks>
/// <param name="corpus">The path+filename of the file.</param>
/// <param name="termIndex">The column position of the first word of the bigram (the second word is read from the following column).</param>
/// <param name="countIndex">The column position of the bigram frequency count.</param>
/// <returns>True if file loaded, or false if file not found.</returns>
public bool LoadBigramDictionary(string corpus, int termIndex, int countIndex)
{
if (!File.Exists(corpus)) return false;
using (Stream corpusStream = File.OpenRead(corpus))
{
return LoadBigrams(corpusStream, termIndex, countIndex);
}
}
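// Illustrative usage note, not part of the original source: for the shipped English bigram
// dictionary the call is LoadBigramDictionary("frequency_bigramdictionary_en_243_342.txt", 0, 2),
// i.e. each whitespace-separated line holds "term1 term2 count".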

/// <summary>Load multiple bigram entries from a stream of bigram/frequency count pairs</summary>
/// <remarks>Merges with any bigram data already loaded.</remarks>
/// <param name="corpusStream">The stream containing the bigram/frequency count pairs.</param>
/// <param name="termIndex">The column position of the first word of the bigram (the second word is read from the following column).</param>
/// <param name="countIndex">The column position of the bigram frequency count.</param>
/// <returns>True after the stream has been processed.</returns>
public bool LoadBigrams(Stream corpusStream, int termIndex, int countIndex)
{

using (StreamReader sr = new StreamReader(corpusStream, System.Text.Encoding.UTF8, false))
{
String line;

//process a single line at a time only for memory efficiency
while ((line = sr.ReadLine()) != null)
{
string[] lineParts = line.Split(null);
if (lineParts.Length >= 3)
{
string key = lineParts[termIndex] + " " + lineParts[termIndex + 1];
if (Int64.TryParse(lineParts[countIndex], out Int64 count))
{
//only allow combinations whose two terms also exist as single words in the dictionary
bigrams[key] = count;
if (count < bigramCountMin) bigramCountMin = count;
}
}
}

}
return true;
}

/// <summary>Load multiple dictionary entries from a file of word/frequency count pairs</summary>
/// <remarks>Merges with any dictionary data already loaded.</remarks>
/// <param name="corpus">The path+filename of the file.</param>
@@ -798,7 +852,6 @@ public List<SuggestItem> LookupCompound(string input, int editDistanceMax)
//parse input string into single terms
string[] termList1 = ParseWords(input);

List<SuggestItem> suggestionsPreviousTerm; //suggestions for a single term
List<SuggestItem> suggestions = new List<SuggestItem>(); //suggestions for a single term
List<SuggestItem> suggestionParts = new List<SuggestItem>(); //1 line with separate parts
var distanceComparer = new EditDistance(this.distanceAlgorithm);
@@ -807,10 +860,8 @@ public List<SuggestItem> LookupCompound(string input, int editDistanceMax)
bool lastCombi = false;
for (int i = 0; i < termList1.Length; i++)
{
suggestionsPreviousTerm = new List<SuggestItem>(suggestions.Count); for (int k = 0; k < suggestions.Count; k++) suggestionsPreviousTerm.Add(suggestions[k].ShallowCopy());
suggestions = Lookup(termList1[i], Verbosity.Top, editDistanceMax);


//combi check, always before split
if ((i > 0) && !lastCombi)
{
@@ -823,17 +874,20 @@ public List<SuggestItem> LookupCompound(string input, int editDistanceMax)
if (suggestions.Count > 0)
{
best2 = suggestions[0];

}
else
{
//unknown word
best2.term = termList1[i];
//estimated edit distance
best2.distance = editDistanceMax + 1;
best2.count = 0;
//estimated word occurrence probability P = 10 / (N * 10^word_length), stored as the estimated count N * P
best2.count = (long)((double)10 / Math.Pow((double)10, (double)best2.term.Length));
}
//if (suggestionsCombi[0].distance + 1 < DamerauLevenshteinDistance(termList1[i - 1] + " " + termList1[i], best1.term + " " + best2.term))
int distance1 = distanceComparer.Compare(termList1[i - 1] + " " + termList1[i], best1.term + " " + best2.term, editDistanceMax);
if ((distance1>=0)&&(suggestionsCombi[0].distance + 1 < distance1))

//distance1 = edit distance between the two split terms and their best corrections: used as a comparative value for the combination
int distance1 = best1.distance + best2.distance;
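//keep the merged correction when its edit distance (+1 for the dropped space) is lower than that of
//the two separate corrections, or equal but with a count above the Naive Bayes estimate count(best1) * count(best2) / N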
if ((distance1 >= 0) && ((suggestionsCombi[0].distance + 1 < distance1) || ((suggestionsCombi[0].distance + 1 == distance1) && ((double)suggestionsCombi[0].count > (double)best1.count / (double)SymSpell.N * (double)best2.count))))
{
suggestionsCombi[0].distance++;
suggestionParts[suggestionParts.Count - 1] = suggestionsCombi[0];
@@ -853,14 +907,13 @@ public List<SuggestItem> LookupCompound(string input, int editDistanceMax)
else
{
//if no perfect suggestion, split word into pairs
List<SuggestItem> suggestionsSplit = new List<SuggestItem>();
SuggestItem suggestionSplitBest = null;

//add original term
if (suggestions.Count > 0) suggestionsSplit.Add(suggestions[0]);
//add original term
if (suggestions.Count > 0) suggestionSplitBest = suggestions[0];

if (termList1[i].Length > 1)
{

for (int j = 1; j < termList1[i].Length; j++)
{
string part1 = termList1[i].Substring(0, j);
@@ -869,36 +922,73 @@ public List<SuggestItem> LookupCompound(string input, int editDistanceMax)
List<SuggestItem> suggestions1 = Lookup(part1, Verbosity.Top, editDistanceMax);
if (suggestions1.Count > 0)
{
if ((suggestions.Count > 0) && (suggestions[0].term == suggestions1[0].term)) break; //if split correction1 == single-term correction
List<SuggestItem> suggestions2 = Lookup(part2, Verbosity.Top, editDistanceMax);
if (suggestions2.Count > 0)
{
if ((suggestions.Count > 0) && (suggestions[0].term == suggestions2[0].term)) break; //if split correction2 == single-term correction
//select best suggestion for split pair
suggestionSplit.term = suggestions1[0].term + " " + suggestions2[0].term;
int distance2 = distanceComparer.Compare(termList1[i], suggestions1[0].term + " " + suggestions2[0].term, editDistanceMax);

int distance2 = distanceComparer.Compare(termList1[i], suggestionSplit.term, editDistanceMax);
if (distance2 < 0) distance2 = editDistanceMax + 1;

if (suggestionSplitBest != null)
{
if (distance2 > suggestionSplitBest.distance) continue;
if (distance2 < suggestionSplitBest.distance) suggestionSplitBest = null;
}

suggestionSplit.distance = distance2;
suggestionSplit.count = Math.Min(suggestions1[0].count, suggestions2[0].count);
suggestionsSplit.Add(suggestionSplit);
//if bigram exists in bigram dictionary
if (bigrams.TryGetValue(suggestionSplit.term, out long bigramCount))
{
suggestionSplit.count = bigramCount;

//increase the count if the split corrections are part of, or identical to, the input
//single term correction exists
if (suggestions.Count > 0)
{
//alternative: remove the single-term correction instead, but then other splittings could win
if ((suggestions1[0].term + suggestions2[0].term == termList1[i]))
{
//make count bigger than count of single term correction
suggestionSplit.count = Math.Max(suggestionSplit.count, suggestions[0].count + 2);
}
else if ((suggestions1[0].term == suggestions[0].term) || (suggestions2[0].term == suggestions[0].term))
{
//make count bigger than count of single term correction
suggestionSplit.count = Math.Max(suggestionSplit.count, suggestions[0].count + 1);
}
}
//no single term correction exists
else if ((suggestions1[0].term + suggestions2[0].term == termList1[i]))
{
suggestionSplit.count = Math.Max(suggestionSplit.count, Math.Max(suggestions1[0].count, suggestions2[0].count) + 2);
}

}
else
{
//The Naive Bayes probability of the word combination is the product of the two word probabilities: P(AB) = P(A) * P(B)
//use it to estimate the frequency count of the combination, which then is used to rank/select the best splitting variant
suggestionSplit.count = Math.Min(bigramCountMin, (long)((double)suggestions1[0].count / (double)SymSpell.N * (double)suggestions2[0].count));
}

//early termination of split
if (suggestionSplit.distance == 1) break;
if ((suggestionSplitBest == null) || (suggestionSplit.count > suggestionSplitBest.count)) suggestionSplitBest = suggestionSplit;
}
}
}

if (suggestionsSplit.Count > 0)
if (suggestionSplitBest != null)
{
//select best suggestion for split pair
suggestionsSplit.Sort((x, y) => 2 * x.distance.CompareTo(y.distance) - x.count.CompareTo(y.count));
suggestionParts.Add(suggestionsSplit[0]);
suggestionParts.Add(suggestionSplitBest);
}
else
{
SuggestItem si = new SuggestItem();
si.term = termList1[i];
si.count = 0;
//estimated word occurrence probability P = 10 / (N * 10^word_length), stored as the estimated count N * P
si.count = (long)((double)10 / Math.Pow((double)10, (double)si.term.Length));
si.distance = editDistanceMax + 1;
suggestionParts.Add(si);
}
@@ -907,26 +997,30 @@ public List<SuggestItem> LookupCompound(string input, int editDistanceMax)
{
SuggestItem si = new SuggestItem();
si.term = termList1[i];
si.count = 0;
//estimated word occurrence probability P = 10 / (N * 10^word_length), stored as the estimated count N * P
si.count = (long)((double)10 / Math.Pow((double)10, (double)si.term.Length));
si.distance = editDistanceMax + 1;
suggestionParts.Add(si);
}
}
nextTerm:;
nextTerm:;
}

SuggestItem suggestion = new SuggestItem();
suggestion.count = Int64.MaxValue;
string s = ""; foreach (SuggestItem si in suggestionParts) { s += si.term + " "; suggestion.count = Math.Min(suggestion.count, si.count); }//Console.WriteLine(s);
suggestion.term = s.TrimEnd();

double count = SymSpell.N;
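//count accumulates the Naive Bayes probability of the whole corrected string, expressed as a frequency count: N * (count_1/N) * (count_2/N) * ...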
System.Text.StringBuilder s = new System.Text.StringBuilder();
foreach (SuggestItem si in suggestionParts) { s.Append(si.term + " "); count *= (double)si.count / (double)SymSpell.N; }
suggestion.count = (long)count;

suggestion.term = s.ToString().TrimEnd();
suggestion.distance = distanceComparer.Compare(input, suggestion.term, int.MaxValue);

List<SuggestItem> suggestionsLine = new List<SuggestItem>();
suggestionsLine.Add(suggestion);
return suggestionsLine;
}


//######

//WordSegmentation divides a string into words by inserting missing spaces at the appropriate positions