Patch: add an optional skipSpellcheck predicate to SymSpell.LookupCompound, with NUnit coverage.

NOTE(review): recovered from a whitespace-collapsed copy of the patch. Line breaks, indentation,
blank context lines, and angle-bracket tokens (generic type arguments, XML doc tags) are
reconstructed to match the hunk line counts; whitespace-only differences between removed and
added lines could not be recovered. Confirm against the repository before applying.

diff --git a/SymSpell.Test/SymSpellLookupCompoundTests.cs b/SymSpell.Test/SymSpellLookupCompoundTests.cs
new file mode 100644
index 0000000..5ad3ea1
--- /dev/null
+++ b/SymSpell.Test/SymSpellLookupCompoundTests.cs
@@ -0,0 +1,88 @@
+using NUnit.Framework;
+using System.Text.RegularExpressions;
+
+namespace symspell.Test
+{
+    [TestFixture]
+    public class SymSpellLookupCompoundTests
+    {
+        private SymSpell _symSpell;
+
+        [OneTimeSetUp]
+        public void Init()
+        {
+            _symSpell = new SymSpell();
+            _symSpell.CreateDictionaryEntry("in", 5);
+            _symSpell.CreateDictionaryEntry("the", 10);
+            _symSpell.CreateDictionaryEntry("third", 10);
+            _symSpell.CreateDictionaryEntry("quarter", 10);
+            _symSpell.CreateDictionaryEntry("of", 10);
+            _symSpell.CreateDictionaryEntry("last", 10);
+            _symSpell.CreateDictionaryEntry("visit", 10);
+            _symSpell.CreateDictionaryEntry("our", 10);
+            _symSpell.CreateDictionaryEntry("offices", 10);
+            _symSpell.CreateDictionaryEntry("last", 10);
+            _symSpell.CreateDictionaryEntry("last", 10);
+            _symSpell.CreateDictionaryEntry("a", 10);
+        }
+
+        [Test]
+        public void SuggestWordsInDictionary_ReturnsCorrectedText()
+        {
+            var result = _symSpell.LookupCompound("in te dhird qarter oflast");
+            Assert.AreEqual(1, result.Count);
+            Assert.AreEqual("in the third quarter of last", result[0].term);
+        }
+
+        [Test]
+        public void NoSuggestForWord_ReturnsUnchanged()
+        {
+            var result = _symSpell.LookupCompound("in te dhird qarter oflast jear", 1);
+            Assert.AreEqual(1, result.Count);
+            Assert.AreEqual("in the third quarter of last jear", result[0].term);
+        }
+
+        [Test]
+        public void SplittedWord_ReturnsCorrectedWord()
+        {
+            var result = _symSpell.LookupCompound("in te dhird quar ter oflast");
+            Assert.AreEqual(1, result.Count);
+            Assert.AreEqual("in the third quarter of last", result[0].term);
+        }
+
+        [Test]
+        public void DigitsWithoutSkipFunction_Replaced()
+        {
+            var result = _symSpell.LookupCompound("visit our offices 24/7");
+            Assert.AreEqual(1, result.Count);
+            Assert.AreEqual("visit our offices of a", result[0].term);
+        }
+
+        [TestCase("visit our offices 24/7", "visit our offices 24 7")]
+        [TestCase("th rd", "third")]
+        [TestCase("th 3 rd", "the 3 of")]
+        public void SkipDigitWords_ReturnsDigits(string source, string expected)
+        {
+            var digitRegex = new Regex("^\\d+$", RegexOptions.Compiled);
+            var result = _symSpell.LookupCompound(source, 2, digitRegex.IsMatch);
+            Assert.AreEqual(1, result.Count);
+            Assert.AreEqual(expected, result[0].term);
+        }
+
+        [Test]
+        public void SplittedWordAndFirstPartSkiped_ReturnsSplitted()
+        {
+            var result = _symSpell.LookupCompound("in te dhird quar ter oflast", 2, (term) => term == "quar");
+            Assert.AreEqual(1, result.Count);
+            Assert.AreEqual("in the third quar the of last", result[0].term);
+        }
+
+        [Test]
+        public void SplittedWordAndSecondPartSkiped_ReturnsSplitted()
+        {
+            var result = _symSpell.LookupCompound("in te dhird quar ter oflast", 2, (term) => term == "ter");
+            Assert.AreEqual(1, result.Count);
+            Assert.AreEqual("in the third our ter of last", result[0].term);
+        }
+    }
+}
\ No newline at end of file
diff --git a/SymSpell/SymSpell.cs b/SymSpell/SymSpell.cs
index 7304dfd..68be222 100644
--- a/SymSpell/SymSpell.cs
+++ b/SymSpell/SymSpell.cs
@@ -848,9 +848,10 @@ public List<SuggestItem> LookupCompound(string input)
 
     /// <summary>Find suggested spellings for a multi-word input string (supports word splitting/merging).</summary>
     /// <param name="input">The string being spell checked.</param>
-    /// <param name="editDistanceMax">The maximum edit distance between input and suggested words.</param>
+    /// <param name="editDistanceMax">The maximum edit distance between input and suggested words.</param>
+    /// <param name="skipSpellcheck">The function to check if a term should remain unchanged.</param>
     /// <returns>A List of SuggestItem object representing suggested correct spellings for the input string.</returns>
-    public List<SuggestItem> LookupCompound(string input, int editDistanceMax)
+    public List<SuggestItem> LookupCompound(string input, int editDistanceMax, Func<string, bool> skipSpellcheck = null)
     {
         //parse input string into single terms
         string[] termList1 = ParseWords(input);
@@ -858,22 +859,31 @@ public List<SuggestItem> LookupCompound(string input, int editDistanceMax)
         List<SuggestItem> suggestions = new List<SuggestItem>(); //suggestions for a single term
         List<SuggestItem> suggestionParts = new List<SuggestItem>(); //1 line with separate parts
         var distanceComparer = new EditDistance(this.distanceAlgorithm);
+        var termsToSkip = new HashSet<string>();
 
         //translate every term to its best suggestion, otherwise it remains unchanged
         bool lastCombi = false;
         for (int i = 0; i < termList1.Length; i++)
         {
+            // if skipSpellcheck returns true for term, leave it unchanged
+            if (skipSpellcheck != null && skipSpellcheck(termList1[i]))
+            {
+                termsToSkip.Add(termList1[i]);
+                suggestionParts.Add(CreateTermSuggestItem(termList1[i], editDistanceMax));
+                goto nextTerm;
+            }
+
             suggestions = Lookup(termList1[i], Verbosity.Top, editDistanceMax);
 
             //combi check, always before split
-            if ((i > 0) && !lastCombi)
+            if ((i > 0) && !lastCombi && !termsToSkip.Contains(termList1[i - 1]))
             {
                 List<SuggestItem> suggestionsCombi = Lookup(termList1[i - 1] + termList1[i], Verbosity.Top, editDistanceMax);
 
                 if (suggestionsCombi.Count > 0)
                 {
                     SuggestItem best1 = suggestionParts[suggestionParts.Count - 1];
-                    SuggestItem best2 = new SuggestItem();
+                    SuggestItem best2;
                     if (suggestions.Count > 0)
                     {
                         best2 = suggestions[0];
@@ -881,11 +891,7 @@ public List<SuggestItem> LookupCompound(string input, int editDistanceMax)
                     else
                     {
                         //unknown word
-                        best2.term = termList1[i];
-                        //estimated edit distance
-                        best2.distance = editDistanceMax + 1;
-                        //estimated word occurrence probability P=10 / (N * 10^word length l)
-                        best2.count = (long)((double)10 / Math.Pow((double)10, (double)best2.term.Length)); // 0;
+                        best2 = CreateTermSuggestItem(termList1[i], editDistanceMax);
                     }
 
                     //distance1=edit distance between 2 split terms und their best corrections : als comparative value for the combination
@@ -988,22 +994,12 @@ public List<SuggestItem> LookupCompound(string input, int editDistanceMax)
                     }
                     else
                     {
-                        SuggestItem si = new SuggestItem();
-                        si.term = termList1[i];
-                        //estimated word occurrence probability P=10 / (N * 10^word length l)
-                        si.count = (long)((double)10 / Math.Pow((double)10, (double)si.term.Length));
-                        si.distance = editDistanceMax + 1;
-                        suggestionParts.Add(si);
+                        suggestionParts.Add(CreateTermSuggestItem(termList1[i], editDistanceMax));
                     }
                 }
                 else
                 {
-                    SuggestItem si = new SuggestItem();
-                    si.term = termList1[i];
-                    //estimated word occurrence probability P=10 / (N * 10^word length l)
-                    si.count = (long)((double)10 / Math.Pow((double)10, (double)si.term.Length));
-                    si.distance = editDistanceMax + 1;
-                    suggestionParts.Add(si);
+                    suggestionParts.Add(CreateTermSuggestItem(termList1[i], editDistanceMax));
                 }
             }
         nextTerm:;
@@ -1024,6 +1020,17 @@ public List<SuggestItem> LookupCompound(string input, int editDistanceMax)
         return suggestionsLine;
     }
 
+    private static SuggestItem CreateTermSuggestItem(string term, int editDistanceMax)
+    {
+        return new SuggestItem()
+        {
+            term = term,
+            //estimated word occurrence probability P=10 / (N * 10^word length l)
+            count = (long)((double)10 / Math.Pow((double)10, (double)term.Length)),
+            distance = editDistanceMax + 1
+        };
+    }
+
     //######
 
     //WordSegmentation divides a string into words by inserting missing spaces at the appropriate positions