From 8078e701c9cbf5b5629157b150e5f3eed046f227 Mon Sep 17 00:00:00 2001 From: Mikal Stordal Date: Sun, 31 Dec 2023 00:51:57 +0100 Subject: [PATCH] fix: fix fuzzy search for non-ASCII text when the search pattern matches partially with the text before forcefully converting it to ASCII. --- Shoko.Server/Utilities/SeriesSearch.cs | 45 ++++++++++++-------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/Shoko.Server/Utilities/SeriesSearch.cs b/Shoko.Server/Utilities/SeriesSearch.cs index 30c00cd07..cc385d594 100644 --- a/Shoko.Server/Utilities/SeriesSearch.cs +++ b/Shoko.Server/Utilities/SeriesSearch.cs @@ -40,49 +40,46 @@ public static string SanitizeFuzzy(string value, bool replaceInvalid) return value.CompactWhitespaces(); } + // This forces ASCII, because it's faster to stop caring if ss and ß are the same + // No it's not perfect, but it works better for those who just want to do lazy searching + private static string ForceASCII(this string value) => + value.FilterSearchCharacters() + .Replace('_', ' ') + .Replace('-', ' ') + .CompactWhitespaces() + // Case insensitive. We just removed the fancy characters, so latin + // alphabet lowercase is all we should have. + .ToLowerInvariant(); + private static readonly IStringDistance DiceSearch = new SorensenDice(); public static SearchResult DiceFuzzySearch(string text, string pattern, T value) { if (string.IsNullOrWhiteSpace(text) || string.IsNullOrWhiteSpace(pattern)) return new(); - // This forces ASCII, because it's faster to stop caring if ss and ß are the same - // No it's not perfect, but it works better for those who just want to do lazy searching - string inputString = text.FilterSearchCharacters(); - string query = pattern.FilterSearchCharacters(); - inputString = inputString.Replace('_', ' ').Replace('-', ' '); - query = query.Replace('_', ' ').Replace('-', ' '); - query = query.CompactWhitespaces(); - inputString = inputString.CompactWhitespaces(); - // Case insensitive. 
We just removed the fancy characters, so latin alphabet lowercase is all we should have - query = query.ToLowerInvariant(); - inputString = inputString.ToLowerInvariant(); - - if (string.IsNullOrWhiteSpace(query) || string.IsNullOrWhiteSpace(inputString)) - return new(); // Shortcut - var lengthDiff = Math.Abs(query.Length - inputString.Length); - int index = inputString.IndexOf(query, StringComparison.Ordinal); + var index = text.IndexOf(pattern, StringComparison.Ordinal); if (index > -1) return new() { ExactMatch = true, Index = index, - LengthDifference = lengthDiff, + LengthDifference = Math.Abs(pattern.Length - text.Length), Match = text, Result = value, }; + var inputString = text.ForceASCII(); + var query = pattern.ForceASCII(); + if (string.IsNullOrWhiteSpace(query) || string.IsNullOrWhiteSpace(inputString)) + return new(); + // always search the longer string for the shorter one if (query.Length > inputString.Length) - { - string temp = query; - query = inputString; - inputString = temp; - } + (inputString, query) = (query, inputString); - double result = DiceSearch.Distance(inputString, query); + var result = DiceSearch.Distance(inputString, query); // Don't count an error as liberally when the title is short if (inputString.Length < 5 && result > 0.5) @@ -94,7 +91,7 @@ public static SearchResult DiceFuzzySearch(string text, string pattern, T return new() { Distance = result, - LengthDifference = lengthDiff, + LengthDifference = Math.Abs(query.Length - inputString.Length), Match = text, Result = value, };