From e4e3cbae3f6bab8ef18cafe654c964015864ea4f Mon Sep 17 00:00:00 2001 From: Mikal Stordal Date: Sun, 7 Jan 2024 18:56:12 +0100 Subject: [PATCH] =?UTF-8?q?fix:=20add=20back=20the=20missing=20sanitizatio?= =?UTF-8?q?n=20=E2=80=A6because=20I=20forgot=20it=20was=20a=20thing=20we?= =?UTF-8?q?=20needed=20for=20the=20strict=20search.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Shoko.Server/Utilities/SeriesSearch.cs | 41 +++++++++++++++----------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/Shoko.Server/Utilities/SeriesSearch.cs b/Shoko.Server/Utilities/SeriesSearch.cs index d35fab928..0361ed9a2 100644 --- a/Shoko.Server/Utilities/SeriesSearch.cs +++ b/Shoko.Server/Utilities/SeriesSearch.cs @@ -40,15 +40,17 @@ public static string SanitizeFuzzy(string value, bool replaceInvalid) return value.CompactWhitespaces(); } + private static string SanitizeSearchInput(this string value) => + value.Replace('_', ' ') + .Replace('-', ' ') + .CompactWhitespaces() + .ToLowerInvariant(); + // This forces ASCII, because it's faster to stop caring if ss and ß are the same // No it's not perfect, but it works better for those who just want to do lazy searching private static string ForceASCII(this string value) => value.FilterSearchCharacters() - .Replace('_', ' ') - .Replace('-', ' ') .CompactWhitespaces() - // Case insensitive. We just removed the fancy characters, so latin - // alphabet lowercase is all we should have. .ToLowerInvariant(); private static readonly IStringDistance DiceSearch = new SorensenDice(); @@ -58,11 +60,13 @@ public static SearchResult DiceFuzzySearch(string text, string pattern, T if (string.IsNullOrWhiteSpace(text) || string.IsNullOrWhiteSpace(pattern)) return new(); - // Case insensitive. - text = text.ToLowerInvariant(); - pattern = pattern.ToLowerInvariant(); + // Sanitize inputs before use. + text = text.SanitizeSearchInput(); + pattern = pattern.SanitizeSearchInput(); + if (string.IsNullOrWhiteSpace(pattern) || string.IsNullOrWhiteSpace(text)) + return new(); - // Shortcut + // Strict search for any text (e.g. ASCII, Japanese Kanji/Kana, etc.). var index = text.IndexOf(pattern, StringComparison.Ordinal); if (index > -1) return new() @@ -74,19 +78,20 @@ public static SearchResult DiceFuzzySearch(string text, string pattern, T Result = value, }; - var inputString = text.ForceASCII(); - var query = pattern.ForceASCII(); - if (string.IsNullOrWhiteSpace(query) || string.IsNullOrWhiteSpace(inputString)) + // If strict search didn't work, then force ASCII and do a fuzzy search. + text = text.ForceASCII(); + pattern = pattern.ForceASCII(); + if (string.IsNullOrWhiteSpace(pattern) || string.IsNullOrWhiteSpace(text)) return new(); - // always search the longer string for the shorter one - if (query.Length > inputString.Length) - (inputString, query) = (query, inputString); + // Always search the longer string for the shorter one. + if (pattern.Length > text.Length) + (text, pattern) = (pattern, text); - var result = DiceSearch.Distance(inputString, query); + var result = DiceSearch.Distance(text, pattern); - // Don't count an error as liberally when the title is short - if (inputString.Length < 5 && result > 0.5) + // Don't count an error as liberally when the title is short. + if (text.Length < 5 && result > 0.5) return new(); if (result >= 0.8) @@ -95,7 +100,7 @@ public static SearchResult DiceFuzzySearch(string text, string pattern, T return new() { Distance = result, - LengthDifference = Math.Abs(query.Length - inputString.Length), + LengthDifference = Math.Abs(pattern.Length - text.Length), Match = text, Result = value, };