Skip to content

Commit

Permalink
Correct language deduction if prefixes remain ambiguous
Browse files Browse the repository at this point in the history
  • Loading branch information
pjkundert committed Nov 12, 2023
1 parent 07a4be4 commit cf7df0a
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 4 deletions.
26 changes: 22 additions & 4 deletions src/mnemonic/mnemonic.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,16 +90,34 @@ def normalize_string(txt: AnyStr) -> str:

@classmethod
def detect_language(cls, code: str) -> str:
# NOTE(review): this span is a rendered diff whose +/- markers and indentation were lost, so
# pre-commit and post-commit lines are interleaved. The page header reports 4 deletions for this
# file; the lines flagged "[old]" below are the presumed deletions -- confirm against the commit.
# [old] one-line docstring, presumably superseded by the multi-line docstring that follows.
"""Scan the Mnemonic until the language becomes unambiguous, including as abbreviation prefixes."""
"""Scan the Mnemonic until the language becomes unambiguous, including as abbreviation prefixes.
Unfortunately, there are valid words that are ambiguous between languages, which are complete words
in one language and are prefixes in another:
english: abandon ... about
french: abandon ... aboutir
If prefixes remain ambiguous, require exactly one language where word(s) match exactly.
"""
# Normalize the input with the class's own normalize_string before splitting it into words.
code = cls.normalize_string(code)
# Start from one instance per entry of list_languages(); the prefix pass below only narrows this set.
possible = set(cls(lang) for lang in cls.list_languages())
# [old] loop header over the words in input order, duplicates included; presumably replaced by
# the two lines that follow it.
for word in code.split():
# Deduplicate the words; the same set is scanned again by the exact-match pass further down.
# Set iteration order is arbitrary, so which unmatched word the "unrecognized" error names is not fixed.
words = set(code.split())
for word in words:
# possible languages have candidate(s) starting with the word/prefix
possible = set(p for p in possible if any(c.startswith( word ) for c in p.wordlist))
# [old] early return from inside the loop (next two lines); presumably moved to after the loop,
# where the identical check appears again, so that every word is examined first.
if len(possible) == 1:
return possible.pop().language
# No remaining language has any wordlist entry starting with this word.
if not possible:
raise ConfigurationError(f"Language unrecognized for {word!r}")
# Prefix pass finished: a single surviving language settles the answer.
if len(possible) == 1:
return possible.pop().language
# Multiple languages match: A prefix in many, but an exact match in one determines language.
complete = set()
for word in words:
# Languages (among those still possible) whose wordlist contains this word as a whole entry.
exact = set(p for p in possible if word in p.wordlist)
# Only a word that is a whole entry in exactly one language counts as evidence for that language.
if len(exact) == 1:
complete.update(exact)
# Accept only when all such words point at the same single language.
if len(complete) == 1:
return complete.pop().language
# Otherwise report every language that still matched all the prefixes.
raise ConfigurationError(
f"Language ambiguous between {', '.join( p.language for p in possible)}"
)
Expand Down
12 changes: 12 additions & 0 deletions tests/test_mnemonic.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,20 @@ def test_detection(self) -> None:
"jaguar jaguar"
) # Ambiguous after examining all words

# Allowing word prefixes in language detection presents ambiguity issues. Require exactly
# one language that matches all prefixes, or one language matching some word(s) exactly.
self.assertEqual("english", Mnemonic.detect_language("jaguar security"))
self.assertEqual("french", Mnemonic.detect_language("jaguar aboyer"))
self.assertEqual("english", Mnemonic.detect_language("abandon about"))
self.assertEqual("french", Mnemonic.detect_language("abandon aboutir"))
self.assertEqual("french", Mnemonic.detect_language("fav financer"))
self.assertEqual("czech", Mnemonic.detect_language("fav finance"))
with self.assertRaises(Exception):
Mnemonic.detect_language("favor finan")
self.assertEqual("czech", Mnemonic.detect_language("flanel"))
self.assertEqual("portuguese", Mnemonic.detect_language("flanela"))
with self.assertRaises(Exception):
Mnemonic.detect_language("flane")

def test_utf8_nfkd(self) -> None:
# The same sentence in various UTF-8 forms
Expand Down

0 comments on commit cf7df0a

Please sign in to comment.