Skip to content

Commit

Permalink
Support unambiguous detection of language if only prefixes are supplied
Browse files Browse the repository at this point in the history
- Cease the search as soon as the ambiguity is resolved

Simplify success exit criteria for detecting language

Correct language deduction if prefixes remain ambiguous
  • Loading branch information
pjkundert authored and matejcik committed Jan 5, 2024
1 parent 264145f commit 4b14fc5
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 3 deletions.
27 changes: 24 additions & 3 deletions src/mnemonic/mnemonic.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,15 +93,36 @@ def normalize_string(txt: t.AnyStr) -> str:

@classmethod
def detect_language(cls, code: str) -> str:
    """Detect the wordlist language of a mnemonic, accepting word prefixes.

    Words are scanned in the order they appear until exactly one language
    remains whose wordlist has an entry starting with every word seen so
    far.  Some words are complete in one language but only a prefix in
    another:

        english: abandon ... about
        french:  abandon ... aboutir

    so if several languages still match after every word has been scanned,
    a language is accepted only when it is the single one that contains
    some word(s) as exact wordlist entries.

    Args:
        code: Mnemonic phrase; each word may be a full word or a prefix.

    Returns:
        The name of the detected language.

    Raises:
        ConfigurationError: If a word matches no remaining candidate
            language, or the language is still ambiguous after all words
            have been examined.
    """
    code = cls.normalize_string(code)
    possible = set(cls(lang) for lang in cls.list_languages())
    # De-duplicate but keep phrase order.  Iterating a plain set of strings
    # follows hash order, which varies between processes, and both the early
    # return and the word named in the error below depend on scan order.
    words = list(dict.fromkeys(code.split()))
    for word in words:
        # Possible languages have candidate(s) starting with the word/prefix.
        possible = {
            p for p in possible if any(c.startswith(word) for c in p.wordlist)
        }
        if not possible:
            raise ConfigurationError(f"Language unrecognized for {word!r}")
        if len(possible) == 1:
            return possible.pop().language
    # Multiple languages match: a prefix in many, but an exact match in
    # exactly one language determines the result.
    complete = set()
    for word in words:
        exact = {p for p in possible if word in p.wordlist}
        if len(exact) == 1:
            complete.update(exact)
    if len(complete) == 1:
        return complete.pop().language
    # Sort the names so the message does not depend on set iteration order.
    raise ConfigurationError(
        f"Language ambiguous between {', '.join(sorted(p.language for p in possible))}"
    )
Expand Down
22 changes: 22 additions & 0 deletions tests/test_mnemonic.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,16 @@ def test_failed_checksum(self) -> None:
def test_detection(self) -> None:
self.assertEqual("english", Mnemonic.detect_language("security"))

self.assertEqual(
"english", Mnemonic.detect_language("fruit wave dwarf")
) # ambiguous up to wave
self.assertEqual(
"english", Mnemonic.detect_language("fru wago dw")
) # ambiguous french/english up to dwarf prefix
self.assertEqual(
"french", Mnemonic.detect_language("fru wago dur enje")
) # ambiguous french/english up to enjeu prefix

with self.assertRaises(Exception):
Mnemonic.detect_language(
"jaguar xxxxxxx"
Expand All @@ -67,8 +77,20 @@ def test_detection(self) -> None:
"jaguar jaguar"
) # Ambiguous after examining all words

# Allowing word prefixes in language detection presents ambiguity issues. Require exactly
# one language that matches all prefixes, or one language matching some word(s) exactly.
self.assertEqual("english", Mnemonic.detect_language("jaguar security"))
self.assertEqual("french", Mnemonic.detect_language("jaguar aboyer"))
self.assertEqual("english", Mnemonic.detect_language("abandon about"))
self.assertEqual("french", Mnemonic.detect_language("abandon aboutir"))
self.assertEqual("french", Mnemonic.detect_language("fav financer"))
self.assertEqual("czech", Mnemonic.detect_language("fav finance"))
with self.assertRaises(Exception):
Mnemonic.detect_language("favor finan")
self.assertEqual("czech", Mnemonic.detect_language("flanel"))
self.assertEqual("portuguese", Mnemonic.detect_language("flanela"))
with self.assertRaises(Exception):
Mnemonic.detect_language("flane")

def test_utf8_nfkd(self) -> None:
# The same sentence in various UTF-8 forms
Expand Down

0 comments on commit 4b14fc5

Please sign in to comment.