nielstron · hwalinga · Sep 16, 2020 · Sep 16, 2020 · Sep 16, 2020 · Sep 16, 2020
diff --git a/quantulum3/_lang/en_US/tests/quantities.json b/quantulum3/_lang/en_US/tests/quantities.json
@@ -1373,5 +1373,15 @@
           "surface": "three million, two hundred & forty"
         }
       ]
+  },
+  {
+    "req": "The battery has 2nw.",
+    "res": [
+        {
+            "value": 2,
+            "unit": "nanowatt",
+            "surface": "2nw"
+        }
+    ]
   }
 ]
diff --git a/quantulum3/classifier.py b/quantulum3/classifier.py
@@ -108,7 +108,6 @@ def _clean_text_lang(lang):
 def train_classifier(
     parameters=None, ngram_range=(1, 1), store=True, lang="en_US", n_jobs=None
 ):
-
     """
     Train the intent classifier
     TODO auto invoke if sklearn version is new or first install or sth
@@ -240,38 +239,37 @@ def disambiguate_entity(key, text, lang="en_US"):
 
 
 ###############################################################################
-def disambiguate_unit(unit, text, lang="en_US"):
-    """
-    Resolve ambiguity between units with same names, symbols or abbreviations.
-    """
 
+
+def attempt_disambiguate_unit(unit, text, lang):
+    """Look up the units matching the surface ``unit``; raise KeyError if none.
+    Several matches are narrowed to the classifier's top-scoring one if any."""
     new_unit = (
         load.units(lang).symbols.get(unit)
         or load.units(lang).surfaces.get(unit)
         or load.units(lang).surfaces_lower.get(unit.lower())
         or load.units(lang).symbols_lower.get(unit.lower())
     )
     if not new_unit:
-        return load.units(lang).names.get("unk")
-
-    if len(new_unit) > 1:
-        transformed = classifier(lang).tfidf_model.transform([clean_text(text, lang)])
-        scores = classifier(lang).classifier.predict_proba(transformed).tolist()[0]
-        scores = zip(scores, classifier(lang).target_names)
-
-        # Filter for possible names
-        names = [i.name for i in new_unit]
-        scores = [i for i in scores if i[1] in names]
-
-        # Sort by rank
-        scores = sorted(scores, key=lambda x: x[0], reverse=True)
-        try:
-            final = load.units(lang).names[scores[0][1]]
-            _LOGGER.debug('\tAmbiguity resolved for "%s" (%s)' % (unit, scores))
-        except IndexError:
-            _LOGGER.debug('\tAmbiguity not resolved for "%s"' % unit)
-            final = next(iter(new_unit))
-    else:
-        final = next(iter(new_unit))
-
-    return final
+        raise KeyError('Could not find unit "%s" from "%s"' % (unit, text))
+    if len(new_unit) == 1:
+        return new_unit
+
+    # Score the cleaned text with the classifier; pair each score with its target
+    transformed = classifier(lang).tfidf_model.transform([clean_text(text, lang)])
+    scores = classifier(lang).classifier.predict_proba(transformed).tolist()[0]
+    scores = zip(scores, classifier(lang).target_names)
+
+    # Keep only the scores whose target name is one of the candidate units
+    names = [i.name for i in new_unit]
+    scores = [i for i in scores if i[1] in names]
+
+    # Best score first; an empty list raises IndexError in the lookup below
+    scores = sorted(scores, key=lambda x: x[0], reverse=True)
+    try:
+        new_unit = [load.units(lang).names[scores[0][1]]]
+        _LOGGER.debug('\tAmbiguity resolved for "%s" (%s)' % (unit, scores))
+        return new_unit
+    except IndexError:
+        _LOGGER.debug('\tAmbiguity not resolved for "%s"' % unit)
+        return new_unit
diff --git a/quantulum3/disambiguate.py b/quantulum3/disambiguate.py
@@ -3,41 +3,95 @@
 :mod:`Quantulum` disambiguation functions.
 """
 
+import logging
+
 from . import classifier as clf
 from . import load
 from . import no_classifier as no_clf
 
+_LOGGER = logging.getLogger(__name__)
 
 ###############################################################################
+
+
 def disambiguate_unit(unit_surface, text, lang="en_US"):
     """
     Resolve ambiguity between units with same names, symbols or abbreviations.
     :returns (str) unit name of the resolved unit
     """
-    if clf.USE_CLF:
-        base = clf.disambiguate_unit(unit_surface, text, lang).name
-    else:
-        base = (
-            load.units(lang).symbols[unit_surface]
-            or load.units(lang).surfaces[unit_surface]
-            or load.units(lang).surfaces_lower[unit_surface.lower()]
-            or load.units(lang).symbols_lower[unit_surface.lower()]
-        )
+    units = attempt_disambiguate_unit(unit_surface, text, lang)
+    if units and len(units) == 1:
+        return next(iter(units)).name
+
+    # No unique match yet: retry with the capitalization of the unit changed.
+    # Capitalization is a frequent source of confusion, but the case of the
+    # first (prefix) letter is too significant to alter.
 
-        if len(base) > 1:
-            base = no_clf.disambiguate_no_classifier(base, text, lang)
-        elif len(base) == 1:
-            base = next(iter(base))
+    # If the surface is longer than two characters, lower-case everything
+    # except the first letter and look the result up again.
+    if len(unit_surface) > 2:
+        unit_changed = unit_surface[0] + unit_surface[1:].lower()
+        if unit_changed == unit_surface:
+            return resolve_ambiguity(units, unit_surface, text)
+        text_changed = text.replace(unit_surface, unit_changed)
+        new_units = attempt_disambiguate_unit(unit_changed, text_changed, lang)
+        units = get_a_better_one(units, new_units)
+        return resolve_ambiguity(units, unit_surface, text)
 
-        if base:
-            base = base.name
+    if not unit_surface or unit_surface[0] not in load.METRIC_PREFIXES.keys():
+        # Only apply the next workaround if the first letter is a metric prefix
+        return resolve_ambiguity(units, unit_surface, text)
+
+    unit_changed = unit_surface[:-1] + unit_surface[-1].swapcase()
+    text_changed = text.replace(unit_surface, unit_changed)
+    new_units = attempt_disambiguate_unit(unit_changed, text_changed, lang)
+    units = get_a_better_one(units, new_units)
+    return resolve_ambiguity(units, unit_surface, text)
+
+
+def attempt_disambiguate_unit(unit_surface, text, lang):
+    """Return the candidate units from the active backend, or None on KeyError."""
+    try:
+        if clf.USE_CLF:
+            return clf.attempt_disambiguate_unit(unit_surface, text, lang)
         else:
-            base = "unk"
+            return no_clf.attempt_disambiguate_no_classifier(unit_surface, text, lang)
+    except KeyError:
+        return None
+
+
+def get_a_better_one(old, new):
+    """Choose between two candidate collections, either of which may be None.
+
+    An empty or missing ``new`` yields ``old``; an empty or missing ``old``
+    yields ``new``; otherwise the strictly shorter one wins and a tie keeps
+    ``old``.
+    """
+    if new and (not old or len(new) < len(old)):
+        return new
+    return old
+
 
-    return base
+def resolve_ambiguity(units, unit, text):
+    """Reduce the candidates in ``units`` to a single unit name.
+
+    Without candidates: KeyError when ``unit`` is non-empty and the classifier
+    is in use, otherwise "unk". With several candidates: log a warning and
+    return the smallest name in sort order so the result is deterministic.
+    """
+    if not units:
+        if unit and clf.USE_CLF:
+            raise KeyError('Could not find unit "%s" from "%s"' % (unit, text))
+        return "unk"
+    if len(units) > 1:
+        msg = "Could not resolve ambiguous units: '{}'. For unit '{}' in text '{}'. "
+        _LOGGER.warning(msg.format(", ".join(str(u) for u in units), unit, text))
+    return min(u.name for u in units)
 
 
 ###############################################################################
+
+
 def disambiguate_entity(key, text, lang="en_US"):
     """
     Resolve ambiguity between entities with same dimensionality.

diff --git a/quantulum3/no_classifier.py b/quantulum3/no_classifier.py
@@ -33,3 +33,20 @@ def disambiguate_no_classifier(entities, text, lang="en_US"):
         if relative > max_relative or (relative == max_relative and count > max_count):
             max_entity, max_count, max_relative = entity, count, relative
     return max_entity
+
+
+def attempt_disambiguate_no_classifier(unit_surface, text, lang):
+    """Return candidate units for ``unit_surface``; raise KeyError if none match.
+    With several matches, ``disambiguate_no_classifier`` may narrow them to one."""
+    units = load.units(lang)
+    # .get() (as in classifier.py) lets a miss fall through to the next table
+    base = (
+        units.symbols.get(unit_surface)
+        or units.surfaces.get(unit_surface)
+        or units.surfaces_lower.get(unit_surface.lower())
+        or units.symbols_lower.get(unit_surface.lower())
+    )
+    if not base:
+        raise KeyError('Could not find unit "%s" from "%s"' % (unit_surface, text))
+    chosen = disambiguate_no_classifier(base, text, lang) if len(base) > 1 else None
+    return [chosen] if chosen else base