Skip to content

Commit

Permalink
Move capitalization logic one layer up.
Browse files Browse the repository at this point in the history
  • Loading branch information
hwalinga committed Sep 16, 2020
1 parent 47fc9ff commit 368435b
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 55 deletions.
42 changes: 7 additions & 35 deletions quantulum3/classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@

import pkg_resources

# Quantulum
from . import language, load
from .load import cached

# Semi-dependencies
try:
from sklearn.linear_model import SGDClassifier
Expand All @@ -27,10 +31,6 @@
except ImportError:
wikipedia = None

# Quantulum
from . import language, load
from .load import cached


_LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -241,39 +241,11 @@ def disambiguate_entity(key, text, lang="en_US"):


###############################################################################
def disambiguate_unit(unit, text, lang="en_US"):
    """
    Pick a single unit for an ambiguous name, symbol or abbreviation.

    The candidates for ``unit`` are scored first. If that does not leave
    exactly one, the case of the last character of ``unit`` is swapped (in
    ``text`` as well) and the scoring is repeated; a non-empty, strictly
    smaller result replaces the first one. When several candidates still
    remain, a warning is logged and the first one yielded by iteration is
    returned.
    """
    candidates = disambiguate_unit_by_score(unit, text, lang)
    if len(candidates) == 1:
        return next(iter(candidates))

    try:
        # Before settling for an arbitrary candidate, retry with the
        # capitalization of the final character flipped.
        swapped_unit = unit[:-1] + unit[-1].swapcase()
        swapped_text = text.replace(unit, swapped_unit)

        retry = disambiguate_unit_by_score(swapped_unit, swapped_text, lang)
        retry_size = len(retry)
        if retry_size == 1:
            return next(iter(retry))

        # Only adopt the retry when it is non-empty and strictly narrower.
        if retry_size > 0 and retry_size < len(candidates):
            candidates = retry
    except KeyError:
        # The retry failed; keep the candidates from the first attempt.
        pass

    _LOGGER.warning(
        "Could not resolve ambiguous units: '{}'. For unit '{}' in text '{}'. "
        "Taking a random.".format(", ".join(str(u) for u in candidates), unit, text)
    )
    return next(iter(candidates))


def disambiguate_unit_by_score(unit, text, lang):
def attempt_disambiguate_unit(unit, text, lang):
"""Resolve ambiguity between units with same names, symbols or abbreviations.
Returns list of possibilities"""
new_unit = (
load.units(lang).symbols.get(unit)
or load.units(lang).surfaces.get(unit)
Expand Down
82 changes: 62 additions & 20 deletions quantulum3/disambiguate.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,42 +3,84 @@
:mod:`Quantulum` disambiguation functions.
"""

import logging

# Quantulum
from . import classifier as clf
from . import no_classifier as no_clf
from . import load
from . import no_classifier as no_clf

_LOGGER = logging.getLogger(__name__)

###############################################################################


def disambiguate_unit(unit_surface, text, lang="en_US"):
"""
Resolve ambiguity between units with same names, symbols or abbreviations.

The surface is first looked up exactly as written. If that does not yield
exactly one unit, the lookup is repeated with altered capitalization and
the smaller non-empty result is kept before a final unit is picked.

:returns (str) unit name of the resolved unit
:raises KeyError: if no unit can be found for ``unit_surface``
"""
# NOTE(review): the statements from here down to ``base = base.name`` look
# like the previous implementation sitting next to its replacement --
# confirm which of the two is meant to stay.
if clf.USE_CLF:
base = clf.disambiguate_unit(unit_surface, text, lang).name
else:
base = (
load.units(lang).symbols[unit_surface]
or load.units(lang).surfaces[unit_surface]
or load.units(lang).surfaces_lower[unit_surface.lower()]
or load.units(lang).symbols_lower[unit_surface.lower()]
)

if len(base) > 1:
base = no_clf.disambiguate_no_classifier(base, text, lang)
elif len(base) == 1:
base = next(iter(base))

if base:
base = base.name
# First attempt: look the surface up exactly as it was written.
units = attempt_disambiguate_unit(unit_surface, text, lang)
# A single candidate means there is nothing left to disambiguate.
if units and len(units) == 1:
return next(iter(units)).name

if len(unit_surface) > 2:
# We will lower case everything except the first letter and see if
# there is a better match.
unit_changed = unit_surface[0] + unit_surface[1:].lower()
# Apply the same change to the text that is passed along with the surface.
text_changed = text.replace(unit_surface, unit_changed)
new_units = attempt_disambiguate_unit(unit_changed, text_changed, lang)
units = get_a_better_one(units, new_units)
return resolve_ambiguity(units, unit_surface, text)

# Change the capitalization of the last letter to find a better match.
# The last letter is sometimes a cause of confusion, but the
# capitalization of the prefix is too important to alter.
unit_changed = unit_surface[:-1] + unit_surface[-1].swapcase()
text_changed = text.replace(unit_surface, unit_changed)
new_units = attempt_disambiguate_unit(unit_changed, text_changed, lang)
units = get_a_better_one(units, new_units)
return resolve_ambiguity(units, unit_surface, text)


def attempt_disambiguate_unit(unit_surface, text, lang):
    """
    Collect the candidate units for ``unit_surface`` as it appears in ``text``.

    Delegates to the classifier-based lookup when ``clf.USE_CLF`` is set and
    to the classifier-free lookup otherwise.

    :returns: the candidates produced by the selected lookup, or ``None``
        when that lookup raises ``KeyError``
    """
    try:
        if clf.USE_CLF:
            return clf.attempt_disambiguate_unit(unit_surface, text, lang)
        return no_clf.attempt_disambiguate_no_classifier(unit_surface, text, lang)
    except KeyError:
        # A failed lookup is reported as "no candidates" instead of an error.
        return None


return base
def get_a_better_one(old, new):
    """
    Choose between two candidate collections.

    An empty or ``None`` collection always loses to the other one. When both
    hold candidates, ``new`` wins only if it is strictly smaller than ``old``.
    """
    if new and (not old or len(new) < len(old)):
        return new
    return old


def resolve_ambiguity(units, unit, text):
    """
    Reduce the remaining candidates to a single unit name.

    :param units: candidate units, possibly empty or ``None``
    :param unit: the unit surface the candidates were found for
    :param text: the text the surface was taken from
    :returns: ``name`` of the first candidate yielded by iteration
    :raises KeyError: if there are no candidates at all
    """
    if not units:
        raise KeyError('Could not find unit "%s" from "%s"' % (unit, text))
    if len(units) != 1:
        # More than one candidate is left: report it, then fall through and
        # take the first one anyway.
        listed = ", ".join(str(u) for u in units)
        _LOGGER.warning(
            "Could not resolve ambiguous units: '{}'. For unit '{}' in text '{}'. "
            "Taking a random.".format(listed, unit, text)
        )
    return next(iter(units)).name


###############################################################################


def disambiguate_entity(key, text, lang="en_US"):
"""
Resolve ambiguity between entities with same dimensionality.
Expand Down
18 changes: 18 additions & 0 deletions quantulum3/no_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,21 @@ def disambiguate_no_classifier(entities, text, lang="en_US"):
if relative > max_relative or (relative == max_relative and count > max_count):
max_entity, max_count, max_relative = entity, count, relative
return max_entity


def attempt_disambiguate_no_classifier(unit_surface, text, lang):
    """
    Look up the candidate units for ``unit_surface`` without the classifier.

    The surface is tried against the symbols and surfaces of ``lang``, then
    against their lower-cased variants; the first non-empty match is used.
    Several matches are narrowed down with ``disambiguate_no_classifier``.

    :returns: the matching units; a one-element list when several matches
        could be narrowed down to one
    :raises KeyError: if no unit matches ``unit_surface``
    """
    units = load.units(lang)
    lowered = unit_surface.lower()
    base = (
        units.symbols[unit_surface]
        or units.surfaces[unit_surface]
        or units.surfaces_lower[lowered]
        or units.symbols_lower[lowered]
    )
    if not base:
        raise KeyError('Could not find unit "%s" from "%s"' % (unit_surface, text))
    if len(base) == 1:
        # A single match needs no narrowing; hand it back as it is instead of
        # leaving the function without a result.
        return base
    possible_base = disambiguate_no_classifier(base, text, lang)
    if not possible_base:
        # Narrowing produced nothing usable: keep every match.
        return base
    return [possible_base]

0 comments on commit 368435b

Please sign in to comment.