Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

T report undefined corpora #20

Draft
wants to merge 3 commits into
base: dev
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions config.py
Original file line number Diff line number Diff line change
@@ -62,6 +62,11 @@
# Set to True to enable "lab mode", potentially enabling experimental features and access to lab-only corpora
LAB_MODE = False

# How to check the availability of corpora: if True, try to select
# each corpus in CQP (slower but more reliable); if False, only check
# for the corpus files in the CWB registry directory (somewhat faster).
CHECK_AVAILABLE_CORPORA_STRICTLY = True

# Plugins to load
PLUGINS = []

46 changes: 27 additions & 19 deletions korp/cwb.py
Original file line number Diff line number Diff line change
@@ -22,10 +22,12 @@ def init(self, executable, scan_executable, registry, locale, encoding):
self.locale = locale
self.encoding = encoding

def run_cqp(self, command, attr_ignore=False, abort_event=None):
def run_cqp(self, command, attr_ignore=False, abort_event=None, errors="strict"):
"""Call the CQP binary with the given command, and the request data.
Yield one result line at the time, disregarding empty lines.
If there is an error, raise a CQPError exception.
If there is an error, raise a CQPError exception, unless the
parameter errors is "ignore" or "report" (report errors at the
beginning of the output as lines beginning with "CQP Error:").
"""
env = os.environ.copy()
env["LC_COLLATE"] = self.locale
@@ -57,26 +59,32 @@ def run_cqp(self, command, attr_ignore=False, abort_event=None):
continue
break

if error:
if error and errors != "ignore":
error = error.decode(self.encoding)
# Remove newlines from the error string:
error = re.sub(r"\s+", r" ", error)
# Keep only the first CQP error (the rest are consequences):
error = re.sub(r"^CQP Error: *", r"", error)
error = re.sub(r" *(CQP Error:).*$", r"", error)
# Ignore certain errors:
# 1) "show +attr" for unknown attr,
# 2) querying unknown structural attribute,
# 3) calculating statistics for empty results
if (
not (attr_ignore and "No such attribute:" in error)
and "is not defined for corpus" not in error
and "cl->range && cl->size > 0" not in error
and "neither a positional/structural attribute" not in error
and "CL: major error, cannot compose string: invalid UTF8 string passed to cl_string_canonical..."
not in error
):
raise utils.CQPError(error)
if errors == "report":
# Each error on its own line beginning with "CQP Error:"
error = re.sub(r" +(CQP Error: *)", r"\n\1", error)
for line in error.split("\n"):
yield line
else:
# Keep only the first CQP error (the rest are consequences):
error = re.sub(r"^CQP Error: *", r"", error)
error = re.sub(r" *(CQP Error:).*$", r"", error)
# Ignore certain errors:
# 1) "show +attr" for unknown attr,
# 2) querying unknown structural attribute,
# 3) calculating statistics for empty results
if (
not (attr_ignore and "No such attribute:" in error)
and "is not defined for corpus" not in error
and "cl->range && cl->size > 0" not in error
and "neither a positional/structural attribute" not in error
and "CL: major error, cannot compose string: invalid UTF8 string passed to cl_string_canonical..."
not in error
):
raise utils.CQPError(error)
for line in reply.decode(self.encoding, errors="ignore").split(
"\n"): # We don't use splitlines() since it might split on special characters in the data
if line:
94 changes: 91 additions & 3 deletions korp/views/info.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import os
import re
import time

from flask import Blueprint
from flask import current_app as app
from pymemcache.exceptions import MemcacheError

import korp
@@ -26,9 +29,10 @@ def sleep(args):
@utils.main_handler
def info(args):
"""Get version information about list of available corpora."""
strict = utils.parse_bool(args, "strict", False)
if args["cache"]:
with memcached.get_client() as mc:
result = mc.get("%s:info" % utils.cache_prefix(mc))
result = mc.get("%s:info_%s" % (utils.cache_prefix(mc), int(strict)))
if result:
if "debug" in args:
result.setdefault("DEBUG", {})
@@ -38,6 +42,14 @@ def info(args):

corpora = cwb.run_cqp("show corpora;")
version = next(corpora)
# CQP "show corpora" lists all corpora in the registry, but some
# of them might nevertheless cause a "corpus undefined" error in
# CQP, for example, because of missing data, so filter them out if
# strict=true. However, filtering a large number of corpora slows
# down the info command, so it is disabled by default. Caching in
# _filter_undefined_corpora helps, though.
if strict:
corpora, _ = _filter_undefined_corpora(list(corpora), args["cache"])

protected = utils.get_protected_corpora()

@@ -50,7 +62,7 @@ def info(args):

if args["cache"]:
with memcached.get_client() as mc:
added = mc.add("%s:info" % utils.cache_prefix(mc), result)
added = mc.add("%s:info_%s" % (utils.cache_prefix(mc), int(strict)), result)
if added and "debug" in args:
result.setdefault("DEBUG", {})
result["DEBUG"]["cache_saved"] = True
@@ -65,10 +77,12 @@ def corpus_info(args, no_combined_cache=False):
utils.assert_key("corpus", args, utils.IS_IDENT, True)

corpora = utils.parse_corpora(args)
report_undefined_corpora = utils.parse_bool(
args, "report_undefined_corpora", False)

# Check if whole query is cached
if args["cache"]:
checksum_combined = utils.get_hash((sorted(corpora),))
checksum_combined = utils.get_hash((sorted(corpora), report_undefined_corpora))
save_cache = []
with memcached.get_client() as mc:
combined_cache_key = "%s:info_%s" % (utils.cache_prefix(mc), checksum_combined)
@@ -87,6 +101,10 @@ def corpus_info(args, no_combined_cache=False):

cmd = []

if report_undefined_corpora:
corpora, undefined_corpora = _filter_undefined_corpora(
corpora, args["cache"], app.config["CHECK_AVAILABLE_CORPORA_STRICTLY"])

if args["cache"]:
with memcached.get_client() as mc:
memcached_keys = {}
@@ -154,6 +172,9 @@ def corpus_info(args, no_combined_cache=False):
result["total_size"] = total_size
result["total_sentences"] = total_sentences

if report_undefined_corpora:
result["undefined_corpora"] = undefined_corpora

if args["cache"] and not no_combined_cache:
# Cache whole query
try:
@@ -166,3 +187,70 @@ def corpus_info(args, no_combined_cache=False):
result.setdefault("DEBUG", {})
result["DEBUG"]["cache_saved"] = True
yield result


def _filter_undefined_corpora(corpora, caching=True, strict=True):
    """Split corpora into defined and undefined ones.

    Arguments:
        corpora: list of corpus ids to check.
        caching: if true, first look up the result for this exact
            (corpora, strict) combination in memcached and return it
            if found; otherwise compute the result and try to store it.
        strict: if true, try to select each corpus in CQP and treat
            those for which CQP reports "Corpus ... is undefined" as
            undefined; otherwise only check whether the CWB registry
            directory contains a file named as the lowercased corpus id.

    Returns:
        A pair (defined, undefined) of lists. The defined corpora keep
        the order of the argument corpora. With strict, the undefined
        corpora are listed with the names and in the order reported by
        CQP; without strict, they keep the order of the argument.
    """
    cache_key = None
    if caching:
        checksum = utils.get_hash((corpora, strict))
        with memcached.get_client() as mc:
            cache_key = "%s:corpora_defined_%s" % (utils.cache_prefix(mc),
                                                   checksum)
            cached = mc.get(cache_key)
        if cached:
            # Since this is not the result of a command, we cannot
            # add debug information on using cache to the result.
            return cached
        # TODO: Add per-corpus caching

    if strict:
        # Stricter: detects corpora that have a registry file but
        # whose data makes CQP regard them as undefined when trying to
        # use them
        commands = [corpus.upper() + ";" for corpus in corpora]
        commands.append("exit")
        undefined = []
        for line in cwb.run_cqp(commands, errors="report"):
            if not line.startswith("CQP Error:"):
                # Errors are reported at the beginning of the output,
                # so skip the rest
                break
            matchobj = re.match(
                r"CQP Error: Corpus ``(.+?)'' is undefined", line)
            if matchobj:
                undefined.append(str(matchobj.group(1)))
        if undefined:
            # Build the lookup set once instead of once per corpus
            undefined_set = set(undefined)
            defined = [corpus for corpus in corpora
                       if corpus not in undefined_set]
        else:
            defined = corpora
    else:
        # It is somewhat faster but less reliable to check the
        # registry only
        registry_files = set(os.listdir(cwb.registry))
        defined = [corpus for corpus in corpora
                   if corpus.lower() in registry_files]
        undefined = [corpus for corpus in corpora
                     if corpus.lower() not in registry_files]

    result = (defined, undefined)

    if caching:
        # Saving to the cache is best-effort: a memcached failure must
        # not prevent returning the computed result.
        try:
            with memcached.get_client() as mc:
                mc.add(cache_key, result)
        except MemcacheError:
            pass

    return result