From e40ad64df78000e8b7e847ced8eee0547bc9e4a2 Mon Sep 17 00:00:00 2001 From: Jyrki Niemi Date: Wed, 9 Nov 2022 12:49:53 +0200 Subject: [PATCH 1/3] CWB.run_cqp: Optionally report or ignore errors korp/cwb.py: - CWB.run_cqp: Add keyword argument "errors": if "report", report errors at the beginning of the output as lines beginning with "CQP Error:"; if "ignore", ignore errors; if "strict", throw CQPError (the default, as before). --- korp/cwb.py | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/korp/cwb.py b/korp/cwb.py index 831ca54..08ae571 100644 --- a/korp/cwb.py +++ b/korp/cwb.py @@ -22,10 +22,12 @@ def init(self, executable, scan_executable, registry, locale, encoding): self.locale = locale self.encoding = encoding - def run_cqp(self, command, attr_ignore=False, abort_event=None): + def run_cqp(self, command, attr_ignore=False, abort_event=None, errors="strict"): """Call the CQP binary with the given command, and the request data. Yield one result line at the time, disregarding empty lines. - If there is an error, raise a CQPError exception. + If there is an error, raise a CQPError exception, unless the + parameter errors is "ignore" or "report" (report errors at the + beginning of the output as lines beginning with "CQP Error:"). 
""" env = os.environ.copy() env["LC_COLLATE"] = self.locale @@ -57,26 +59,32 @@ def run_cqp(self, command, attr_ignore=False, abort_event=None): continue break - if error: + if error and errors != "ignore": error = error.decode(self.encoding) # Remove newlines from the error string: error = re.sub(r"\s+", r" ", error) - # Keep only the first CQP error (the rest are consequences): - error = re.sub(r"^CQP Error: *", r"", error) - error = re.sub(r" *(CQP Error:).*$", r"", error) - # Ignore certain errors: - # 1) "show +attr" for unknown attr, - # 2) querying unknown structural attribute, - # 3) calculating statistics for empty results - if ( - not (attr_ignore and "No such attribute:" in error) - and "is not defined for corpus" not in error - and "cl->range && cl->size > 0" not in error - and "neither a positional/structural attribute" not in error - and "CL: major error, cannot compose string: invalid UTF8 string passed to cl_string_canonical..." - not in error - ): - raise utils.CQPError(error) + if errors == "report": + # Each error on its own line beginning with "CQP Error:" + error = re.sub(r" +(CQP Error: *)", r"\n\1", error) + for line in error.split("\n"): + yield line + else: + # Keep only the first CQP error (the rest are consequences): + error = re.sub(r"^CQP Error: *", r"", error) + error = re.sub(r" *(CQP Error:).*$", r"", error) + # Ignore certain errors: + # 1) "show +attr" for unknown attr, + # 2) querying unknown structural attribute, + # 3) calculating statistics for empty results + if ( + not (attr_ignore and "No such attribute:" in error) + and "is not defined for corpus" not in error + and "cl->range && cl->size > 0" not in error + and "neither a positional/structural attribute" not in error + and "CL: major error, cannot compose string: invalid UTF8 string passed to cl_string_canonical..." 
+ not in error + ): + raise utils.CQPError(error) for line in reply.decode(self.encoding, errors="ignore").split( "\n"): # We don't use splitlines() since it might split on special characters in the data if line: From a8aab28256966a4eaa43d42939518f458c72f039 Mon Sep 17 00:00:00 2001 From: Jyrki Niemi Date: Thu, 10 Nov 2022 11:49:37 +0200 Subject: [PATCH 2/3] /corpus_info: Optionally report undefined corpora korp/views/info.py: - /corpus_info: If the parameter "report_undefined_corpora" is true, add to the result the item "undefined_corpora" listing the undefined corpora in the parameter "corpus", instead of an error when a corpus is undefined. The configuration variable CHECK_AVAILABLE_CORPORA_STRICTLY controls how strictly the available corpora are checked. --- config.py | 5 +++ korp/views/info.py | 81 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 85 insertions(+), 1 deletion(-) diff --git a/config.py b/config.py index 3ae9dc4..58a9668 100644 --- a/config.py +++ b/config.py @@ -62,6 +62,11 @@ # Set to True to enable "lab mode", potentially enabling experimental features and access to lab-only corpora LAB_MODE = False +# When checking the availability of corpora, if True, try to select +# each corpus in CQP (slower), otherwise only check the files in the +# CWB registry directory (somewhat faster) +CHECK_AVAILABLE_CORPORA_STRICTLY = True + # Plugins to load PLUGINS = [] diff --git a/korp/views/info.py b/korp/views/info.py index bda61ab..2030622 100644 --- a/korp/views/info.py +++ b/korp/views/info.py @@ -1,6 +1,9 @@ +import os +import re import time from flask import Blueprint +from flask import current_app as app from pymemcache.exceptions import MemcacheError import korp @@ -65,10 +68,12 @@ def corpus_info(args, no_combined_cache=False): utils.assert_key("corpus", args, utils.IS_IDENT, True) corpora = utils.parse_corpora(args) + report_undefined_corpora = utils.parse_bool( + args, "report_undefined_corpora", False) # Check if whole query is 
cached if args["cache"]: - checksum_combined = utils.get_hash((sorted(corpora),)) + checksum_combined = utils.get_hash((sorted(corpora), report_undefined_corpora)) save_cache = [] with memcached.get_client() as mc: combined_cache_key = "%s:info_%s" % (utils.cache_prefix(mc), checksum_combined) @@ -87,6 +92,10 @@ def corpus_info(args, no_combined_cache=False): cmd = [] + if report_undefined_corpora: + corpora, undefined_corpora = _filter_undefined_corpora( + corpora, args["cache"], app.config["CHECK_AVAILABLE_CORPORA_STRICTLY"]) + if args["cache"]: with memcached.get_client() as mc: memcached_keys = {} @@ -154,6 +163,9 @@ def corpus_info(args, no_combined_cache=False): result["total_size"] = total_size result["total_sentences"] = total_sentences + if report_undefined_corpora: + result["undefined_corpora"] = undefined_corpora + if args["cache"] and not no_combined_cache: # Cache whole query try: @@ -166,3 +178,70 @@ def corpus_info(args, no_combined_cache=False): result.setdefault("DEBUG", {}) result["DEBUG"]["cache_saved"] = True yield result + + +def _filter_undefined_corpora(corpora, caching=True, strict=True): + """Return a pair of a list of defined and a list of undefined corpora + in the argument corpora. If caching, check if the result is in the + cache; if not, cache the result. If strict, try to select each + corpus in CQP, otherwise only check the files in the CWB registry + directory. + """ + + # Caching + if caching: + checksum_combined = utils.get_hash((corpora, strict)) + save_cache = [] + with memcached.get_client() as mc: + combined_cache_key = ( + "%s:corpora_defined_%s" % (utils.cache_prefix(mc), + checksum_combined)) + result = mc.get(combined_cache_key) + if result: + # Since this is not the result of a command, we cannot + # add debug information on using cache to the result. 
+ return result + # TODO: Add per-corpus caching + + defined = [] + undefined = [] + if strict: + # Stricter: detects corpora that have a registry file but + # whose data makes CQP regard them as undefined when trying to + # use them + cqp = [corpus.upper() + ";" for corpus in corpora] + cqp += ["exit"] + lines = cwb.run_cqp(cqp, errors="report") + for line in lines: + if line.startswith("CQP Error:"): + matchobj = re.match( + r"CQP Error: Corpus ``(.+?)'' is undefined", line) + if matchobj: + undefined.append(str(matchobj.group(1))) + else: + # Skip the rest + break + if undefined: + defined = [corpus for corpus in corpora + if corpus not in set(undefined)] + else: + defined = corpora + else: + # It is somewhat faster but less reliable to check the + # registry only + registry_files = set(os.listdir(cwb.registry)) + defined = [corpus for corpus in corpora + if corpus.lower() in registry_files] + undefined = [corpus for corpus in corpora + if corpus.lower() not in registry_files] + + result = (defined, undefined) + + if caching: + try: + with memcached.get_client() as mc: + saved = mc.add(combined_cache_key, result) + except MemcacheError: + pass + + return result From 91de00a9cdccef7529ab47ed4d85967ccc765538 Mon Sep 17 00:00:00 2001 From: Jyrki Niemi Date: Thu, 10 Nov 2022 12:39:10 +0200 Subject: [PATCH 3/3] /info: strict=true to filter out undefined corpora korp/views/info.py: - /info: If the parameter "strict" is true, filter out undefined (inaccessible) corpora from the result. 
--- korp/views/info.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/korp/views/info.py b/korp/views/info.py index 2030622..b4f41ab 100644 --- a/korp/views/info.py +++ b/korp/views/info.py @@ -29,9 +29,10 @@ def sleep(args): @utils.main_handler def info(args): """Get version information about list of available corpora.""" + strict = utils.parse_bool(args, "strict", False) if args["cache"]: with memcached.get_client() as mc: - result = mc.get("%s:info" % utils.cache_prefix(mc)) + result = mc.get("%s:info_%s" % (utils.cache_prefix(mc), int(strict))) if result: if "debug" in args: result.setdefault("DEBUG", {}) @@ -41,6 +42,14 @@ def info(args): corpora = cwb.run_cqp("show corpora;") version = next(corpora) + # CQP "show corpora" lists all corpora in the registry, but some + # of them might nevertheless cause a "corpus undefined" error in + # CQP, for example, because of missing data, so filter them out if + # strict=true. However, filtering a large number of corpora slows + # down the info command, so it is disabled by default. Caching in + # _filter_undefined_corpora helps, though. + if strict: + corpora, _ = _filter_undefined_corpora(list(corpora), args["cache"]) protected = utils.get_protected_corpora() @@ -53,7 +62,7 @@ def info(args): if args["cache"]: with memcached.get_client() as mc: - added = mc.add("%s:info" % utils.cache_prefix(mc), result) + added = mc.add("%s:info_%s" % (utils.cache_prefix(mc), int(strict)), result) if added and "debug" in args: result.setdefault("DEBUG", {}) result["DEBUG"]["cache_saved"] = True