From d2ba48e589914ff4b3a03f191697d0aaf59ac3f3 Mon Sep 17 00:00:00 2001 From: ampli Date: Wed, 15 May 2024 17:01:33 +0300 Subject: [PATCH 01/11] tracon-set.c place_found(): return true at the end The second invocation of connector_list_equal(slot->clist, c) is not needed, because at this point we know that its result was "true" 2 lines before. So replace it by "true". This is only a very slight speedup, if any, because the compiler (in -O3) knows it is a "pure" function and thus returns its previous result. --- link-grammar/tracon-set.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/link-grammar/tracon-set.c b/link-grammar/tracon-set.c index bac1696ed..ef530b670 100644 --- a/link-grammar/tracon-set.c +++ b/link-grammar/tracon-set.c @@ -170,7 +170,7 @@ static bool place_found(const Connector *c, const clist_slot *slot, if (hash != slot->hash) return false; if (!connector_list_equal(slot->clist, c)) return false; if (ss->shallow && (slot->clist->shallow != c->shallow)) return false; - return connector_list_equal(slot->clist, c); + return true; } /** From 1d2a857ecdaeabb9409500445c00840cf237203f Mon Sep 17 00:00:00 2001 From: ampli Date: Thu, 9 May 2024 14:30:30 +0300 Subject: [PATCH 02/11] connectors.h: Move up the definition of UNLIMITED_LEN --- link-grammar/connectors.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/link-grammar/connectors.h b/link-grammar/connectors.h index 5b4884032..19efc40b0 100644 --- a/link-grammar/connectors.h +++ b/link-grammar/connectors.h @@ -31,6 +31,9 @@ */ #define MAX_SENTENCE 254 /* Maximum number of words in a sentence */ +/* Length-limits for how far connectors can reach out. */ +#define UNLIMITED_LEN 255 + /* Since tracon IDs are unique per sentence, for convenience NULL * connectors (zero-length tracons) have tracon IDs equal to the word * number on which their disjunct resides. To that end an initial block @@ -100,9 +103,6 @@ struct condesc_struct }; typedef struct condesc_struct condesc_t; -/* Length-limits for how far connectors can reach out. */ -#define UNLIMITED_LEN 255 - typedef struct length_limit_def { const char *defword; From 77b1a5cdbd5130b440b289c010775c5b889d5589 Mon Sep 17 00:00:00 2001 From: ampli Date: Thu, 9 May 2024 14:32:15 +0300 Subject: [PATCH 03/11] connectors.h: Move up the definition of hdesc --- link-grammar/connectors.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/link-grammar/connectors.h b/link-grammar/connectors.h index 19efc40b0..79c752c33 100644 --- a/link-grammar/connectors.h +++ b/link-grammar/connectors.h @@ -79,6 +79,14 @@ static inline bool is_connector_subscript_char(unsigned char c) } /* End of connector string character validation. */ +typedef struct condesc_struct condesc_t; + +typedef struct hdesc +{ + condesc_t *desc; + connector_uc_hash_t str_hash; +} hdesc_t; + /* Note: If more byte-size fields are needed, to save space * uc_length and uc_start may be moved to struct hdesc. */ struct condesc_struct @@ -101,7 +109,6 @@ struct condesc_struct uint8_t uc_length; /* uc part length */ uint8_t uc_start; /* uc start position */ }; -typedef struct condesc_struct condesc_t; typedef struct length_limit_def { @@ -111,12 +118,6 @@ typedef struct length_limit_def int length_limit; } length_limit_def_t; -typedef struct hdesc -{ - condesc_t *desc; - connector_uc_hash_t str_hash; -} hdesc_t; - typedef struct { hdesc_t *hdesc; /* Hashed connector descriptors table */ From d421b769798df26bcde2f867c17f26b2f6fb1d30 Mon Sep 17 00:00:00 2001 From: ampli Date: Fri, 10 May 2024 04:16:14 +0300 Subject: [PATCH 04/11] link-generator.c: use_prompt(): Rename parameter to verbosity_setting Due to changes in include files I got a compiler warning that it shadows the global verbosity variable. --- link-parser/link-generator.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/link-parser/link-generator.c b/link-parser/link-generator.c index 350cbf409..c829d7c84 100644 --- a/link-parser/link-generator.c +++ b/link-parser/link-generator.c @@ -35,9 +35,9 @@ int verbosity_level = 1; static const char prompt[] = "linkgenerator> "; -static const char *use_prompt(int verbosity) +static const char *use_prompt(int verbosity_setting) { - return (0 == verbosity)? "" : prompt; + return (0 == verbosity_setting)? "" : prompt; } /* Argument parsing for the generator */ From 2bebbdb6ef79deb1ceab9daa98614dae616c8b60 Mon Sep 17 00:00:00 2001 From: ampli Date: Fri, 10 May 2024 05:48:06 +0300 Subject: [PATCH 05/11] condesc_t: Enumerate and move some fields to hdesc_t Keep condesc_t at 32 bytes after adding con_num, which will be used in connector hash and might later be used for radix sort. --- link-grammar/connectors.c | 63 +++++++++++---------- link-grammar/connectors.h | 44 +++++++++----- link-grammar/dict-atomese/lookup-atomese.cc | 12 ++-- link-grammar/dict-atomese/word-pairs.cc | 2 +- link-grammar/dict-common/dict-locale.c | 4 +- link-grammar/dict-common/dict-utils.c | 4 +- link-grammar/dict-common/print-dict.c | 10 ++-- link-grammar/dict-file/dictionary.c | 2 +- link-grammar/linkage/analyze-linkage.c | 12 ++-- link-grammar/parse/parse.c | 4 +- link-grammar/parse/prune.c | 6 +- link-grammar/prepare/build-disjuncts.c | 2 +- link-grammar/sat-solver/sat-encoder.cpp | 4 +- link-grammar/sat-solver/word-tag.hpp | 2 +- 14 files changed, 96 insertions(+), 75 deletions(-) diff --git a/link-grammar/connectors.c b/link-grammar/connectors.c index ed8852042..ff5122f3b 100644 --- a/link-grammar/connectors.c +++ b/link-grammar/connectors.c @@ -46,7 +46,7 @@ static unsigned int get_connector_length_limit(condesc_t *cd, int short_len = opts->short_length; bool all_short = opts->all_short; - int length_limit = cd->length_limit; + int length_limit = cd->more->length_limit; if ((all_short && (length_limit > short_len)) || (0 == length_limit)) return short_len; @@ -165,7 +165,7 @@ static void set_condesc_length_limit(Dictionary dict, const Exp *e, int length_l if (econlist[en]->uc_num != sdesc[cn]->uc_num) continue; restart_cn = cn; - const char *wc_str = econlist[en]->string; + const char *wc_str = econlist[en]->more->string; char *uc_wildcard = strchr(wc_str, LENGTH_LINIT_WILD_TYPE); for (; cn < ct->num_con; cn++) @@ -181,11 +181,11 @@ static void set_condesc_length_limit(Dictionary dict, const Exp *e, int length_l else { /* The uppercase part is a prefix. */ - if (0 != strncmp(wc_str, sdesc[cn]->string, uc_wildcard - wc_str)) + if (0 != strncmp(wc_str, sdesc[cn]->more->string, uc_wildcard - wc_str)) break; } - sdesc[cn]->length_limit = length_limit; + sdesc[cn]->more->length_limit = length_limit; } } } @@ -221,8 +221,8 @@ static void set_all_condesc_length_limit(Dictionary dict) for (size_t en = 0; en < ct->num_con; en++) { - if (0 == sdesc[en]->length_limit) - sdesc[en]->length_limit = UNLIMITED_LEN; + if (0 == sdesc[en]->more->length_limit) + sdesc[en]->more->length_limit = UNLIMITED_LEN; } } @@ -234,7 +234,7 @@ static void set_all_condesc_length_limit(Dictionary dict) for (size_t n = 0; n < ct->num_con; n++) { prt_error("%5zu %6u %3d %s\n\\", n, ct->sdesc[n]->uc_num, - ct->sdesc[n]->length_limit, ct->sdesc[n]->string); + ct->sdesc[n]->more->length_limit, ct->sdesc[n]->more->string); } prt_error("\n"); } @@ -277,8 +277,8 @@ static void connector_encode_lc(const char *lc_string, condesc_t *desc) lc_string, (int)(s-lc_string), MAX_CONNECTOR_LC_LENGTH); } - desc->lc_mask = (lc_mask << 1) + !!(desc->flags & CD_HEAD_DEPENDENT); - desc->lc_letters = (lc_value << 1) + !!(desc->flags & CD_HEAD); + desc->lc_mask = (lc_mask << 1) + !!(desc->more->flags & CD_HEAD_DEPENDENT); + desc->lc_letters = (lc_value << 1) + !!(desc->more->flags & CD_HEAD); } /** @@ -288,27 +288,27 @@ static void connector_encode_lc(const char *lc_string, condesc_t *desc) * * Note: check_connector() has already validated the connector string. */ -void calculate_connector_info(condesc_t * c) +void calculate_connector_info(hdesc_t *hdesc) { const char *s; - s = c->string; + s = hdesc->string; if (islower((unsigned char)*s)) { - dassert((c->string[0] == 'h') || (c->string[0] == 'd'), - "\'%c\': Bad head/dependent character", c->string[0]); + dassert((hdesc->string[0] == 'h') || (hdesc->string[0] == 'd'), + "\'%hdesc\': Bad head/dependent character", hdesc->string[0]); - if ((*s == 'h') || (*s == 'd')) c->flags |= CD_HEAD_DEPENDENT; - if (*s == 'h') c->flags |= CD_HEAD; + if ((*s == 'h') || (*s == 'd')) hdesc->flags |= CD_HEAD_DEPENDENT; + if (*s == 'h') hdesc->flags |= CD_HEAD; s++; /* Ignore head-dependent indicator. */ } - c->uc_start = (uint8_t)(s - c->string); + hdesc->uc_start = (uint8_t)(s - hdesc->string); /* Skip the uppercase part. */ do { s++; } while (is_connector_name_char(*s)); - c->uc_length = (uint8_t)(s - c->string - c->uc_start); + hdesc->uc_length = (uint8_t)(s - hdesc->string - hdesc->uc_start); - connector_encode_lc(s, c); + connector_encode_lc(s, hdesc->desc); } /* ================= Connector descriptor table. ====================== */ @@ -373,11 +373,11 @@ int condesc_by_uc_constring(const void * a, const void * b) if (NULL == *cda) return (NULL != *cdb); if (NULL == *cdb) return -1; - const char *sa = &(*cda)->string[(*cda)->uc_start]; - const char *sb = &(*cdb)->string[(*cdb)->uc_start]; + const char *sa = &(*cda)->more->string[(*cda)->more->uc_start]; + const char *sb = &(*cdb)->more->string[(*cdb)->more->uc_start]; - int la = (*cda)->uc_length; - int lb = (*cdb)->uc_length; + int la = (*cda)->more->uc_length; + int lb = (*cdb)->more->uc_length; if (la == lb) { @@ -424,7 +424,7 @@ static bool sort_condesc_by_uc_constring(Dictionary dict) condesc_t *condesc = dict->contable.hdesc[n].desc; if (NULL == condesc) continue; - calculate_connector_info(condesc); + calculate_connector_info(&dict->contable.hdesc[n]); sdesc[i++] = dict->contable.hdesc[n].desc; } @@ -439,7 +439,7 @@ static bool sort_condesc_by_uc_constring(Dictionary dict) { condesc_t **condesc = &sdesc[n]; - if (condesc[0]->uc_length != condesc[-1]->uc_length) + if (condesc[0]->more->uc_length != condesc[-1]->more->uc_length) { /* We know that the UC part has been changed. */ @@ -447,9 +447,9 @@ static bool sort_condesc_by_uc_constring(Dictionary dict) } else { - const char *uc1 = &condesc[0]->string[condesc[0]->uc_start]; - const char *uc2 = &condesc[-1]->string[condesc[-1]->uc_start]; - if (0 != strncmp(uc1, uc2, condesc[0]->uc_length)) + const char *uc1 = &condesc[0]->more->string[condesc[0]->more->uc_start]; + const char *uc2 = &condesc[-1]->more->string[condesc[-1]->more->uc_start]; + if (0 != strncmp(uc1, uc2, condesc[0]->more->uc_length)) { uc_num++; } @@ -495,7 +495,7 @@ static hdesc_t *condesc_find(ConTable *ct, const char *constring, uint32_t hash) uint32_t i = hash & (ct->size-1); while ((NULL != ct->hdesc[i].desc) && - !string_set_cmp(constring, ct->hdesc[i].desc->string)) + !string_set_cmp(constring, ct->hdesc[i].string)) { i = (i + 1) & (ct->size-1); } @@ -524,7 +524,7 @@ static bool condesc_grow(ConTable *ct) { hdesc_t *old_h = &old_hdesc[i]; if (NULL == old_h->desc) continue; - hdesc_t *new_h = condesc_find(ct, old_h->desc->string, old_h->str_hash); + hdesc_t *new_h = condesc_find(ct, old_h->string, old_h->str_hash); if (NULL != new_h->desc) { @@ -533,6 +533,7 @@ static bool condesc_grow(ConTable *ct) return false; } *new_h = *old_h; + new_h->desc->more = new_h; } free(old_hdesc); @@ -548,9 +549,11 @@ condesc_t *condesc_add(ConTable *ct, const char *constring) { lgdebug(+11, "Creating connector '%s' (%zu)\n", constring, ct->num_con); h->desc = pool_alloc(ct->mempool); - h->desc->string = constring; + h->string = constring; h->desc->uc_num = UINT32_MAX; h->str_hash = hash; + h->desc->more = h; + h->desc->con_num = ct->num_con; ct->num_con++; if ((8 * ct->num_con) > (3 * ct->size)) diff --git a/link-grammar/connectors.h b/link-grammar/connectors.h index 79c752c33..61c825ab3 100644 --- a/link-grammar/connectors.h +++ b/link-grammar/connectors.h @@ -81,22 +81,14 @@ static inline bool is_connector_subscript_char(unsigned char c) typedef struct condesc_struct condesc_t; +/* The connector descriptors (see below) are pointed from a hash table + * with these elements. */ typedef struct hdesc { condesc_t *desc; - connector_uc_hash_t str_hash; -} hdesc_t; - -/* Note: If more byte-size fields are needed, to save space - * uc_length and uc_start may be moved to struct hdesc. */ -struct condesc_struct -{ - lc_enc_t lc_letters; - lc_enc_t lc_mask; - const char *string; /* The connector name w/o the direction mark, e.g. AB */ - // float *cost; /* Array of cost by connector length (cost[0]: default) */ - connector_uc_hash_t uc_num; /* uc part enumeration. */ + // float *cost; // Array of cost by connector length (cost[0]: default) + connector_uc_hash_t str_hash; uint8_t length_limit; /* If not 0, it gives the limit of the length of the * link that can be used on this connector type. The * value UNLIMITED_LEN specifies no limit. @@ -108,6 +100,23 @@ struct condesc_struct /* For connector match speedup when sorting the connector table. */ uint8_t uc_length; /* uc part length */ uint8_t uc_start; /* uc start position */ +} hdesc_t; + +/* Each connector type has a connector descriptor. The size of this + * struct is 32 byes, to facilitate CPU memory caching during parsing. + * The "more" field points to connector information that is needed in + * other steps. The con_num field is used a lot in steps that need + * connector hashing, and it is included here to avoid extra + * redirections. + * Multi connectors are considering the same type as their non-multi + * version, so the multi indication is kept in Connector_struct. */ +struct condesc_struct +{ + lc_enc_t lc_letters; + lc_enc_t lc_mask; + hdesc_t *more; /* More information, for keeping small struct size. */ + connector_uc_hash_t uc_num; /* uc part enumeration. */ + uint32_t con_num; /* Connector ordinal number. */ }; typedef struct length_limit_def @@ -177,12 +186,17 @@ void condesc_reuse(Dictionary); * accesses connectors */ static inline const char * connector_string(const Connector *c) { - return c->desc->string; + return c->desc->more->string; } static inline unsigned int connector_uc_start(const Connector *c) { - return c->desc->uc_start; + return c->desc->more->uc_start; +} + +static inline unsigned int connector_uc_length(const Connector *c) +{ + return c->desc->more->uc_length; } static inline const condesc_t *connector_desc(const Connector *c) @@ -205,7 +219,7 @@ static inline unsigned int connector_uc_num(const Connector * c) Connector * connector_new(Pool_desc *, const condesc_t *); void set_connector_farthest_word(Exp *, int, int, Parse_Options); void free_connectors(Connector *); -void calculate_connector_info(condesc_t *); +void calculate_connector_info(hdesc_t *); int condesc_by_uc_constring(const void *, const void *); /** diff --git a/link-grammar/dict-atomese/lookup-atomese.cc b/link-grammar/dict-atomese/lookup-atomese.cc index fd055a3a8..ff43056f0 100644 --- a/link-grammar/dict-atomese/lookup-atomese.cc +++ b/link-grammar/dict-atomese/lookup-atomese.cc @@ -422,8 +422,8 @@ static void update_condesc(Dictionary dict) if (NULL == condesc) continue; if (UINT32_MAX != condesc->uc_num) continue; - calculate_connector_info(condesc); - condesc->length_limit = UNLIMITED_LEN; + calculate_connector_info(&ct->hdesc[n]); + condesc->more->length_limit = UNLIMITED_LEN; sdesc[i++] = condesc; } @@ -438,16 +438,16 @@ static void update_condesc(Dictionary dict) { condesc_t **condesc = &sdesc[n]; - if (condesc[0]->uc_length != condesc[-1]->uc_length) + if (condesc[0]->more->uc_length != condesc[-1]->more->uc_length) { /* We know that the UC part has been changed. */ uc_num++; } else { - const char *uc1 = &condesc[0]->string[condesc[0]->uc_start]; - const char *uc2 = &condesc[-1]->string[condesc[-1]->uc_start]; - if (0 != strncmp(uc1, uc2, condesc[0]->uc_length)) + const char *uc1 = &condesc[0]->more->string[condesc[0]->more->uc_start]; + const char *uc2 = &condesc[-1]->more->string[condesc[-1]->more->uc_start]; + if (0 != strncmp(uc1, uc2, condesc[0]->more->uc_length)) { uc_num++; } diff --git a/link-grammar/dict-atomese/word-pairs.cc b/link-grammar/dict-atomese/word-pairs.cc index 1a1f32b50..55d99b2b5 100644 --- a/link-grammar/dict-atomese/word-pairs.cc +++ b/link-grammar/dict-atomese/word-pairs.cc @@ -385,7 +385,7 @@ static Exp* get_sent_pair_exprs(Dictionary dict, const Handle& germ, { assert(CONNECTOR_type == orch->type, "unexpected expression!"); - if (links.end() != links.find(orch->condesc->string)) + if (links.end() != links.find(orch->condesc->more->string)) { Exp* cpe = (Exp*) pool_alloc(pool); *cpe = *orch; diff --git a/link-grammar/dict-common/dict-locale.c b/link-grammar/dict-common/dict-locale.c index 0c8ea862a..ad7a6aea0 100644 --- a/link-grammar/dict-common/dict-locale.c +++ b/link-grammar/dict-common/dict-locale.c @@ -204,7 +204,7 @@ const char * linkgrammar_get_dict_locale(Dictionary dict) } else { - locale = dn->exp->condesc->string; + locale = dn->exp->condesc->more->string; } } @@ -318,7 +318,7 @@ const char * linkgrammar_get_dict_version(Dictionary dict) if (NULL == dn) return "[unknown]"; e = dn->exp; - ver = strdup(&e->condesc->string[1]); + ver = strdup(&e->condesc->more->string[1]); p = strchr(ver, 'v'); while (p) { diff --git a/link-grammar/dict-common/dict-utils.c b/link-grammar/dict-common/dict-utils.c index 9fa6b6f66..d1b5aca11 100644 --- a/link-grammar/dict-common/dict-utils.c +++ b/link-grammar/dict-common/dict-utils.c @@ -94,7 +94,7 @@ static Exp *create_external_exp(const Exp *e, Exp **exp_mem, Parse_Options opts) const char * lg_exp_get_string(const Exp* exp) { - return exp->condesc->string; + return exp->condesc->more->string; } /** @@ -273,7 +273,7 @@ static bool exp_has_connector(const Exp * e, int depth, if (e->type == CONNECTOR_type) { if (direction != e->dir) return false; - return string_set_cmp(e->condesc->string, cs); + return string_set_cmp(e->condesc->more->string, cs); } if (depth == 0) return false; diff --git a/link-grammar/dict-common/print-dict.c b/link-grammar/dict-common/print-dict.c index 7a9c7042c..dc898f62c 100644 --- a/link-grammar/dict-common/print-dict.c +++ b/link-grammar/dict-common/print-dict.c @@ -194,7 +194,7 @@ static void print_expression_parens(Dictionary dict, dyn_str *e, const Exp *n, if (n->type == CONNECTOR_type) { if (n->multi) dyn_strcat(e, "@"); - dyn_strcat(e, n->condesc ? n->condesc->string : "error-null-connector"); + dyn_strcat(e, n->condesc ? n->condesc->more->string : "error-null-connector"); dyn_strcat(e, (const char []){ n->dir, '\0' }); } else if (is_expression_optional(n)) @@ -257,7 +257,7 @@ static bool exp_contains_connector(const Exp *e, int *pos, int find_pos) { #if 0 printf("exp_contains_connector: pos=%d C=%s%s%c %s\n", - *pos,e->multi?"@":"",e->condesc->string,e->dir, + *pos,e->multi?"@":"",e->condesc->more->string,e->dir, (find_pos == *pos) ? "FOUND" : ""); #endif return (find_pos == (*pos)++); @@ -317,7 +317,7 @@ static void print_connector_macros(cmacro_context *cmc, const Exp *n) cmc->is_after_connector = true; if (n->multi) dyn_strcat(cmc->e, "@"); dyn_strcat(cmc->e, - n->condesc ? n->condesc->string : "error-null-connector"); + n->condesc ? n->condesc->more->string : "error-null-connector"); dyn_strcat(cmc->e, (const char []){ n->dir, '\0' }); cmc->find_pos++; /* each expression position is used only once */ } @@ -388,7 +388,7 @@ GNUC_UNUSED void prt_exp(Exp *e, int i) else { for(int j =0; jcondesc->string); + printf("con=%s\n", e->condesc->more->string); } } #endif @@ -503,7 +503,7 @@ static void prt_exp_all(dyn_str *s, Exp *e, int i, Dictionary dict) { append_string(s, " %s%s%c cost=%s%s\n", e->multi ? "@" : "", - e->condesc ? e->condesc->string : "(condesc=(null))", + e->condesc ? e->condesc->more->string : "(condesc=(null))", e->dir, cost_stringify(e->cost), stringify_Exp_tag(e, dict)); } diff --git a/link-grammar/dict-file/dictionary.c b/link-grammar/dict-file/dictionary.c index e2252d400..f27e3940b 100644 --- a/link-grammar/dict-file/dictionary.c +++ b/link-grammar/dict-file/dictionary.c @@ -47,7 +47,7 @@ static const char * word_only_connector(Dict_node * dn) { Exp * e = dn->exp; if (CONNECTOR_type == e->type) - return e->condesc->string; + return e->condesc->more->string; return NULL; } diff --git a/link-grammar/linkage/analyze-linkage.c b/link-grammar/linkage/analyze-linkage.c index 94ec7b526..39a763c18 100644 --- a/link-grammar/linkage/analyze-linkage.c +++ b/link-grammar/linkage/analyze-linkage.c @@ -46,12 +46,12 @@ const char *intersect_strings(String_set *sset, const Connector *c1, lc_enc_t lc_label = lc1_letters | lc2_letters; /* This catches ~95% of the cases (it would work without this). */ - if (lc_label == lc1_letters) return &connector_string(c1)[d1->uc_start]; - if (lc_label == lc2_letters) return &connector_string(c2)[d2->uc_start]; + if (lc_label == lc1_letters) return &connector_string(c1)[d1->more->uc_start]; + if (lc_label == lc2_letters) return &connector_string(c2)[d2->more->uc_start]; - memcpy(l, &connector_string(c1)[d1->uc_start], d1->uc_length); + memcpy(l, &connector_string(c1)[d1->more->uc_start], d1->more->uc_length); - for (size_t i = d1->uc_length; /* see note below */; i++) + for (size_t i = d1->more->uc_length; /* see note below */; i++) { l[i] = lc_label & LC_MASK; if (l[i] == '\0') l[i] = '*'; @@ -67,8 +67,8 @@ const char *intersect_strings(String_set *sset, const Connector *c1, * So after MAX_CONNECTOR_LC_LENGTH shifts lc_label must be 0. */ #ifdef DEBUG - const char *s1 = &connector_string(c1)[d1->uc_start]; - const char *s2 = &connector_string(c1)[d1->uc_start]; + const char *s1 = &connector_string(c1)[d1->more->uc_start]; + const char *s2 = &connector_string(c1)[d1->more->uc_start]; do { assert(is_connector_name_char(*s1) == is_connector_name_char(*s2), diff --git a/link-grammar/parse/parse.c b/link-grammar/parse/parse.c index 16a3d5208..b5231108f 100644 --- a/link-grammar/parse/parse.c +++ b/link-grammar/parse/parse.c @@ -364,7 +364,7 @@ static int linkage_equiv_p(Linkage lpv, Linkage lnx) if (plk->lc != nlk->lc) { if (plk->lc->desc != nlk->lc->desc) - return strcmp(plk->lc->desc->string, nlk->lc->desc->string); + return strcmp(connector_string(plk->lc), connector_string(nlk->lc)); int md = plk->lc->multi - nlk->lc->multi; if (md) return md; @@ -372,7 +372,7 @@ static int linkage_equiv_p(Linkage lpv, Linkage lnx) if (plk->rc != nlk->rc) { if (plk->rc->desc != nlk->rc->desc) - return strcmp(plk->rc->desc->string, nlk->rc->desc->string); + return strcmp(connector_string(plk->rc), connector_string(nlk->rc)); int md = plk->rc->multi - nlk->rc->multi; if (md) return md; diff --git a/link-grammar/parse/prune.c b/link-grammar/parse/prune.c index 1ec8e9950..2bbe83e6a 100644 --- a/link-grammar/parse/prune.c +++ b/link-grammar/parse/prune.c @@ -448,10 +448,14 @@ static void clean_table(unsigned int size, C_list **t) { /* Table entry tombstone. */ #define UC_NUM_TOMBSTONE ((connector_uc_hash_t)-1) - static condesc_t desc_no_match = + static hdesc_t hdesc_no_match = { .string = "TOMBSTONE", + }; + static condesc_t desc_no_match = + { .uc_num = UC_NUM_TOMBSTONE, /* get_power_table_entry() will skip. */ + .more = &hdesc_no_match }; static Connector con_no_match = { diff --git a/link-grammar/prepare/build-disjuncts.c b/link-grammar/prepare/build-disjuncts.c index 45ac3db33..75372a2da 100644 --- a/link-grammar/prepare/build-disjuncts.c +++ b/link-grammar/prepare/build-disjuncts.c @@ -391,7 +391,7 @@ static void print_Tconnector_list(Tconnector *t) for (; t != NULL; t = t->next) { if (t->e->multi) printf("@"); - printf("%s", t->e->condesc->string); + printf("%s", t->e->condesc->more->string); printf("%c", t->e->dir); if (t->next != NULL) printf(" "); } diff --git a/link-grammar/sat-solver/sat-encoder.cpp b/link-grammar/sat-solver/sat-encoder.cpp index d1907d268..6159e2acd 100644 --- a/link-grammar/sat-solver/sat-encoder.cpp +++ b/link-grammar/sat-solver/sat-encoder.cpp @@ -514,7 +514,7 @@ Exp* SATEncoder::join_alternatives(int w) void SATEncoder::generate_link_cw_ordinary_definition(size_t wi, int pi, Exp* e, size_t wj) { - const char* Ci = e->condesc->string; + const char* Ci = e->condesc->more->string; char dir = e->dir; double cost = e->cost; Lit lhs = Lit(_variables->link_cw(wj, wi, pi, Ci)); @@ -1684,7 +1684,7 @@ void SATEncoderConjunctionFreeSentences::determine_satisfaction(int w, char* nam void SATEncoderConjunctionFreeSentences::generate_satisfaction_for_connector( int wi, int pi, Exp *e, char* var) { - const char* Ci = e->condesc->string; + const char* Ci = e->condesc->more->string; char dir = e->dir; bool multi = e->multi; double cost = e->cost; diff --git a/link-grammar/sat-solver/word-tag.hpp b/link-grammar/sat-solver/word-tag.hpp index a033f4278..cde423067 100644 --- a/link-grammar/sat-solver/word-tag.hpp +++ b/link-grammar/sat-solver/word-tag.hpp @@ -28,7 +28,7 @@ struct PositionConnector eps_right(er), eps_left(el), word_xnode(w_xnode) { if (word_xnode == NULL) { - cerr << "Internal error: Word" << w << ": " << "; connector: '" << e->condesc->string << "'; X_node: " << (word_xnode?word_xnode->string: "(null)") << endl; + cerr << "Internal error: Word" << w << ": " << "; connector: '" << e->condesc->more->string << "'; X_node: " << (word_xnode?word_xnode->string: "(null)") << endl; } // Initialize some fields in the connector struct. From 944a3f0936d986ce3fb8848c583ab6d549e89e2b Mon Sep 17 00:00:00 2001 From: ampli Date: Thu, 9 May 2024 00:14:16 +0300 Subject: [PATCH 06/11] connector_hash(): Use the connector descriptor enumeration --- link-grammar/connectors.h | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/link-grammar/connectors.h b/link-grammar/connectors.h index 61c825ab3..a65b576bb 100644 --- a/link-grammar/connectors.h +++ b/link-grammar/connectors.h @@ -214,6 +214,10 @@ static inline unsigned int connector_uc_num(const Connector * c) return c->desc->uc_num; } +static inline unsigned int connector_num(const Connector * c) +{ + return 2 * c->desc->con_num + c->multi; +} /* Connector utilities ... */ Connector * connector_new(Pool_desc *, const condesc_t *); @@ -328,25 +332,11 @@ typedef uint32_t connector_hash_t; static inline connector_hash_t connector_hash(const Connector *c) { - // The use of (c->desc->lc_mask & 1) during hashing is important; - // See pull req #1487 for details. This raises other questions - // about hashing. Two forms are attempted below. They appear to - // be equivalent, in terms of measured elapsed-time performance. - // (I did not look at the quality of the distribution.) - // The second form uses some mixing bitshifts: - // 266281 == sum of 1 8 32 4096 (256*1024) It is a prime number - // 524429 == sum of 1 4 8 128 (512*1024) and it is a prime number -#ifdef SIMPLE_HASH - return c->desc->uc_num + - (c->multi << 19) + - (((connector_hash_t)c->desc->lc_mask & 1) << 20) + - (connector_hash_t)c->desc->lc_letters; -#else - return c->desc->uc_num + - c->multi * 266281 + - (((connector_hash_t)c->desc->lc_mask & 1) * 524429) + - ((connector_hash_t)c->desc->lc_letters) * 101; -#endif + // connector_num() is different for each connector string + its multi + // attribute, and it naturally depends also on its head-dependent + // attribute, if any. For the importance of considering the + // head-dependent attribute during hashing see pull req #1487; + return connector_num(c); } /** From 8840a37efdeaaa7acbea4fa5f2f2d184f2990e7a Mon Sep 17 00:00:00 2001 From: ampli Date: Tue, 14 May 2024 14:18:30 +0300 Subject: [PATCH 07/11] connector_list_hash(): Better hash --- link-grammar/connectors.h | 8 ++++++-- link-grammar/disjunct-utils.c | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/link-grammar/connectors.h b/link-grammar/connectors.h index a65b576bb..0a0177c47 100644 --- a/link-grammar/connectors.h +++ b/link-grammar/connectors.h @@ -342,14 +342,18 @@ static inline connector_hash_t connector_hash(const Connector *c) /** * \p c is assumed to be non-NULL. */ +// To check hash functions, enable the "N" printing in +// eliminate_duplicate_disjuncts(). +#define FEEDBACK_HASH 1 static inline connector_hash_t connector_list_hash(const Connector *c) { connector_hash_t accum = connector_hash(c); for (c = c->next; c != NULL; c = c->next) -#ifdef FEEDBACK_HASH - accum = (19 * accum) + (accum >> 24) + connector_hash(c); +#if FEEDBACK_HASH + accum = (accum<<6) + (accum<<16) + (accum >> 16) - connector_hash(c); #else + // Bad. accum = (19 * accum) + connector_hash(c); #endif diff --git a/link-grammar/disjunct-utils.c b/link-grammar/disjunct-utils.c index eb6fb906b..0235dac00 100644 --- a/link-grammar/disjunct-utils.c +++ b/link-grammar/disjunct-utils.c @@ -363,6 +363,7 @@ unsigned int eliminate_duplicate_disjuncts(Disjunct *dw, bool multi_string) { if (d->dup_hash != dx->dup_hash) continue; if (disjuncts_equal(dx, d, multi_string)) break; + //fprintf(stderr, "N"); // The same hash but a different disjunct. } if (dx != NULL) From 8f35c47d42fe45e06c72d03004a8d3c1db2fbc17 Mon Sep 17 00:00:00 2001 From: ampli Date: Fri, 17 May 2024 01:34:17 +0300 Subject: [PATCH 08/11] eliminate_duplicate_disjuncts(): Add collision debug stats --- link-grammar/disjunct-utils.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/link-grammar/disjunct-utils.c b/link-grammar/disjunct-utils.c index 0235dac00..3901d4d70 100644 --- a/link-grammar/disjunct-utils.c +++ b/link-grammar/disjunct-utils.c @@ -333,6 +333,7 @@ static void disjunct_dup_table_delete(disjunct_dup_table *dt) free(dt); } +#define DEDUP_DEBUG 0 /** * Takes the list of disjuncts pointed to by dw, eliminates all * duplicates. The elimination is done in-place. Because the first @@ -354,6 +355,9 @@ unsigned int eliminate_duplicate_disjuncts(Disjunct *dw, bool multi_string) dt = disjunct_dup_table_new(next_power_of_two_up(2 * count_disjuncts(dw))); +#if DEDUP_DEBUG + unsigned int coll = 0; +#endif for (Disjunct *d = dw; d != NULL; d = d->next) { Disjunct *dx; @@ -402,12 +406,28 @@ unsigned int eliminate_duplicate_disjuncts(Disjunct *dw, bool multi_string) } else { +#if DEDUP_DEBUG + if (dt->dup_table[h]) coll++; +#endif d->dup_table_next = dt->dup_table[h]; dt->dup_table[h] = d; prev = d; } } +#if DEDUP_DEBUG +#if 1 + // For particular words only. + unsigned int pw[] = { 2, 7, 12, 22, 34, 46 , 0}; + for (int i = 0; pw[i] != 0; i++) + if (dw->originating_gword->o_gword->sent_wordidx == pw[i]) +#endif + { + fprintf(stderr, "edd: %.2f%% coll %u/%u\n", + 100.f * coll / count_disjuncts(dw), coll, count_disjuncts(dw)); + } +#endif + lgdebug(+D_DISJ+(0==count)*1024, "w%zu: Killed %u duplicates%s\n", dw->originating_gword == NULL ? 0 : dw->originating_gword->o_gword->sent_wordidx, count, From d325e3ee9eae7dd161b61b16509a25dbafaa77eb Mon Sep 17 00:00:00 2001 From: ampli Date: Fri, 17 May 2024 01:42:22 +0300 Subject: [PATCH 09/11] eliminate_duplicate_disjuncts(), tracon-set.c: Use Fibonacci Hashing Knuth's Method / Fibonacci Hashing --- link-grammar/connectors.h | 3 ++- link-grammar/disjunct-utils.c | 10 +++++++--- link-grammar/tracon-set.c | 2 +- link-grammar/utilities.h | 12 ++++++++++++ 4 files changed, 22 insertions(+), 5 deletions(-) diff --git a/link-grammar/connectors.h b/link-grammar/connectors.h index 0a0177c47..4b534f16f 100644 --- a/link-grammar/connectors.h +++ b/link-grammar/connectors.h @@ -329,6 +329,7 @@ static inline uint32_t string_hash(const char *s) } typedef uint32_t connector_hash_t; +static const connector_hash_t FIBONACCI_MULT = 0x9E3779B9; static inline connector_hash_t connector_hash(const Connector *c) { @@ -351,7 +352,7 @@ static inline connector_hash_t connector_list_hash(const Connector *c) for (c = c->next; c != NULL; c = c->next) #if FEEDBACK_HASH - accum = (accum<<6) + (accum<<16) + (accum >> 16) - connector_hash(c); + accum = (accum<<7) + (accum<<14) + (accum >> 16) - connector_hash(c); #else // Bad. accum = (19 * accum) + connector_hash(c); diff --git a/link-grammar/disjunct-utils.c b/link-grammar/disjunct-utils.c index 3901d4d70..fa80b8c61 100644 --- a/link-grammar/disjunct-utils.c +++ b/link-grammar/disjunct-utils.c @@ -223,7 +223,8 @@ static unsigned int count_connectors(Sentence sent) typedef struct disjunct_dup_table_s disjunct_dup_table; struct disjunct_dup_table_s { - size_t dup_table_size; + unsigned int dup_table_size; + unsigned int log2_size; Disjunct *dup_table[]; }; @@ -244,10 +245,12 @@ static inline unsigned int old_hash_disjunct(disjunct_dup_table *dt, i += 19 * connector_list_hash(d->right); if (string_too) i += string_hash(d->word_string); - //i += (i>>10); d->dup_hash = i; - return (i & (dt->dup_table_size-1)); + + i *= FIBONACCI_MULT; + // Feed back log2(table_size) MSBs. + return ((i ^ (i>>(32-dt->log2_size))) & (dt->dup_table_size-1)); } /** @@ -322,6 +325,7 @@ static disjunct_dup_table * disjunct_dup_table_new(size_t sz) dt = malloc(sz * sizeof(Disjunct *) + sizeof(disjunct_dup_table)); dt->dup_table_size = sz; + dt->log2_size = power_of_2_log2(sz); memset(dt->dup_table, 0, sz * sizeof(Disjunct *)); diff --git a/link-grammar/tracon-set.c b/link-grammar/tracon-set.c index ef530b670..a30afb3c8 100644 --- a/link-grammar/tracon-set.c +++ b/link-grammar/tracon-set.c @@ -52,7 +52,7 @@ static tid_hash_t hash_connectors(const Connector *c, unsigned int shallow) { tid_hash_t accum = (shallow && c->shallow) ? 1000003 : 0; - return accum + connector_list_hash(c); + return (accum + connector_list_hash(c)) * FIBONACCI_MULT; } #if 0 diff --git a/link-grammar/utilities.h b/link-grammar/utilities.h index f03b48829..b5b255f24 100644 --- a/link-grammar/utilities.h +++ b/link-grammar/utilities.h @@ -534,4 +534,16 @@ static inline size_t next_power_of_two_up(size_t i) return j; } +/** + * Return log2 of a given power-of-2 \p i. + */ +static inline unsigned int power_of_2_log2(size_t i) +{ + unsigned int n = 0; + while (i >>= 1) + n++; + return n; +} + + #endif /* _LINK_GRAMMAR_UTILITIES_H_ */ From 1b5d1b553a7fcf32b5966752734643f02dadbe80 Mon Sep 17 00:00:00 2001 From: ampli Date: Fri, 17 May 2024 11:39:32 +0300 Subject: [PATCH 10/11] eliminate_duplicate_disjuncts: unsigned -> connector_hash_t --- link-grammar/disjunct-utils.c | 8 ++++---- link-grammar/disjunct-utils.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/link-grammar/disjunct-utils.c b/link-grammar/disjunct-utils.c index fa80b8c61..127ff0000 100644 --- a/link-grammar/disjunct-utils.c +++ b/link-grammar/disjunct-utils.c @@ -234,10 +234,10 @@ struct disjunct_dup_table_s * This is the old version that doesn't check for domination, just * equality. */ -static inline unsigned int old_hash_disjunct(disjunct_dup_table *dt, - Disjunct * d, bool string_too) +static inline connector_hash_t old_hash_disjunct(disjunct_dup_table *dt, + Disjunct * d, bool string_too) { - unsigned int i = 0; + connector_hash_t i = 0; if (NULL != d->left) i = connector_list_hash(d->left); @@ -365,7 +365,7 @@ unsigned int eliminate_duplicate_disjuncts(Disjunct *dw, bool multi_string) for (Disjunct *d = dw; d != NULL; d = d->next) { Disjunct *dx; - unsigned int h = old_hash_disjunct(dt, d, /*string_too*/!multi_string); + connector_hash_t h = old_hash_disjunct(dt, d, /*string_too*/!multi_string); for (dx = dt->dup_table[h]; dx != NULL; dx = dx->dup_table_next) { diff --git a/link-grammar/disjunct-utils.h b/link-grammar/disjunct-utils.h index bed949010..9dceb149c 100644 --- a/link-grammar/disjunct-utils.h +++ b/link-grammar/disjunct-utils.h @@ -63,7 +63,7 @@ struct Disjunct_struct /* Shared by different steps. For what | when. */ union { - uint32_t dup_hash; /* Duplicate elimination | before pruning */ + connector_hash_t dup_hash;/* Duplicate elimination | before pruning */ int32_t ordinal; /* Generation mode | after d. elimination */ }; /* 4 bytes */ From 244e05303f8478c8663b08f844857c91b9c38f09 Mon Sep 17 00:00:00 2001 From: ampli Date: Fri, 17 May 2024 12:00:24 +0300 Subject: [PATCH 11/11] eliminate_duplicate_disjuncts: Pre-calculate hash constants --- link-grammar/disjunct-utils.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/link-grammar/disjunct-utils.c b/link-grammar/disjunct-utils.c index 127ff0000..984d1f229 100644 --- a/link-grammar/disjunct-utils.c +++ b/link-grammar/disjunct-utils.c @@ -223,8 +223,8 @@ static unsigned int count_connectors(Sentence sent) typedef struct disjunct_dup_table_s disjunct_dup_table; struct disjunct_dup_table_s { - unsigned int dup_table_size; - unsigned int log2_size; + unsigned int table_size_minus_1; + unsigned int log2_divisor; Disjunct *dup_table[]; }; @@ -250,7 +250,7 @@ static inline connector_hash_t old_hash_disjunct(disjunct_dup_table *dt, i *= FIBONACCI_MULT; // Feed back log2(table_size) MSBs. - return ((i ^ (i>>(32-dt->log2_size))) & (dt->dup_table_size-1)); + return ((i ^ (i>>dt->log2_divisor)) & (dt->table_size_minus_1)); } /** @@ -324,8 +324,8 @@ static disjunct_dup_table * disjunct_dup_table_new(size_t sz) disjunct_dup_table *dt; dt = malloc(sz * sizeof(Disjunct *) + sizeof(disjunct_dup_table)); - dt->dup_table_size = sz; - dt->log2_size = power_of_2_log2(sz); + dt->table_size_minus_1 = sz - 1; + dt->log2_divisor = (sizeof(connector_hash_t)*CHAR_BIT) - power_of_2_log2(sz); memset(dt->dup_table, 0, sz * sizeof(Disjunct *));