From eed7f08b2bfdbd415247792546ee826541e6e07a Mon Sep 17 00:00:00 2001 From: Subin Siby Date: Sat, 20 Mar 2021 00:55:29 +0530 Subject: [PATCH] Fix #166: Varnam outputs invalid combinations with chil letters in Malayalam MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Varnam's learnings contain the word `kilivaathil => കിളിവാതിൽ`. When Varnam finds this word as a prefix of the input, it uses the learned word and tokenizes the rest of the input. This produces invalid chil combinations. This PR adds a check for chil to replace the ending chil with its root consonant so that proper grammatical combinations can happen. --- tests/transliteration.c | 19 +++++++++++++++++++ words-table.c | 27 +++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/tests/transliteration.c b/tests/transliteration.c index 9c64712..738e778 100644 --- a/tests/transliteration.c +++ b/tests/transliteration.c @@ -114,6 +114,24 @@ START_TEST (indic_digit_rendering) } END_TEST +START_TEST (no_chil_combos) +{ + int rc; + vword* word; + varray *words; + + /* If learnings has the word മലയാളം which ends with an anusvaram or chil, the output becomes incorrect when combination is added to it + * Like മലയാളംോ + * https://github.com/varnamproject/libvarnam/issues/166 */ + + rc = varnam_transliterate (varnam_instance, "malayalamO", &words); + assert_success (rc); + ck_assert_int_eq (varray_length (words), 2); + word = varray_get (words, 0); + ck_assert_str_eq (word->text, "മലയാളമോ"); +} +END_TEST + TCase* get_transliteration_tests() { TCase* tcase = tcase_create("transliteration"); @@ -123,5 +141,6 @@ TCase* get_transliteration_tests() tcase_add_test (tcase, dependent_vowel_rendering); tcase_add_test (tcase, cancellation_character_should_force_independent_vowel_form); tcase_add_test (tcase, indic_digit_rendering); + tcase_add_test (tcase, no_chil_combos); return tcase; } diff --git a/words-table.c b/words-table.c index c87d79f..c7ca335 100644 --- a/words-table.c +++ b/words-table.c @@ -448,6
+448,26 @@ print_tokens_array(varray *tokens) } +/* Replaces the last chil letter token with its root consonant (ൽ with ല). tokens is an array of token arrays; only the final token of the final inner array is inspected. + * Returns tokens itself. Nothing is changed when there is no last token, when its tag is not "chill", or when value2 (assumed to hold the root consonant) is empty. */ +varray* +replace_last_chil(varray *tokens) +{ + varray *tmp; + vtoken *last_token; + + tmp = varray_length(tokens) > 0 ? varray_get (tokens, varray_length(tokens) - 1) : NULL; + if (tmp == NULL) return tokens; /* nothing was tokenized: skip instead of aborting or indexing at -1 */ + last_token = varray_length(tmp) > 0 ? varray_get (tmp, varray_length(tmp) - 1) : NULL; + if (last_token == NULL) return tokens; + + if (strcmp(last_token->tag, "chill") == 0 && last_token->value2[0] != '\0') { + strcpy(last_token->value1, last_token->value2); /* never overwrite the rendered letter with an empty string */ + } + + return tokens; +} + /* This function learns all possibilities of writing the word and it's prefixes. * It finds cartesian product of the tokens passed in and process each product. * tokens will be a multidimensional array */ @@ -867,6 +887,13 @@ vwt_tokenize_pattern (varnam *handle, const char *pattern, varray *result) rc = vst_tokenize (handle, strbuf_to_s(match), VARNAM_TOKENIZER_VALUE, VARNAM_MATCH_EXACT, tokens); if (rc) return rc; + /* For Malayalam, words ending with chil need to be replaced for correct output + * https://github.com/varnamproject/libvarnam/issues/166 + */ + if (strcmp(handle->internal->scheme_details->langCode, "ml") == 0) { + replace_last_chil(tokens); + } + add_tokens (handle, tokens, result, first_match); varray_clear (tokens); }