Skip to content

Commit

Permalink
Fix varnamproject#166: Varnam outputs invalid combinations with chil …
Browse files Browse the repository at this point in the history
…letters in Malayalam

Varnam's learnings contain the word `kilivaathil => കിളിവാതിൽ`. When Varnam finds this word,
it uses the learned word and tokenizes the rest of the input. This produces invalid chil combinations.
This PR adds a check that replaces a trailing chil with its root consonant so that
grammatically correct combinations can be formed.
  • Loading branch information
subins2000 committed Mar 19, 2021
1 parent f32d681 commit eed7f08
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 0 deletions.
19 changes: 19 additions & 0 deletions tests/transliteration.c
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,24 @@ START_TEST (indic_digit_rendering)
}
END_TEST

START_TEST (no_chil_combos)
{
    varray *results;
    vword *first;
    int status;

    /* Regression test for https://github.com/varnamproject/libvarnam/issues/166
     *
     * If the learnings contain a word such as മലയാളം, which ends with an
     * anusvaram or a chil, adding a combination to it gives incorrect
     * output, for example മലയാളംോ. */
    status = varnam_transliterate (varnam_instance, "malayalamO", &results);
    assert_success (status);

    /* Exactly two suggestions are expected; the first one must be the
     * correctly combined form. */
    ck_assert_int_eq (varray_length (results), 2);

    first = varray_get (results, 0);
    ck_assert_str_eq (first->text, "മലയാളമോ");
}
END_TEST

TCase* get_transliteration_tests()
{
TCase* tcase = tcase_create("transliteration");
Expand All @@ -123,5 +141,6 @@ TCase* get_transliteration_tests()
tcase_add_test (tcase, dependent_vowel_rendering);
tcase_add_test (tcase, cancellation_character_should_force_independent_vowel_form);
tcase_add_test (tcase, indic_digit_rendering);
tcase_add_test (tcase, no_chil_combos);
return tcase;
}
27 changes: 27 additions & 0 deletions words-table.c
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,26 @@ print_tokens_array(varray *tokens)

}

/* Replaces the last chil letter token with its root consonant,
 * e.g. ൽ with ല.
 *
 * tokens is read as an array of arrays of vtoken. Only the last token of
 * the last inner array is inspected: when its tag is "chill", its value1 is
 * overwritten in place with its value2 (value2 presumably holds the root
 * consonant form - confirm against the scheme definition).
 *
 * Returns the same tokens pointer that was passed in. When tokens is NULL
 * or empty, or the last inner array is missing or empty, nothing is
 * modified. These cases are handled with explicit checks rather than
 * assert() so that the behaviour is the same with and without NDEBUG and
 * unexpected input never aborts the calling process. */
varray*
replace_last_chil(varray *tokens)
{
    varray *last_group;
    vtoken *last_token;

    /* Nothing to inspect; also avoids asking for index -1 below. */
    if (tokens == NULL || varray_length (tokens) < 1) {
        return tokens;
    }

    last_group = varray_get (tokens, varray_length (tokens) - 1);
    if (last_group == NULL || varray_length (last_group) < 1) {
        return tokens;
    }

    last_token = varray_get (last_group, varray_length (last_group) - 1);
    if (last_token == NULL) {
        return tokens;
    }

    if (strcmp (last_token->tag, "chill") == 0) {
        /* NOTE(review): this mutates the token in place. If tokens can be
         * shared with a cache, the change would be visible there too -
         * verify against the tokenizer. */
        strcpy (last_token->value1, last_token->value2);
    }

    return tokens;
}

/* This function learns all possibilities of writing the word and it's prefixes.
* It finds cartesian product of the tokens passed in and process each product.
* tokens will be a multidimensional array */
Expand Down Expand Up @@ -867,6 +887,13 @@ vwt_tokenize_pattern (varnam *handle, const char *pattern, varray *result)
rc = vst_tokenize (handle, strbuf_to_s(match), VARNAM_TOKENIZER_VALUE, VARNAM_MATCH_EXACT, tokens);
if (rc) return rc;

/* For Malayalam, words ending with chil need to be replaced for correct output
* https://github.com/varnamproject/libvarnam/issues/166
*/
if (strcmp(handle->internal->scheme_details->langCode, "ml") == 0) {
replace_last_chil(tokens);
}

add_tokens (handle, tokens, result, first_match);
varray_clear (tokens);
}
Expand Down

0 comments on commit eed7f08

Please sign in to comment.