diff --git a/learn.c b/learn.c index 6417118..facb3dc 100644 --- a/learn.c +++ b/learn.c @@ -144,7 +144,7 @@ varnam_learn_internal(varnam *handle, const char *word) /* This removes all starting and trailing special characters from the word */ sanitized_word = sanitize_word (handle, word); - rc = vst_tokenize (handle, strbuf_to_s (sanitized_word), VARNAM_TOKENIZER_VALUE, tokens); + rc = vst_tokenize (handle, strbuf_to_s (sanitized_word), VARNAM_TOKENIZER_VALUE, VARNAM_MATCH_ALL, tokens); if (rc) return rc; if (!can_learn_from_tokens (handle, tokens, strbuf_to_s (sanitized_word))) diff --git a/tests/02-ml-unicode.c b/tests/02-ml-unicode.c index 0eb3320..0f55d9f 100644 --- a/tests/02-ml-unicode.c +++ b/tests/02-ml-unicode.c @@ -69,6 +69,7 @@ int ml_unicode_transliteration(int argc, char **argv) rc = varnam_transliterate(handle, part1, &words); if(rc != VARNAM_SUCCESS) { printf("transliteration of %s failed - \n", part1); + printf("%s", varnam_get_last_error(handle)); return 1; } @@ -91,61 +92,61 @@ int ml_unicode_transliteration(int argc, char **argv) int ml_unicode_reverse_transliteration(int argc, char **argv) { - /* varnam *handle; */ - /* int rc; */ - /* char *msg; */ - /* char *output; */ - /* FILE *fp; */ - /* char line[LINE_MAX]; */ - /* char *part1, *part2; */ - - /* if(argc == 0) { */ - /* printf("no scheme file specified\n"); */ - /* return 1; */ - /* } */ - /* else if(argc == 1) { */ - /* printf("no input file specified\n"); */ - /* return 1; */ - /* } */ - /* printf("%s\n", argv[0]); */ - /* printf("%s\n", argv[1]); */ - - /* rc = varnam_init(argv[0], &handle, &msg); */ - /* if(rc != VARNAM_SUCCESS) { */ - /* printf("initialization failed - %s\n", msg); */ - /* return 1; */ - /* } */ - - /* /\* reads input from the supplied file and matches it with the expected string *\/ */ - /* fp = fopen(argv[1], "r"); */ - /* if(fp == NULL) { */ - /* printf("can't open input file\n"); */ - /* return 1; */ - /* } */ - - /* while(fgets(line, LINE_MAX, fp) != NULL) */ - /* { */ - /* part1 = strtok(line, " "); */ - /* part2 = strtok(NULL, "\n"); */ + varnam *handle; + int rc; + char *msg; + char *output; + FILE *fp; + char line[LINE_MAX]; + char *part1, *part2; + + if(argc == 0) { + printf("no scheme file specified\n"); + return 1; + } + else if(argc == 1) { + printf("no input file specified\n"); + return 1; + } + printf("%s\n", argv[0]); + printf("%s\n", argv[1]); + + rc = varnam_init(argv[0], &handle, &msg); + if(rc != VARNAM_SUCCESS) { + printf("initialization failed - %s\n", msg); + return 1; + } + + /* reads input from the supplied file and matches it with the expected string */ + fp = fopen(argv[1], "r"); + if(fp == NULL) { + printf("can't open input file\n"); + return 1; + } + + while(fgets(line, LINE_MAX, fp) != NULL) + { + part1 = strtok(line, " "); + part2 = strtok(NULL, "\n"); - /* rc = varnam_reverse_transliterate(handle, part1, &output); */ - /* if(rc != VARNAM_SUCCESS) { */ - /* printf("reverse transliteration of %s failed - \n", part1); */ - /* return 1; */ - /* } */ - - /* if(strcmp(output, part2) != 0) { */ - /* printf("reverse transliterating %s - expected %s, but was %s\n", part1, part2, output); */ - /* return 1; */ - /* } */ - /* } */ - - /* fclose(fp); */ - /* rc = varnam_destroy(handle); */ - /* if(rc != VARNAM_SUCCESS) { */ - /* printf("destruction failed\n"); */ - /* return 1; */ - /* } */ + rc = varnam_reverse_transliterate(handle, part1, &output); + if(rc != VARNAM_SUCCESS) { + printf("reverse transliteration of %s failed - \n", part1); + return 1; + } + + if(strcmp(output, part2) != 0) { + printf("reverse transliterating %s - expected %s, but was %s\n", part1, part2, output); + return 1; + } + } + + fclose(fp); + rc = varnam_destroy(handle); + if(rc != VARNAM_SUCCESS) { + printf("destruction failed\n"); + return 1; + } return 0; } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 65ef577..2f4e530 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -28,7 +28,7 @@ add_test(cleanup ruby test_output_cleanup.rb) add_test(initialization ${test_executable_name} test-varnam-initialization td-00.vst) add_test(transliteration ${test_executable_name} basic-transliteration td-01.vst) -add_test(ml-unicode ${test_executable_name} ml-unicode ../vsf/ml-unicode.vst ml-unicode-input.txt) -add_test(ml-unicode-reverse ${test_executable_name} ml-unicode-reverse ../vsf/ml-unicode.vst ml-unicode-reverse-input.txt) +add_test(ml-unicode ${test_executable_name} ml-unicode ../schemes/ml-unicode.vst ml-unicode-input.txt) +add_test(ml-unicode-reverse ${test_executable_name} ml-unicode-reverse ../schemes/ml-unicode.vst ml-unicode-reverse-input.txt) add_test(test-vst-file-creation ${test_executable_name} test-vst-file-creation) diff --git a/varnam-symbol-table.c b/varnam-symbol-table.c index ac25607..390fa64 100644 --- a/varnam-symbol-table.c +++ b/varnam-symbol-table.c @@ -26,55 +26,6 @@ #include "varnam-api.h" #include "varnam-token.h" -struct token* -find_token(varnam *handle, const char *lookup) -{ - /* struct varnam_internal *internal; */ - /* struct token *tok = NULL; */ - /* char sql[500]; */ - /* const char *pattern, *value1, *value2, *tag; */ - /* sqlite3_stmt *stmt; sqlite3 *db; */ - /* int rc, type, has_children; */ - - /* assert( handle ); assert( lookup ); */ - - /* internal = handle->internal; */ - /* db = internal->db; */ - - /* snprintf( sql, 500, "select type, pattern, value1, value2, children, tag from symbols where pattern = ?1;"); */ - /* rc = sqlite3_prepare_v2( db, sql, 500, &stmt, NULL ); */ - /* if( rc == SQLITE_OK ) */ - /* { */ - /* sqlite3_bind_text (stmt, 1, lookup, (int) strlen(lookup), NULL); */ - /* rc = sqlite3_step (stmt); */ - /* if( rc == SQLITE_ROW ) */ - /* { */ - /* type = sqlite3_column_int( stmt, 0 ); */ - /* pattern = (const char*) sqlite3_column_text( stmt, 1 ); */ - /* value1 = (const char*) sqlite3_column_text( stmt, 2 ); */ - /* value2 = (const char*) sqlite3_column_text( stmt, 3 ); */ - /* has_children = sqlite3_column_int( stmt, 4 ); */ - /* tag = (const char*) sqlite3_column_text( stmt, 5 ); */ - - /* if(internal->current_token == NULL) { */ - /* internal->current_token = (struct token *) xmalloc(sizeof (struct token)); */ - /* assert( internal->current_token ); */ - /* } */ - - /* tok = internal->current_token; */ - /* tok->type = type; */ - /* strncpy( tok->pattern, pattern, VARNAM_SYMBOL_MAX); */ - /* strncpy( tok->value1, value1, VARNAM_SYMBOL_MAX); */ - /* strncpy( tok->value2, value2, VARNAM_SYMBOL_MAX); */ - /* strncpy( tok->tag, tag, VARNAM_TOKEN_TAG_MAX); */ - /* tok->children = has_children; */ - /* } */ - /* } */ - - /* sqlite3_finalize( stmt ); */ - /* return tok; */ -} - struct token* find_rtl_token(varnam *handle, const char *lookup) { @@ -713,7 +664,7 @@ vst_get_metadata (varnam *handle, const char* key, struct strbuf *output) } static int -prepare_tokenization_stmt (varnam *handle, int tokenize_using, sqlite3_stmt **stmt) +prepare_tokenization_stmt (varnam *handle, int tokenize_using, int match_type, sqlite3_stmt **stmt) { int rc; @@ -732,15 +683,32 @@ prepare_tokenization_stmt (varnam *handle, int tokenize_using, sqlite3_stmt **st *stmt = v_->tokenize_using_pattern; break; case VARNAM_TOKENIZER_VALUE: - if (v_->tokenize_using_value == NULL) + if (match_type == VARNAM_MATCH_ALL) { - rc = sqlite3_prepare_v2( v_->db, "select id, type, match_type, pattern, value1, value2 from symbols where value1 = ?1 or value2 = ?1;", -1, &v_->tokenize_using_value, NULL ); - if (rc != SQLITE_OK) { - set_last_error (handle, "Failed to tokenize : %s", sqlite3_errmsg(v_->db)); - return VARNAM_ERROR; + if (v_->tokenize_using_value == NULL) + { + rc = sqlite3_prepare_v2( v_->db, "select id, type, match_type, pattern, value1, value2 from symbols where value1 = ?1 or value2 = ?1;", + -1, &v_->tokenize_using_value, NULL ); + if (rc != SQLITE_OK) { + set_last_error (handle, "Failed to tokenize : %s", sqlite3_errmsg(v_->db)); + return VARNAM_ERROR; + } } + *stmt = v_->tokenize_using_value; + } + else + { + if (v_->tokenize_using_value_and_match_type == NULL) + { + rc = sqlite3_prepare_v2( v_->db, "select id, type, match_type, pattern, value1, value2 from symbols where (value1 = ?1 or value2 = ?1) and match_type = ?2;", + -1, &v_->tokenize_using_value_and_match_type, NULL ); + if (rc != SQLITE_OK) { + set_last_error (handle, "Failed to tokenize : %s", sqlite3_errmsg(v_->db)); + return VARNAM_ERROR; + } + } + *stmt = v_->tokenize_using_value_and_match_type; } - *stmt = v_->tokenize_using_value; break; } @@ -748,18 +716,22 @@ prepare_tokenization_stmt (varnam *handle, int tokenize_using, sqlite3_stmt **st } static int -read_all_tokens_and_add_to_array (varnam *handle, const char *lookup, int tokenize_using, varray *tokens, bool *tokens_available) +read_all_tokens_and_add_to_array (varnam *handle, const char *lookup, int tokenize_using, int match_type, varray *tokens, bool *tokens_available) { vtoken *tok = 0; bool cleared = false; int rc; sqlite3_stmt *stmt = 0; - rc = prepare_tokenization_stmt (handle, tokenize_using, &stmt); + rc = prepare_tokenization_stmt (handle, tokenize_using, match_type, &stmt); if (rc) return rc; *tokens_available = false; sqlite3_bind_text (stmt, 1, lookup, -1, NULL); + if (match_type != VARNAM_MATCH_ALL) + { + sqlite3_bind_int (stmt, 2, match_type); + } while (true) { rc = sqlite3_step (stmt); @@ -849,7 +821,7 @@ can_find_more_matches(varnam *handle, struct strbuf *lookup, int tokenize_using, } int -vst_tokenize (varnam *handle, const char *input, int tokenize_using, varray *result) +vst_tokenize (varnam *handle, const char *input, int tokenize_using, int match_type, varray *result) { int rc, bytes_read = 0, matchpos = 0; const unsigned char *ustring; const char *inputcopy; @@ -874,7 +846,12 @@ vst_tokenize (varnam *handle, const char *input, int tokenize_using, varray *res if (tokens == NULL) tokens = get_pooled_array (handle); - rc = read_all_tokens_and_add_to_array (handle, strbuf_to_s (lookup), tokenize_using, tokens, &tokens_available); + rc = read_all_tokens_and_add_to_array (handle, + strbuf_to_s (lookup), + tokenize_using, + match_type, + tokens, + &tokens_available); if (rc) return rc; if (tokens_available) diff --git a/varnam-symbol-table.h b/varnam-symbol-table.h index 660c3f0..1e19cce 100644 --- a/varnam-symbol-table.h +++ b/varnam-symbol-table.h @@ -119,6 +119,6 @@ vst_get_metadata (varnam *handle, const char* key, struct strbuf *output); /* Tokenizes the input and add the tokens into result. Result will point to a multidimensional array * where each element will be an array of vtoken* */ int -vst_tokenize (varnam *handle, const char *input, int tokenize_using, varray *result); +vst_tokenize (varnam *handle, const char *input, int tokenize_using, int match_type, varray *result); #endif diff --git a/varnam-tl.c b/varnam-tl.c index d2b39df..cffc6df 100644 --- a/varnam-tl.c +++ b/varnam-tl.c @@ -30,39 +30,6 @@ #include "vword.h" #include "rendering.h" -static void -set_last_rtl_token(varnam *handle, struct token *tok) -{ - struct varnam_internal *vi; - vi = handle->internal; - - if(tok == NULL) { - vi->last_rtl_token_available = 0; - return; - } - - if(vi->last_rtl_token == NULL) { - vi->last_rtl_token = (struct token *) xmalloc(sizeof (struct token)); - assert(vi->last_rtl_token); - } - - vi->last_rtl_token->type = tok->type; - strncpy (vi->last_rtl_token->pattern, tok->pattern, VARNAM_SYMBOL_MAX); - strncpy (vi->last_rtl_token->value1, tok->value1, VARNAM_SYMBOL_MAX); - strncpy (vi->last_rtl_token->value2, tok->value2, VARNAM_SYMBOL_MAX); - vi->last_rtl_token->children = tok->children; - vi->last_rtl_token_available = 1; -} - -static void -cleanup(varnam *handle) -{ - strbuf_clear(handle->internal->output); - strbuf_clear(handle->internal->rtl_output); - handle->internal->last_token_available = 0; - handle->internal->last_rtl_token_available = 0; -} - /* Flattens the multi dimensional array all_tokens */ static varray* flatten(varnam *handle, varray *all_tokens) @@ -85,21 +52,21 @@ flatten(varnam *handle, varray *all_tokens) return tokens; } -int +int varnam_transliterate(varnam *handle, const char *input, varray **output) { int rc; varray *words = 0, *tokens = 0; varray *all_tokens = 0; /* This will be multidimensional array */ vword *word; - + if(handle == NULL || input == NULL) return VARNAM_ARGS_ERROR; reset_pool(handle); all_tokens = get_pooled_array (handle); - rc = vst_tokenize (handle, input, VARNAM_TOKENIZER_PATTERN, all_tokens); + rc = vst_tokenize (handle, input, VARNAM_TOKENIZER_PATTERN, VARNAM_MATCH_EXACT, all_tokens); if (rc) return rc; @@ -117,104 +84,38 @@ varnam_transliterate(varnam *handle, const char *input, varray **output) return VARNAM_SUCCESS; } -static void -resolve_rtl_token(varnam *handle, - const char *lookup, - struct token *match, - struct strbuf *string) -{ - struct varnam_token_rendering *rule; - int rc; - - assert(handle); - assert(match); - assert(string); - - rule = get_additional_rendering_rule (handle); - if (rule != NULL) { - rc = rule->render_rtl (handle, match, string); - if(rc == VARNAM_SUCCESS) { - return; - } - } - - if (match->type == VARNAM_TOKEN_VOWEL) - { - if (strcmp (match->value1, lookup) == 0 && handle->internal->last_rtl_token_available) { - /* vowel is standing in it's full form in between a word. need to prefix _ - to avoid unnecessary conjunctions */ - strbuf_add(string, "_"); - } - } +int +varnam_reverse_transliterate(varnam *handle, + const char *input, + char **output) +{ + int rc, i, j; + varray *result, *tokens; + strbuf *rtl; + vtoken *token; - strbuf_add (string, match->pattern); -} + if(handle == NULL || input == NULL) + return VARNAM_ARGS_ERROR; + result = get_pooled_array (handle); + rc = vst_tokenize (handle, input, VARNAM_TOKENIZER_VALUE, VARNAM_MATCH_EXACT, result); + if (rc) return rc; -static int -tokenize_indic_text(varnam *handle, - const char *input, - struct strbuf *string) -{ - const char *remaining; - int counter = 0, input_len = 0; - size_t matchpos = 0; - /* struct varnam_internal *vi; */ - char lookup[100], match[100]; - struct token *temp = NULL, *last = NULL; - - /* vi = handle->internal; */ - match[0] = '\0'; - - input_len = utf8_length (input); - while (counter < input_len) + rtl = get_pooled_string (handle); + assert (rtl); + for (i = 0; i < varray_length (result); i++) { - substr (lookup, input, 1, ++counter); - temp = find_rtl_token (handle, lookup); - if (temp) { - last = temp; - matchpos = strlen (lookup); - strncpy(match, lookup, 100); - } - else if( !can_find_rtl_token( handle, last, lookup )) { - break; + tokens = varray_get (result, i); + assert (tokens); + for (j = 0; j < varray_length (tokens); j++) + { + token = varray_get (tokens, j); + assert (token); + strbuf_add (rtl, token->pattern); + break; /* We only care about first element in each array */ } } - if (last) - { - resolve_rtl_token (handle, match, last, string); - remaining = input + matchpos; - set_last_rtl_token (handle, last); - } - else { - strbuf_add (string, lookup); - remaining = input + 1; - set_last_rtl_token (handle, NULL); - } - - if (strlen (remaining) > 0) - return tokenize_indic_text (handle, remaining, string); - + *output = rtl->buffer; return VARNAM_SUCCESS; } - -int -varnam_reverse_transliterate(varnam *handle, - const char *input, - char **output) -{ - int rc; - struct strbuf *result; - - if(handle == NULL || input == NULL) - return VARNAM_MISUSE; - - cleanup (handle); - result = handle->internal->rtl_output; - rc = tokenize_indic_text (handle, input, result); - *output = result->buffer; - - return rc; -} - diff --git a/varnam-types.h b/varnam-types.h index 76d0e5b..2d778be 100644 --- a/varnam-types.h +++ b/varnam-types.h @@ -33,6 +33,7 @@ /* pattern matching */ #define VARNAM_MATCH_EXACT 1 #define VARNAM_MATCH_POSSIBILITY 2 +#define VARNAM_MATCH_ALL 3 /* allowed runtime functions */ #define VARNAM_RULE_FN_INITIALS "if_initials" @@ -120,6 +121,7 @@ struct varnam_internal sqlite3_stmt *tokenize_using_pattern; sqlite3_stmt *tokenize_using_value; + sqlite3_stmt *tokenize_using_value_and_match_type; sqlite3_stmt *can_find_more_matches_using_pattern; sqlite3_stmt *can_find_more_matches_using_value; sqlite3_stmt *learn_word; diff --git a/varnam.c b/varnam.c index 2072e01..26e3884 100644 --- a/varnam.c +++ b/varnam.c @@ -82,6 +82,7 @@ initialize_internal() /* Prepared statements */ vi->tokenize_using_pattern = NULL; vi->tokenize_using_value = NULL; + vi->tokenize_using_value_and_match_type = NULL; vi->can_find_more_matches_using_pattern = NULL; vi->can_find_more_matches_using_value = NULL; vi->learn_word = NULL;