Skip to content

Commit

Permalink
Fixing reverse transliteration
Browse files Browse the repository at this point in the history
  • Loading branch information
navaneeth committed Jul 28, 2012
1 parent 7a30c3c commit 794283a
Show file tree
Hide file tree
Showing 8 changed files with 128 additions and 246 deletions.
2 changes: 1 addition & 1 deletion learn.c
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ varnam_learn_internal(varnam *handle, const char *word)
/* Strips all leading and trailing special characters from the word */
sanitized_word = sanitize_word (handle, word);

rc = vst_tokenize (handle, strbuf_to_s (sanitized_word), VARNAM_TOKENIZER_VALUE, tokens);
rc = vst_tokenize (handle, strbuf_to_s (sanitized_word), VARNAM_TOKENIZER_VALUE, VARNAM_MATCH_ALL, tokens);
if (rc) return rc;

if (!can_learn_from_tokens (handle, tokens, strbuf_to_s (sanitized_word)))
Expand Down
109 changes: 55 additions & 54 deletions tests/02-ml-unicode.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ int ml_unicode_transliteration(int argc, char **argv)
rc = varnam_transliterate(handle, part1, &words);
if(rc != VARNAM_SUCCESS) {
printf("transliteration of %s failed - \n", part1);
printf("%s", varnam_get_last_error(handle));
return 1;
}

Expand All @@ -91,61 +92,61 @@ int ml_unicode_transliteration(int argc, char **argv)

/*
 * Test driver for reverse transliteration.
 *
 * argv[0] is the scheme file passed to varnam_init() and argv[1] is a text
 * file that is read line by line. Every line is split at the first space:
 * the text before it is reverse transliterated and the rest of the line (up
 * to the newline) is the expected result.
 *
 * Returns 0 when every line produced the expected output. Returns 1 when an
 * argument is missing, when setup fails, when a line is malformed, or on the
 * first failed or mismatching reverse transliteration.
 */
int ml_unicode_reverse_transliteration(int argc, char **argv)
{
    varnam *handle;
    int rc;
    int status = 1; /* pessimistic default; set to 0 only after the whole file matched */
    char *msg;
    char *output;
    FILE *fp;
    char line[LINE_MAX];
    char *part1, *part2;

    if(argc == 0) {
        printf("no scheme file specified\n");
        return 1;
    }
    else if(argc == 1) {
        printf("no input file specified\n");
        return 1;
    }
    printf("%s\n", argv[0]);
    printf("%s\n", argv[1]);

    rc = varnam_init(argv[0], &handle, &msg);
    if(rc != VARNAM_SUCCESS) {
        printf("initialization failed - %s\n", msg);
        return 1;
    }

    /* reads input from the supplied file and matches it with the expected string */
    fp = fopen(argv[1], "r");
    if(fp == NULL) {
        printf("can't open input file\n");
        /* The handle was initialised successfully above, so release it
         * before bailing out. */
        varnam_destroy(handle);
        return 1;
    }

    while(fgets(line, LINE_MAX, fp) != NULL)
    {
        part1 = strtok(line, " ");
        part2 = strtok(NULL, "\n");

        /* strtok() yields NULL when a line is blank or has no expected
         * value after the space; passing that on to strcmp() would be
         * undefined behaviour. */
        if(part1 == NULL || part2 == NULL) {
            printf("malformed input line - expected '<input> <expected output>'\n");
            goto cleanup;
        }

        rc = varnam_reverse_transliterate(handle, part1, &output);
        if(rc != VARNAM_SUCCESS) {
            printf("reverse transliteration of %s failed - \n", part1);
            goto cleanup;
        }

        if(strcmp(output, part2) != 0) {
            printf("reverse transliterating %s - expected %s, but was %s\n", part1, part2, output);
            goto cleanup;
        }
    }

    status = 0;

cleanup:
    /* Single exit path so the input file and the handle are released on
     * failure as well as on success. */
    fclose(fp);
    rc = varnam_destroy(handle);
    if(rc != VARNAM_SUCCESS) {
        printf("destruction failed\n");
        return 1;
    }

    return status;
}
Expand Down
4 changes: 2 additions & 2 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ add_test(cleanup ruby test_output_cleanup.rb)

add_test(initialization ${test_executable_name} test-varnam-initialization td-00.vst)
add_test(transliteration ${test_executable_name} basic-transliteration td-01.vst)
add_test(ml-unicode ${test_executable_name} ml-unicode ../vsf/ml-unicode.vst ml-unicode-input.txt)
add_test(ml-unicode-reverse ${test_executable_name} ml-unicode-reverse ../vsf/ml-unicode.vst ml-unicode-reverse-input.txt)
add_test(ml-unicode ${test_executable_name} ml-unicode ../schemes/ml-unicode.vst ml-unicode-input.txt)
add_test(ml-unicode-reverse ${test_executable_name} ml-unicode-reverse ../schemes/ml-unicode.vst ml-unicode-reverse-input.txt)
add_test(test-vst-file-creation ${test_executable_name} test-vst-file-creation)

97 changes: 37 additions & 60 deletions varnam-symbol-table.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,55 +26,6 @@
#include "varnam-api.h"
#include "varnam-token.h"

/*
 * Looks up the symbol whose pattern equals lookup.
 *
 * The lookup is currently disabled: the SQLite query against the symbols
 * table ("... from symbols where pattern = ?1") that backed this function is
 * no longer executed, so no token is ever found.
 *
 * Returns NULL in all cases.
 */
struct token*
find_token(varnam *handle, const char *lookup)
{
    /* Parameters are intentionally unused while the lookup is disabled. */
    (void) handle;
    (void) lookup;

    /* Return an explicit "not found". Falling off the end of a function
     * that returns a pointer leaves callers reading an indeterminate value. */
    return NULL;
}

struct token*
find_rtl_token(varnam *handle, const char *lookup)
{
Expand Down Expand Up @@ -713,7 +664,7 @@ vst_get_metadata (varnam *handle, const char* key, struct strbuf *output)
}

static int
prepare_tokenization_stmt (varnam *handle, int tokenize_using, sqlite3_stmt **stmt)
prepare_tokenization_stmt (varnam *handle, int tokenize_using, int match_type, sqlite3_stmt **stmt)
{
int rc;

Expand All @@ -732,34 +683,55 @@ prepare_tokenization_stmt (varnam *handle, int tokenize_using, sqlite3_stmt **st
*stmt = v_->tokenize_using_pattern;
break;
case VARNAM_TOKENIZER_VALUE:
if (v_->tokenize_using_value == NULL)
if (match_type == VARNAM_MATCH_ALL)
{
rc = sqlite3_prepare_v2( v_->db, "select id, type, match_type, pattern, value1, value2 from symbols where value1 = ?1 or value2 = ?1;", -1, &v_->tokenize_using_value, NULL );
if (rc != SQLITE_OK) {
set_last_error (handle, "Failed to tokenize : %s", sqlite3_errmsg(v_->db));
return VARNAM_ERROR;
if (v_->tokenize_using_value == NULL)
{
rc = sqlite3_prepare_v2( v_->db, "select id, type, match_type, pattern, value1, value2 from symbols where value1 = ?1 or value2 = ?1;",
-1, &v_->tokenize_using_value, NULL );
if (rc != SQLITE_OK) {
set_last_error (handle, "Failed to tokenize : %s", sqlite3_errmsg(v_->db));
return VARNAM_ERROR;
}
}
*stmt = v_->tokenize_using_value;
}
else
{
if (v_->tokenize_using_value_and_match_type == NULL)
{
rc = sqlite3_prepare_v2( v_->db, "select id, type, match_type, pattern, value1, value2 from symbols where (value1 = ?1 or value2 = ?1) and match_type = ?2;",
-1, &v_->tokenize_using_value_and_match_type, NULL );
if (rc != SQLITE_OK) {
set_last_error (handle, "Failed to tokenize : %s", sqlite3_errmsg(v_->db));
return VARNAM_ERROR;
}
}
*stmt = v_->tokenize_using_value_and_match_type;
}
*stmt = v_->tokenize_using_value;
break;
}

return VARNAM_SUCCESS;
}

static int
read_all_tokens_and_add_to_array (varnam *handle, const char *lookup, int tokenize_using, varray *tokens, bool *tokens_available)
read_all_tokens_and_add_to_array (varnam *handle, const char *lookup, int tokenize_using, int match_type, varray *tokens, bool *tokens_available)
{
vtoken *tok = 0;
bool cleared = false;
int rc;
sqlite3_stmt *stmt = 0;

rc = prepare_tokenization_stmt (handle, tokenize_using, &stmt);
rc = prepare_tokenization_stmt (handle, tokenize_using, match_type, &stmt);
if (rc) return rc;

*tokens_available = false;
sqlite3_bind_text (stmt, 1, lookup, -1, NULL);
if (match_type != VARNAM_MATCH_ALL)
{
sqlite3_bind_int (stmt, 2, match_type);
}
while (true)
{
rc = sqlite3_step (stmt);
Expand Down Expand Up @@ -849,7 +821,7 @@ can_find_more_matches(varnam *handle, struct strbuf *lookup, int tokenize_using,
}

int
vst_tokenize (varnam *handle, const char *input, int tokenize_using, varray *result)
vst_tokenize (varnam *handle, const char *input, int tokenize_using, int match_type, varray *result)
{
int rc, bytes_read = 0, matchpos = 0;
const unsigned char *ustring; const char *inputcopy;
Expand All @@ -874,7 +846,12 @@ vst_tokenize (varnam *handle, const char *input, int tokenize_using, varray *res
if (tokens == NULL)
tokens = get_pooled_array (handle);

rc = read_all_tokens_and_add_to_array (handle, strbuf_to_s (lookup), tokenize_using, tokens, &tokens_available);
rc = read_all_tokens_and_add_to_array (handle,
strbuf_to_s (lookup),
tokenize_using,
match_type,
tokens,
&tokens_available);
if (rc) return rc;

if (tokens_available)
Expand Down
2 changes: 1 addition & 1 deletion varnam-symbol-table.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,6 @@ vst_get_metadata (varnam *handle, const char* key, struct strbuf *output);
/* Tokenizes the input and adds the tokens to result. Result will point to a multidimensional array
 * in which each element is an array of vtoken* */
int
vst_tokenize (varnam *handle, const char *input, int tokenize_using, varray *result);
vst_tokenize (varnam *handle, const char *input, int tokenize_using, int match_type, varray *result);

#endif
Loading

0 comments on commit 794283a

Please sign in to comment.