Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

word breakers + inscript learning hack #102

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions api.h
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,15 @@ varnam_get_all_tokens(
varray **tokens
);

/**
* Copies all word breakers stored in the symbol table into list.
*
* Word breakers are used by the libvarnam-ibus project to mark the end of a
* word. Each scheme file can declare its own set of word-breaker characters
* (see ml-inscript for an example).
*
* handle    - varnam handle whose symbol table is read
* list      - caller-supplied buffer that receives the word breakers as a
*             NUL-terminated string
* max_count - number of bytes already allocated for list
*
* Returns VARNAM_SUCCESS on success and VARNAM_ERROR on failure.
*/
int
varnam_word_breakers(varnam *handle, char *list, int max_count);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd make this method return varray instead of char*. varray should contain all the word breakers configured in the scheme file.



/**
* Enable logging.
*
Expand Down
7 changes: 7 additions & 0 deletions schemes/ml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,13 @@ virama "~" => "്"

infer_dead_consonants true

word_breakers "." => ".",
"," => ",",
"?" => "?",
"!" => "!",
"(" => "(",
")" => ")"

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This need not be a hash. Just a simple array will do, right? Something like,

word_breakers [".", ","]

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And you don't need one in schemes/ml because it does not redefine any standard word-break characters. It is required only in scheme files that redefine the standard word-break characters (ml-inscript, for example).

vowels "a" => "അ",
[["a"], "aa", "A"] => ["ആ", "ാ"],
"i" => ["ഇ", "ി"],
Expand Down
32 changes: 28 additions & 4 deletions schemes/ml-inscript
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,18 @@ infer_dead_consonants false
$zwnj = "\u{200c}"
$zwj = "\u{200d}"

#word_breakers are symbols that denote the end
#of the word the user is typing. When a word
#breaker is encountered, Ibus commits the typed word
#and begins a new word

word_breakers "." => ".",
"," => ",",
"?" => "?",
"!" => "!",
"(" => "(",
")" => ")"

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An array would be better, same as above.

vowels "D" => "അ",
"E" => "ആ",
"F" => "ഇ",
Expand Down Expand Up @@ -82,7 +93,18 @@ consonants "k" => "ക",
"J" => "റ",
"#" => "്ര",
"&" => "ക്ഷ",
"=" => "ൃ"
"=" => "ൃ",
"ൻ" => "ൻ",
"ൺ" => "ൺ",
"ൽ" => "ൽ",
"ൾ" => "ൾ",
"ർ" => "ർ"
#The chill mappings above are necessary because of a bug:
#inscript treats an atomic chill as a token.
#However, that token is not in the vst symbols table.
#This somehow makes varnam assign type '10' (VARNAM_TOKEN_OTHER) to the chill.
#If a word contains tokens of type 10, it is not learned.
#So these otherwise nonsensical chill mappings stay for the time being.

numbers "1" => "൧",
"2" => "൨",
Expand All @@ -97,8 +119,10 @@ numbers "1" => "൧",

symbols "_" => "ഃ"

others "]" => $zwj,
"\\" => $zwnj
#non-joiner "\\" => $zwnj
joiner "]" => $zwj





Expand All @@ -117,4 +141,4 @@ others "]" => $zwj,





46 changes: 46 additions & 0 deletions symbol-table.c
Original file line number Diff line number Diff line change
Expand Up @@ -629,6 +629,52 @@ vst_add_metadata (varnam *handle, const char* key, const char* value)
return VARNAM_SUCCESS;
}

/**
 * Collects the word breakers stored in the symbol table.
 *
 * Queries the symbols table for every row whose type is VARNAM_WORD_BREAKER
 * and passes each row's pattern to strbuf_add() on list (one call per row, so
 * presumably the patterns end up concatenated in list - confirm against
 * strbuf_add).
 *
 * NOTE(review): the maintainer asked for this function to return a varray
 * instead of filling a strbuf. The signature is left as it is here so that
 * existing callers keep working.
 *
 * handle - varnam handle whose symbol table database is queried
 * list   - string buffer that receives the word breakers
 *
 * Returns VARNAM_SUCCESS on success. On failure the reason is recorded with
 * set_last_error() and VARNAM_ERROR is returned.
 */
int
vst_get_word_breakers(varnam *handle, strbuf *list)
{
    int rc;
    sqlite3 *db;
    sqlite3_stmt *stmt = NULL;
    const unsigned char *pattern;
    const char *sql = "select pattern from symbols where type=?1";

    db = handle->internal->db;

    rc = sqlite3_prepare_v2(db, sql, -1, &stmt, NULL);
    if (rc != SQLITE_OK)
    {
        set_last_error(handle, "Failed to prepare statement : %s", sqlite3_errmsg(db));
        /* stmt is NULL after a failed prepare; finalizing NULL is a no-op */
        sqlite3_finalize(stmt);
        return VARNAM_ERROR;
    }

    rc = sqlite3_bind_int(stmt, 1, VARNAM_WORD_BREAKER);
    if (rc != SQLITE_OK)
    {
        /* Read the error message before finalizing: sqlite3_errmsg() reports
           the most recent API call, so finalizing first could replace the
           bind failure's message. */
        set_last_error(handle, "Could not bind : %s", sqlite3_errmsg(db));
        sqlite3_finalize(stmt);
        return VARNAM_ERROR;
    }

    while ((rc = sqlite3_step(stmt)) == SQLITE_ROW)
    {
        /* sqlite3_column_text() returns NULL for SQL NULL values (and on
           out-of-memory), so skip such rows instead of dereferencing NULL. */
        pattern = sqlite3_column_text(stmt, 0);
        if (pattern != NULL)
        {
            strbuf_add(list, (const char *) pattern);
        }
    }

    if (rc != SQLITE_DONE)
    {
        set_last_error(handle, "%s", sqlite3_errmsg(db));
        sqlite3_finalize(stmt);
        return VARNAM_ERROR;
    }

    sqlite3_finalize(stmt);
    return VARNAM_SUCCESS;
}

int
vst_load_scheme_details(varnam *handle, vscheme_details *output)
{
Expand Down
3 changes: 3 additions & 0 deletions symbol-table.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,4 +107,7 @@ vst_stamp_version (varnam *handle);
int
vst_load_scheme_details(varnam *handle, vscheme_details *output);

int
vst_get_word_breakers(varnam *handle, strbuf *list);

#endif
25 changes: 25 additions & 0 deletions varnam.c
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,31 @@ varnam_get_all_scheme_details()
return schemeDetails;
}

/**
 * Copies all word breakers of the current scheme into word_breakers as a
 * NUL-terminated string. Intended for use with ibus.
 *
 * The buffer is owned by the caller and is never resized here. The pointer is
 * received by value, so a reallocation inside this function could move the
 * buffer without the caller ever learning the new address. When the buffer
 * cannot hold the word breakers plus the terminating NUL, nothing is copied
 * and VARNAM_ERROR is returned.
 *
 * NOTE(review): the maintainer asked for this function to return a varray,
 * which would remove the need for a caller-sized buffer. The signature is
 * left as it is here so that existing callers keep working.
 *
 * handle        - varnam handle to read the word breakers from
 * word_breakers - caller-allocated destination buffer
 * allocated     - size in bytes already allocated to word_breakers
 *
 * Returns VARNAM_SUCCESS on success and VARNAM_ERROR on failure; where a
 * handle is available the reason is recorded with set_last_error().
 */
int
varnam_word_breakers(varnam *handle, char *word_breakers, int allocated)
{
    int rc;
    strbuf *list;

    if (handle == NULL)
        return VARNAM_ERROR;

    if (word_breakers == NULL || allocated <= 0)
    {
        set_last_error(handle, "Invalid buffer supplied for word breakers");
        return VARNAM_ERROR;
    }

    list = get_pooled_string(handle);

    rc = vst_get_word_breakers(handle, list);
    if (rc != VARNAM_SUCCESS)
    {
        set_last_error(handle, "Could not obtain word breakers");
        return VARNAM_ERROR;
    }

    /* list->length does not count the terminating NUL, so the buffer must be
       strictly larger than the length; allocated is known to be positive. */
    if (list->length >= (size_t) allocated)
    {
        set_last_error(handle, "Supplied buffer of %d bytes is too small to hold all word breakers", allocated);
        return VARNAM_ERROR;
    }

    strcpy(word_breakers, strbuf_to_s(list));
    return VARNAM_SUCCESS;
}

int
varnam_get_scheme_details(varnam *handle, vscheme_details **details)
{
Expand Down
10 changes: 8 additions & 2 deletions varnamc
Original file line number Diff line number Diff line change
Expand Up @@ -830,13 +830,13 @@ end

# Scheme DSL handler for the `non_joiner` declaration.
#
# Runs _ensure_sanity on hash, creates the tokens once with the
# VARNAM_TOKEN_NON_JOINER type, and pushes that type onto
# $overridden_default_symbols.
def non_joiner(hash)
  _ensure_sanity(hash)
  _create_token(hash, Varnam::VARNAM_TOKEN_NON_JOINER)
  $overridden_default_symbols.push Varnam::VARNAM_TOKEN_NON_JOINER
end

# Scheme DSL handler for the `joiner` declaration.
#
# Runs _ensure_sanity on hash, creates the tokens once with the
# VARNAM_TOKEN_JOINER type, and pushes that type onto
# $overridden_default_symbols.
def joiner(hash)
  _ensure_sanity(hash)
  _create_token(hash, Varnam::VARNAM_TOKEN_JOINER)
  $overridden_default_symbols.push Varnam::VARNAM_TOKEN_JOINER
end

Expand Down Expand Up @@ -1390,5 +1390,11 @@ def exceptions_stem(hash, options={})
end
end
end

# Scheme DSL handler for the `word_breakers` declaration.
#
# Runs _ensure_sanity on hash and then passes hash, the
# Varnam::VARNAM_WORD_BREAKER token type and options to _create_token.
#
# options is optional and declared before hash, so a call that supplies a
# single argument binds it to hash and leaves options as {}.
def word_breakers(options={}, hash)
_ensure_sanity(hash)
_create_token(hash, Varnam::VARNAM_WORD_BREAKER, options)
end

do_action

3 changes: 2 additions & 1 deletion varnamruby.rb
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ module Varnam
VARNAM_TOKEN_OTHER = 10
VARNAM_TOKEN_NON_JOINER = 11
VARNAM_TOKEN_JOINER = 12

VARNAM_WORD_BREAKER = 13

VARNAM_MATCH_EXACT = 1
VARNAM_MATCH_POSSIBILITY = 2

Expand Down
1 change: 1 addition & 0 deletions vtypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#define VARNAM_TOKEN_OTHER 10
#define VARNAM_TOKEN_NON_JOINER 11
#define VARNAM_TOKEN_JOINER 12
#define VARNAM_WORD_BREAKER 13

/* token flags */
#define VARNAM_TOKEN_FLAGS_MORE_MATCHES_FOR_PATTERN (1 << 0)
Expand Down