-
Notifications
You must be signed in to change notification settings - Fork 21
/
lang_detection.c
105 lines (85 loc) · 2.33 KB
/
lang_detection.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
/* Simple language detector
*
* Copyright (C) Navaneeth.K.N
*
* This is part of libvarnam. See LICENSE.txt for the license
*/
#include "langcodes.h"
#include "vutf8.h"
#include "vtypes.h"
#include "util.h"
#define NON_JOINER 0x200C
#define JOINER 0x200D
typedef struct codepoint_range {
int start;
int end;
int language_code;
} cpr;
static cpr ranges[] = {
{0x0900, 0x097F, VARNAM_LANG_CODE_HI},
{0x0980, 0x09FF, VARNAM_LANG_CODE_BN},
{0x0A80, 0x0AFF, VARNAM_LANG_CODE_GU},
{0x0B00, 0x0B7F, VARNAM_LANG_CODE_OR},
{0x0B80, 0x0BFF, VARNAM_LANG_CODE_TA},
{0x0C00, 0x0C7F, VARNAM_LANG_CODE_TE},
{0x0C80, 0x0CFF, VARNAM_LANG_CODE_KN},
{0x0D00, 0x0D7F, VARNAM_LANG_CODE_ML},
{0x0A00, 0x0A7F, VARNAM_LANG_CODE_PA},
{0x0900, 0x097F, VARNAM_LANG_CODE_MR},
};
static int
get_language(int codepoint)
{
int i;
cpr range;
for (i = 0; i < ARRAY_SIZE(ranges); i++)
{
range = ranges[i];
if (codepoint >= range.start && codepoint <= range.end)
return range.language_code;
}
return VARNAM_LANG_CODE_UNKNOWN;
}
static bool
should_skip (int codepoint)
{
switch (codepoint) {
case JOINER:
case NON_JOINER:
return true;
};
return false;
}
int
varnam_detect_lang(varnam *handle, const char *input)
{
strbuf *word;
utf8_decoder decoder;
int codepoint, language = VARNAM_LANG_CODE_UNKNOWN, prev_language = 0;
if (handle == NULL || input == NULL) {
return VARNAM_LANG_CODE_UNKNOWN;
}
word = get_pooled_string (handle);
strbuf_add (word, input);
if (strbuf_is_blank (word)) {
return VARNAM_LANG_CODE_UNKNOWN;
}
utf8_decode_init (word->buffer, (int) word->length, &decoder);
for (;;)
{
codepoint = utf8_decode_next (&decoder);
if (codepoint == UTF8_END || codepoint == UTF8_ERROR)
break;
if (should_skip(codepoint))
continue;
language = get_language (codepoint);
if (language == VARNAM_LANG_CODE_UNKNOWN)
return VARNAM_LANG_CODE_UNKNOWN;
if (prev_language != 0 && language != prev_language) {
/* Looks like characters from multiple languages are mixed */
return VARNAM_LANG_CODE_UNKNOWN;
}
prev_language = language;
}
return language;
}