-
Notifications
You must be signed in to change notification settings - Fork 37
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #6 from fedecosta/develop
added ce and ba files
- Loading branch information
Showing
16 changed files
with
317 additions
and
0 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
--- | ||
|
||
language: | ||
name: "Balear Catalan" | ||
code: "ca-ba" | ||
phonemes: !env "${config_dir}/phonemes.txt" | ||
keep_stress: true | ||
|
||
lexicon: !env "${config_dir}/lexicon.db" | ||
|
||
g2p: | ||
model: !env "${config_dir}/g2p.fst" | ||
|
||
symbols: | ||
casing: "lower" | ||
number_regex: "^-?\\d+([,.]\\d+)*$" | ||
token_split: "\\s+" | ||
token_join: " " | ||
minor_breaks: | ||
- "," | ||
- ":" | ||
- ";" | ||
- "..." | ||
major_breaks: | ||
- "." | ||
- "?" | ||
- "!" | ||
replace: | ||
"[\\<\\>\\(\\)\\[\\]\"]+": "" | ||
"\\B'": "\"" | ||
"'\\B": "\"" | ||
"’": "'" | ||
"'": "" | ||
"-": "" | ||
"l·l": "l" | ||
punctuations: | ||
- "\"" | ||
- "„" | ||
- "“" | ||
- "”" | ||
- "«" | ||
- "»" | ||
- "," | ||
- ":" | ||
- ";" | ||
- "." | ||
- "?" | ||
- "¿" | ||
- "!" | ||
- "¡" |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# https://en.wikipedia.org/wiki/Catalan_phonology | ||
# Catalan Balearic accent phonemes (currently the same inventory as the Central accent) | ||
|
||
p [p]ala | ||
b [b]ala | ||
t [t]ela | ||
d [d]onar | ||
k [k]ala | ||
ɡ [g]ala | ||
m [m]ala | ||
ɲ a[ny] | ||
β aca[b]a | ||
ð ca[d]a | ||
ɣ ama[g]ar | ||
f [f]als | ||
v a[f]ganès | ||
s [s]ala | ||
z ca[s]a | ||
ʃ [x]oc | ||
ʒ mà[g]ic | ||
tʃ co[tx]e | ||
dʒ me[tg]e | ||
l [l]íquid | ||
ʎ [ll]amp | ||
r ca[rr]o | ||
ɾ ca[r]a | ||
w ve[u]en | ||
uw ca[u]re | ||
j ca[i]re | ||
y [i]a[i]a | ||
n [n]ena | ||
ŋ pi[n]güí | ||
ts po[ts]er | ||
dz do[tz]e | ||
|
||
# Vowels | ||
i r[i]c | ||
e c[e]c | ||
ɛ s[e]c | ||
a s[a]c | ||
ɔ f[o]c | ||
o s[ó]c | ||
u s[u]c | ||
ə [a]mor |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
--- | ||
|
||
language: | ||
name: "Central Catalan" | ||
code: "ca-ce" | ||
phonemes: !env "${config_dir}/phonemes.txt" | ||
keep_stress: true | ||
|
||
lexicon: !env "${config_dir}/lexicon.db" | ||
|
||
g2p: | ||
model: !env "${config_dir}/g2p.fst" | ||
|
||
symbols: | ||
casing: "lower" | ||
number_regex: "^-?\\d+([,.]\\d+)*$" | ||
token_split: "\\s+" | ||
token_join: " " | ||
minor_breaks: | ||
- "," | ||
- ":" | ||
- ";" | ||
- "..." | ||
major_breaks: | ||
- "." | ||
- "?" | ||
- "!" | ||
replace: | ||
"[\\<\\>\\(\\)\\[\\]\"]+": "" | ||
"\\B'": "\"" | ||
"'\\B": "\"" | ||
"’": "'" | ||
"'": "" | ||
"-": "" | ||
"l·l": "l" | ||
punctuations: | ||
- "\"" | ||
- "„" | ||
- "“" | ||
- "”" | ||
- "«" | ||
- "»" | ||
- "," | ||
- ":" | ||
- ";" | ||
- "." | ||
- "?" | ||
- "¿" | ||
- "!" | ||
- "¡" |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# https://en.wikipedia.org/wiki/Catalan_phonology | ||
# Catalan Central accent phonemes | ||
|
||
p [p]ala | ||
b [b]ala | ||
t [t]ela | ||
d [d]onar | ||
k [k]ala | ||
ɡ [g]ala | ||
m [m]ala | ||
ɲ a[ny] | ||
β aca[b]a | ||
ð ca[d]a | ||
ɣ ama[g]ar | ||
f [f]als | ||
v a[f]ganès | ||
s [s]ala | ||
z ca[s]a | ||
ʃ [x]oc | ||
ʒ mà[g]ic | ||
tʃ co[tx]e | ||
dʒ me[tg]e | ||
l [l]íquid | ||
ʎ [ll]amp | ||
r ca[rr]o | ||
ɾ ca[r]a | ||
w ve[u]en | ||
uw ca[u]re | ||
j ca[i]re | ||
y [i]a[i]a | ||
n [n]ena | ||
ŋ pi[n]güí | ||
ts po[ts]er | ||
dz do[tz]e | ||
|
||
# Vowels | ||
i r[i]c | ||
e c[e]c | ||
ɛ s[e]c | ||
a s[a]c | ||
ɔ f[o]c | ||
o s[ó]c | ||
u s[u]c | ||
ə [a]mor |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
ca-ce Catalan |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# gruut Catalan | ||
|
||
Language-specific files for Catalan (ca) in [gruut](https://github.com/rhasspy/gruut) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
0.0.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
"""Catalan language resources""" | ||
import os | ||
import typing | ||
from pathlib import Path | ||
|
||
try: | ||
import importlib.resources | ||
|
||
files = importlib.resources.files | ||
except (ImportError, AttributeError): | ||
# Backport for Python < 3.9 | ||
import importlib_resources # type: ignore | ||
|
||
files = importlib_resources.files | ||
|
||
# Import name of this package; passed to files() below to locate the package. | ||
_PACKAGE = "gruut_lang_ca" | ||
# Package location as a Path, resolved via importlib.resources.files (or the | ||
# importlib_resources backport). typing.cast only informs the type checker. | ||
# NOTE(review): presumes files() yields a real filesystem path - confirm for zipped installs. | ||
_DIR = Path(typing.cast(os.PathLike, files(_PACKAGE))) | ||
|
||
|
||
# Accessor for the module-level _DIR path, which is computed once when this module is imported. | ||
def get_lang_dir() -> Path: | ||
"""Get directory with language resources""" | ||
return _DIR |
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# https://en.wikipedia.org/wiki/Catalan_phonology | ||
# Catalan Central accent phonemes | ||
|
||
p [p]ala | ||
b [b]ala | ||
t [t]ela | ||
d [d]onar | ||
k [k]ala | ||
ɡ [g]ala | ||
m [m]ala | ||
ɲ a[ny] | ||
β aca[b]a | ||
ð ca[d]a | ||
ɣ ama[g]ar | ||
f [f]als | ||
v a[f]ganès | ||
s [s]ala | ||
z ca[s]a | ||
ʃ [x]oc | ||
ʒ mà[g]ic | ||
tʃ co[tx]e | ||
dʒ me[tg]e | ||
l [l]íquid | ||
ʎ [ll]amp | ||
r ca[rr]o | ||
ɾ ca[r]a | ||
w ve[u]en | ||
uw ca[u]re | ||
j ca[i]re | ||
y [i]a[i]a | ||
n [n]ena | ||
ŋ pi[n]güí | ||
ts po[ts]er | ||
dz do[tz]e | ||
|
||
# Vowels | ||
i r[i]c | ||
e c[e]c | ||
ɛ s[e]c | ||
a s[a]c | ||
ɔ f[o]c | ||
o s[ó]c | ||
u s[u]c | ||
ə [a]mor |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
"""Setup file for gruut_lang_ca""" | ||
from pathlib import Path | ||
|
||
import setuptools | ||
|
||
# Name of the package; reused below as the distribution name and the package_data key. | ||
module_name = "gruut_lang_ca" | ||
|
||
# Directory containing this setup script, and the package directory inside it. | ||
this_dir = Path(__file__).parent | ||
module_dir = this_dir / module_name | ||
|
||
# ----------------------------------------------------------------------------- | ||
|
||
# Load README in as long description | ||
# Stays an empty string when README.md is not present next to this script. | ||
long_description: str = "" | ||
readme_path = this_dir / "README.md" | ||
if readme_path.is_file(): | ||
long_description = readme_path.read_text(encoding="utf-8") | ||
|
||
# The version is the whitespace-stripped content of the VERSION file in the package directory. | ||
# Unlike the README there is no existence check, so a missing VERSION file makes open() raise. | ||
version_path = module_dir / "VERSION" | ||
with open(version_path, "r", encoding="utf-8") as version_file: | ||
version = version_file.read().strip() | ||
|
||
|
||
# Extra package data files | ||
# Optional resources: each candidate path is relative to module_dir and is only | ||
# added to extra_files when that file actually exists there. | ||
extra_files = [] | ||
maybe_extra_files = ["pos/model.crf", "pos/postagger.model"] | ||
for maybe_extra_str in maybe_extra_files: | ||
extra_path = module_dir / maybe_extra_str | ||
if extra_path.is_file(): | ||
extra_files.append(maybe_extra_str) | ||
|
||
# ----------------------------------------------------------------------------- | ||
|
||
# Register the distribution: metadata comes from the values computed above | ||
# (module_name, version, long_description, extra_files). | ||
setuptools.setup( | ||
name=module_name, | ||
description="Catalan language files for gruut tokenizer/phonemizer", | ||
version=version, | ||
author="Michael Hansen", | ||
# NOTE(review): the address below looks like a redaction placeholder - confirm the real value. | ||
author_email="[email protected]", | ||
url="https://github.com/rhasspy/gruut", | ||
packages=setuptools.find_packages(), | ||
# Data files shipped inside the package: the fixed list plus any optional files found above. | ||
# NOTE(review): phonemes.txt and the g2p.fst / per-accent (ca-ba, ca-ce) resources that the | ||
# language configs reference are not listed here - confirm whether they must be packaged. | ||
package_data={ | ||
module_name: [ | ||
"VERSION", | ||
"lexicon.db", | ||
"g2p/model.crf", | ||
#"espeak/lexicon.db", | ||
#"espeak/g2p/model.crf", | ||
] | ||
+ extra_files | ||
}, | ||
classifiers=[ | ||
"Programming Language :: Python :: 3", | ||
"License :: OSI Approved :: MIT License", | ||
], | ||
long_description=long_description, | ||
long_description_content_type="text/markdown", | ||
) |