Skip to content

Commit

Permalink
Merge pull request #6 from fedecosta/develop
Browse files Browse the repository at this point in the history
added ce and ba files
  • Loading branch information
fedecosta authored Oct 20, 2023
2 parents b6ffdd0 + 09be7b0 commit 40c5eee
Show file tree
Hide file tree
Showing 16 changed files with 317 additions and 0 deletions.
Binary file added data/ca-ba/g2p/model.crf
Binary file not shown.
50 changes: 50 additions & 0 deletions data/ca-ba/language.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
---

language:
name: "Balear Catalan"
code: "ca-ba"
phonemes: !env "${config_dir}/phonemes.txt"
keep_stress: true

lexicon: !env "${config_dir}/lexicon.db"

g2p:
model: !env "${config_dir}/g2p.fst"

symbols:
casing: "lower"
number_regex: "^-?\\d+([,.]\\d+)*$"
token_split: "\\s+"
token_join: " "
minor_breaks:
- ","
- ":"
- ";"
- "..."
major_breaks:
- "."
- "?"
- "!"
replace:
"[\\<\\>\\(\\)\\[\\]\"]+": ""
"\\B'": "\""
"'\\B": "\""
"’": "'"
"'": ""
"-": ""
"l·l": "l"
punctuations:
- "\""
- "„"
- "“"
- "”"
- "«"
- "»"
- ","
- ":"
- ";"
- "."
- "?"
- "¿"
- "!"
- "¡"
Binary file added data/ca-ba/lexicon.db
Binary file not shown.
44 changes: 44 additions & 0 deletions data/ca-ba/phonemes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# https://en.wikipedia.org/wiki/Catalan_phonology
# Catalan Balearic accent phonemes

p [p]ala
b [b]ala
t [t]ela
d [d]onar
k [k]ala
ɡ [g]ala
m [m]ala
ɲ a[ny]
β aca[b]a
ð ca[d]a
ɣ ama[g]ar
f [f]als
v a[f]ganès
s [s]ala
z ca[s]a
ʃ [x]oc
ʒ mà[g]ic
tʃ co[tx]e
dʒ me[tg]e
l [l]íquid
ʎ [ll]amp
r ca[rr]o
ɾ ca[r]a
w ve[u]en
uw ca[u]re
j ca[i]re
y [i]a[i]a
n [n]ena
ŋ pi[n]güí
ts po[ts]er
dz do[tz]e

# Vowels
i r[i]c
e c[e]c
ɛ s[e]c
a s[a]c
ɔ f[o]c
o s[ó]c
u s[u]c
ə [a]mor
Binary file added data/ca-ce/g2p/model.crf
Binary file not shown.
50 changes: 50 additions & 0 deletions data/ca-ce/language.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
---

language:
name: "Central Catalan"
code: "ca-ce"
phonemes: !env "${config_dir}/phonemes.txt"
keep_stress: true

lexicon: !env "${config_dir}/lexicon.db"

g2p:
model: !env "${config_dir}/g2p.fst"

symbols:
casing: "lower"
number_regex: "^-?\\d+([,.]\\d+)*$"
token_split: "\\s+"
token_join: " "
minor_breaks:
- ","
- ":"
- ";"
- "..."
major_breaks:
- "."
- "?"
- "!"
replace:
"[\\<\\>\\(\\)\\[\\]\"]+": ""
"\\B'": "\""
"'\\B": "\""
"’": "'"
"'": ""
"-": ""
"l·l": "l"
punctuations:
- "\""
- "„"
- "“"
- "”"
- "«"
- "»"
- ","
- ":"
- ";"
- "."
- "?"
- "¿"
- "!"
- "¡"
Binary file added data/ca-ce/lexicon.db
Binary file not shown.
44 changes: 44 additions & 0 deletions data/ca-ce/phonemes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# https://en.wikipedia.org/wiki/Catalan_phonology
# Catalan Central accent phonemes

p [p]ala
b [b]ala
t [t]ela
d [d]onar
k [k]ala
ɡ [g]ala
m [m]ala
ɲ a[ny]
β aca[b]a
ð ca[d]a
ɣ ama[g]ar
f [f]als
v a[f]ganès
s [s]ala
z ca[s]a
ʃ [x]oc
ʒ mà[g]ic
tʃ co[tx]e
dʒ me[tg]e
l [l]íquid
ʎ [ll]amp
r ca[rr]o
ɾ ca[r]a
w ve[u]en
uw ca[u]re
j ca[i]re
y [i]a[i]a
n [n]ena
ŋ pi[n]güí
ts po[ts]er
dz do[tz]e

# Vowels
i r[i]c
e c[e]c
ɛ s[e]c
a s[a]c
ɔ f[o]c
o s[ó]c
u s[u]c
ə [a]mor
1 change: 1 addition & 0 deletions gruut-lang-ca/LANGUAGE
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ca-ce Catalan
3 changes: 3 additions & 0 deletions gruut-lang-ca/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# gruut Catalan

Language-specific files for Catalan (ca) in [gruut](https://github.com/rhasspy/gruut)
1 change: 1 addition & 0 deletions gruut-lang-ca/gruut_lang_ca/VERSION
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0.0.0
22 changes: 22 additions & 0 deletions gruut-lang-ca/gruut_lang_ca/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"""Catalan language resources"""
import os
import typing
from pathlib import Path

# Bind `files` to a resource-locating helper: the standard-library one when it
# is available, otherwise the `importlib_resources` backport.
try:
    import importlib.resources

    # This attribute access raises AttributeError when importlib.resources has
    # no `files`, which routes execution to the fallback branch below.
    files = importlib.resources.files
except (ImportError, AttributeError):
    # Backport for Python < 3.9
    import importlib_resources  # type: ignore

    files = importlib_resources.files

# Name of the package whose resources are looked up via `files`.
_PACKAGE = "gruut_lang_ca"
# Package resource location wrapped in a Path; the cast only tells the type
# checker to treat the object returned by `files` as path-like.
_DIR = Path(typing.cast(os.PathLike, files(_PACKAGE)))


def get_lang_dir() -> Path:
    """Return the directory that contains this package's language resources.

    The result is the module-level ``_DIR`` path, which is computed once when
    the module is imported.
    """
    return _DIR
Binary file added gruut-lang-ca/gruut_lang_ca/g2p/model.crf
Binary file not shown.
Binary file added gruut-lang-ca/gruut_lang_ca/lexicon.db
Binary file not shown.
44 changes: 44 additions & 0 deletions gruut-lang-ca/gruut_lang_ca/phonemes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# https://en.wikipedia.org/wiki/Catalan_phonology
# Catalan Central accent phonemes

p [p]ala
b [b]ala
t [t]ela
d [d]onar
k [k]ala
ɡ [g]ala
m [m]ala
ɲ a[ny]
β aca[b]a
ð ca[d]a
ɣ ama[g]ar
f [f]als
v a[f]ganès
s [s]ala
z ca[s]a
ʃ [x]oc
ʒ mà[g]ic
tʃ co[tx]e
dʒ me[tg]e
l [l]íquid
ʎ [ll]amp
r ca[rr]o
ɾ ca[r]a
w ve[u]en
uw ca[u]re
j ca[i]re
y [i]a[i]a
n [n]ena
ŋ pi[n]güí
ts po[ts]er
dz do[tz]e

# Vowels
i r[i]c
e c[e]c
ɛ s[e]c
a s[a]c
ɔ f[o]c
o s[ó]c
u s[u]c
ə [a]mor
58 changes: 58 additions & 0 deletions gruut-lang-ca/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""Setup file for gruut_lang_ca"""
from pathlib import Path

import setuptools

module_name = "gruut_lang_ca"

this_dir = Path(__file__).parent
module_dir = this_dir / module_name

# -----------------------------------------------------------------------------

# The README next to this script, when present, becomes the long description;
# otherwise the long description stays empty.
readme_path = this_dir / "README.md"
long_description: str = (
    readme_path.read_text(encoding="utf-8") if readme_path.is_file() else ""
)

# The package version is stored in a VERSION file inside the module directory.
version_path = module_dir / "VERSION"
version = version_path.read_text(encoding="utf-8").strip()

# Optional data files: each candidate is packaged only if it exists on disk.
maybe_extra_files = ["pos/model.crf", "pos/postagger.model"]
extra_files = [
    candidate
    for candidate in maybe_extra_files
    if (module_dir / candidate).is_file()
]

# Files that are always packaged, followed by whichever optional ones exist.
# The espeak variants ("espeak/lexicon.db", "espeak/g2p/model.crf") are
# currently left out of the list.
package_files = ["VERSION", "lexicon.db", "g2p/model.crf", *extra_files]

# -----------------------------------------------------------------------------

setuptools.setup(
    name=module_name,
    version=version,
    description="Catalan language files for gruut tokenizer/phonemizer",
    long_description=long_description,
    long_description_content_type="text/markdown",
    author="Michael Hansen",
    author_email="[email protected]",
    url="https://github.com/rhasspy/gruut",
    packages=setuptools.find_packages(),
    package_data={module_name: package_files},
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
    ],
)

0 comments on commit 40c5eee

Please sign in to comment.