Skip to content

Commit

Permalink
Merge pull request #8 from fedecosta/feature/catalan_v3_1
Browse files Browse the repository at this point in the history
Feature/catalan v3 1
  • Loading branch information
fedecosta authored Dec 28, 2023
2 parents 3ad3c34 + 28eabfe commit 64a23ab
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 4 deletions.
Binary file modified data/ca-ba/lexicon.db
Binary file not shown.
Binary file modified data/ca-no/lexicon.db
Binary file not shown.
Binary file modified data/ca-va/lexicon.db
Binary file not shown.
25 changes: 21 additions & 4 deletions gruut/lang.py
Original file line number Diff line number Diff line change
Expand Up @@ -2180,7 +2180,26 @@ def ca_post_process_sentence(
nodes.append(typing.cast(PunctuationWordNode, node))

lang = identify_lang(nodes)


# HACK
# The training corpora include an invalid sequence of phonemes: l ʎ l
# We fix that here; it will be properly solved in the next iteration
phonemes_to_fix = "l ʎ l"
fixed_phonemes = "l l"
for node in nodes:

if node is None:
continue

if isinstance(node, WordNode):
if not (node.text and node.phonemes):
continue
phonemes_text = " ".join(node.phonemes)
if phonemes_to_fix in phonemes_text:
phonemes_text = phonemes_text.replace(phonemes_to_fix, fixed_phonemes)
node.phonemes = phonemes_text.split(" ")
_LOGGER.debug(f"FIX: phoneme sequence '{phonemes_to_fix}' fixed at {node.text}. Fixed transcription: {node.phonemes}")

# Create a list of contiguous word nodes
contiguous_word_nodes = []
for node_1, node_2 in sliding_window(nodes, 2):
Expand Down Expand Up @@ -2309,6 +2328,4 @@ def __call__(
self.phonemizer = SqlitePhonemizer(db_conn=db_conn, **self.phonemizer_args)

assert self.phonemizer is not None
return self.phonemizer(word, role=role, do_transforms=do_transforms)


return self.phonemizer(word, role=role, do_transforms=do_transforms)

0 comments on commit 64a23ab

Please sign in to comment.