diff --git a/.gitignore b/.gitignore index 8d465b9..f979723 100644 --- a/.gitignore +++ b/.gitignore @@ -17,4 +17,4 @@ lexicon*.txt words.txt # TODO uncomment -#/data/ \ No newline at end of file +#/data/ diff --git a/gruut/__init__.py b/gruut/__init__.py index 721125e..9b76e06 100644 --- a/gruut/__init__.py +++ b/gruut/__init__.py @@ -102,4 +102,3 @@ def is_language_supported(lang: str) -> bool: def get_supported_languages() -> typing.Set[str]: """Set of supported gruut languages""" return set(KNOWN_LANGS) - diff --git a/gruut/__main__.py b/gruut/__main__.py index ab2459b..5d907b5 100755 --- a/gruut/__main__.py +++ b/gruut/__main__.py @@ -8,6 +8,7 @@ import sys from enum import Enum from pathlib import Path + import jsonlines from gruut.const import KNOWN_LANGS @@ -39,7 +40,6 @@ class StdinFormat(str, Enum): def main(): - """Main entry point""" if len(sys.argv) < 2: # Print known languages and exit @@ -64,29 +64,16 @@ def main(): args.model_prefix = "espeak" # ------------------------------------------------------------------------- - text_processor = TextProcessor( default_lang=args.language, model_prefix=args.model_prefix, ) if args.debug: - _LOGGER.debug(f"settings: {text_processor.settings}") - - # lines definition - if args.input_csv_path: - with open(args.input_csv_path) as csvfile: - reader = csv.reader(csvfile, delimiter = args.input_csv_delimiter) - lines_ids = [row[0] for row in reader] - csvfile.close() - with open(args.input_csv_path) as csvfile: - reader = csv.reader(csvfile, delimiter = args.input_csv_delimiter) - lines = [row[1] for row in reader] - csvfile.close() - - elif args.text: + _LOGGER.debug(text_processor.settings) + + if args.text: # Use arguments lines = args.text - else: # Use stdin stdin_format = StdinFormat.LINES @@ -105,59 +92,7 @@ def main(): if os.isatty(sys.stdin.fileno()): print("Reading input from stdin...", file=sys.stderr) - # writer, input_text an output_sentences definition - if args.output_csv_path: - - # Clean output file if exists - with open(args.output_csv_path, 'w') as outcsvfile: - outcsvfile.close() - - def input_text(lines): - for line_num, line in enumerate(lines): - text = line - text_id = lines_ids[line_num] - yield (text, text_id) - - def output_sentences(sentences, writer, text_data=None): - for sentence in sentences: - sentence_dict = dataclasses.asdict(sentence) - writer.write(sentence_dict) - - def output_transcription( - sentences, - writer, - text_data=None, - word_begin_sep = '[', - word_end_sep = ']', - g2p_word_begin_sep = '{', - g2p_word_end_sep = '}', - ): - - transcription = "" - for sentence in sentences: - sentence_dict = dataclasses.asdict(sentence) - for word_dict in sentence_dict["words"]: - word_phonemes = word_dict["phonemes"] - in_lexicon = text_processor._is_word_in_lexicon( - word_dict["text"], - text_processor.get_settings(lang = args.language), - ) - if in_lexicon == False: - transcription = f"{transcription.strip()} {' '.join([g2p_word_begin_sep] + word_phonemes + [g2p_word_end_sep]).strip()}".strip() - else: - transcription = f"{transcription.strip()} {' '.join([word_begin_sep] + word_phonemes + [word_end_sep]).strip()}".strip() - - row_to_write = f"{text_data}{args.output_csv_delimiter}{transcription}" - row_to_write = [text_data, transcription] - writer.writerow(row_to_write) - - def output_json(sentences, writer, text_data=None): - import json - for sentence in sentences: - sentence_dict = dataclasses.asdict(sentence) - print(json.dumps(sentence_dict, indent=4)) - - elif args.csv: + if args.csv: writer = 
csv.writer(sys.stdout, delimiter=args.csv_delimiter) def input_text(lines): @@ -184,7 +119,7 @@ def output_sentences(sentences, writer, text_data=None): row.append(args.phoneme_word_separator.join(phonemes)) writer.writerow(row) - + else: writer = jsonlines.Writer(sys.stdout, flush=True) @@ -196,46 +131,8 @@ def output_sentences(sentences, writer, text_data=None): for sentence in sentences: sentence_dict = dataclasses.asdict(sentence) writer.write(sentence_dict) - - # TEST - def output_transcription( - sentences, - writer, - text_data=None, - word_begin_sep = '[', - word_end_sep = ']', - g2p_word_begin_sep = '{', - g2p_word_end_sep = '}', - ): - - transcription = "" - for sentence in sentences: - sentence_dict = dataclasses.asdict(sentence) - for word_dict in sentence_dict["words"]: - word_phonemes = word_dict["phonemes"] - in_lexicon = text_processor._is_word_in_lexicon( - word_dict["text"], - text_processor.get_settings(lang = args.language), - ) - if in_lexicon == False: - transcription = f"{transcription.strip()} {' '.join([g2p_word_begin_sep] + word_phonemes + [g2p_word_end_sep]).strip()}".strip() - else: - transcription = f"{transcription.strip()} {' '.join([word_begin_sep] + word_phonemes + [word_end_sep]).strip()}".strip() - - writer.write(transcription) - - def output_json(sentences, writer, text_data=None): - import json - for sentence in sentences: - sentence_dict = dataclasses.asdict(sentence) - print(json.dumps(sentence_dict, indent=4)) - - # Transcription output + for text, text_data in input_text(lines): - - # I think lowercase is not applied before! - text = text.lower() - try: graph, root = text_processor( text, @@ -268,32 +165,9 @@ def output_json(sentences, writer, text_data=None): punctuations=(not args.no_punctuation), ) ) - - if args.output_csv_path: - with open(args.output_csv_path, 'a') as outcsvfile: - writer = csv.writer(outcsvfile, delimiter = args.output_csv_delimiter) - output_transcription( - sentences, - writer, - text_data, - word_begin_sep = args.word_begin_sep, - word_end_sep = args.word_end_sep, - g2p_word_begin_sep = args.g2p_word_begin_sep, - g2p_word_end_sep = args.g2p_word_end_sep, - ) - outcsvfile.close() - else: - output_transcription( - sentences, - writer, - text_data, - word_begin_sep = args.word_begin_sep, - word_end_sep = args.word_end_sep, - g2p_word_begin_sep = args.g2p_word_begin_sep, - g2p_word_end_sep = args.g2p_word_end_sep, - ) - - + + output_sentences(sentences, writer, text_data) + except Exception as e: _LOGGER.exception(text) @@ -315,9 +189,7 @@ class TextProcessingError(Exception): def get_args() -> argparse.Namespace: """Parse command-line arguments""" - parser = argparse.ArgumentParser(prog="gruut") - parser.add_argument( "-l", "--language", @@ -326,11 +198,9 @@ def get_args() -> argparse.Namespace: ) parser.add_argument("text", nargs="*", help="Text to tokenize (default: stdin)") - parser.add_argument( "--ssml", action="store_true", help="Input text is SSML", ) - parser.add_argument( "--stdin-format", choices=[str(v.value) for v in StdinFormat], @@ -344,61 +214,50 @@ def get_args() -> argparse.Namespace: action="store_true", help="Disable number replacement (1 -> one)", ) - parser.add_argument( "--no-currency", action="store_true", help="Disable currency replacement ($1 -> one dollar)", ) - parser.add_argument( "--no-dates", action="store_true", help="Disable date replacement (4/1/2021 -> April first twenty twenty one)", ) - parser.add_argument( "--no-times", action="store_true", help="Disable time replacement (4:01pm -> four oh 
one P M)", ) - parser.add_argument( "--no-pos", action="store_true", help="Disable part of speech tagger", ) - parser.add_argument( "--no-lexicon", action="store_true", help="Disable phoneme lexicon database", ) - parser.add_argument( "--no-g2p", action="store_true", help="Disable grapheme to phoneme guesser", ) - parser.add_argument( "--no-punctuation", action="store_true", help="Don't output punctuations (quotes, brackets, etc.)", ) - parser.add_argument( "--no-major-breaks", action="store_true", help="Don't output major breaks (periods, question marks, etc.)", ) - parser.add_argument( "--no-minor-breaks", action="store_true", help="Don't output minor breaks (commas, semicolons, etc.)", ) - parser.add_argument( "--no-post-process", action="store_true", help="Disable post-processing of sentences (e.g., liasons)", ) - parser.add_argument( "--no-fail", action="store_true", help="Skip lines that result in errors", ) @@ -409,80 +268,36 @@ def get_args() -> argparse.Namespace: action="store_true", help="Use eSpeak versions of lexicons (overrides --model-prefix)", ) - parser.add_argument( "--model-prefix", help="Sub-directory of gruut language data files with different lexicon, etc. (e.g., espeak)", ) - parser.add_argument( "--csv", action="store_true", help="Input text is id|text (see --csv-delimiter)" ) - - parser.add_argument( - "--input-csv-path", help="Input csv path", - ) - parser.add_argument( - "--output-csv-path", help="Output csv path", + "--csv-delimiter", default="|", help="Delimiter for input text with --csv" ) - - parser.add_argument( - "--input-csv-delimiter", default="|", help="Delimiter for input csv" - ) - - parser.add_argument( - "--output-csv-delimiter", default="|", help="Delimiter for output csv" - ) - parser.add_argument( "--sentence-separator", default=". 
", help="String used to separate sentences in CSV output", ) - parser.add_argument( "--word-separator", default=" ", help="String used to separate words in CSV output", ) - parser.add_argument( "--phoneme-word-separator", default="#", help="String used to separate phonemes in CSV output", ) - parser.add_argument( "--phoneme-separator", default=" ", help="String used to separate words in CSV output phonemes", ) - - parser.add_argument( - "--word_begin_sep", - default="[", - help="String used to indicate the begining of words transcribed using the lexicon.", - ) - - parser.add_argument( - "--word_end_sep", - default="]", - help="String used to indicate the ending of words transcribed using the lexicon.", - ) - - parser.add_argument( - "--g2p_word_begin_sep", - default="{", - help="String used to indicate the begining of words transcribed using the g2p model.", - ) - - parser.add_argument( - "--g2p_word_end_sep", - default="}", - help="String used to indicate the ending of words transcribed using the g2p model.", - ) - parser.add_argument( "--debug", action="store_true", help="Print DEBUG messages to console" ) @@ -494,4 +309,4 @@ def get_args() -> argparse.Namespace: if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/gruut/const.py b/gruut/const.py index 42499a6..2199151 100644 --- a/gruut/const.py +++ b/gruut/const.py @@ -16,7 +16,9 @@ # alias -> full language name LANG_ALIASES = { "ar": "ar", - "ca": "ca", + "ca": "ca-ce", + "ca-ce": "ca-ce", + "ca-ba": "ca-ba", "cs": "cs-cz", "de": "de-de", "en": "en-us", @@ -41,6 +43,7 @@ # Languages that are expected to have a model directory KNOWN_LANGS = set(itertools.chain(ENGLISH_LANGS, LANG_ALIASES.values())) + try: # Python >= 3.7 REGEX_PATTERN = re.Pattern # type: ignore diff --git a/gruut/lang.py b/gruut/lang.py index 8cac967..aee9fda 100644 --- a/gruut/lang.py +++ b/gruut/lang.py @@ -31,7 +31,6 @@ def get_settings( **settings_args, ) -> TextProcessorSettings: """Get settings for a specific language""" - model_prefix = model_prefix or "" # Resolve language @@ -116,7 +115,7 @@ def get_settings( # Arabic return get_ar_settings(lang_dir, **settings_args) - if lang_only == "ca": + if lang_only in {"ca-ce", "ca-ba"}: # Catalan return get_ca_settings(lang_dir, **settings_args) @@ -176,10 +175,6 @@ def get_settings( # Chinese return get_zh_settings(lang_dir, **settings_args) - if lang_only == "ca": - # Catalan - return get_ca_settings(lang_dir, **settings_args) - # Default settings only return TextProcessorSettings(lang=lang, **settings_args) @@ -837,7 +832,1395 @@ def get_zh_settings(lang_dir=None, **settings_args) -> TextProcessorSettings: # ----------------------------------------------------------------------------- +# Catalan (ca, Catalan) +# ----------------------------------------------------------------------------- + +# Pre-Process functions and classes + +from collections import deque + +def vocal(carac: str) -> bool: + vocal_chars = ['a', 'à', 'e', 'é', 'è', 'i', 'í', 'ï', 'o', 'ó', 'ò', 'u', 'ü', 'ú'] + return carac in vocal_chars + +def acaba_en_vocal(prefix: str) -> bool: + darrer = prefix[-1] + return vocal(darrer) + +def post_prefix_ok(resta: str) -> bool: + + mida = len(resta) + primer = resta[0] + segon = '\0' + if mida > 1: + segon = resta[1] + + if primer in "iu": + return True + elif primer in "rs": + if mida > 1 and vocal(segon): + return True + return False + +def nuclitica(carac: str) -> bool: + nuclitic_chars = ['a', 'à', 'e', 'é', 'è', 'í', 'ï', 'o', 'ó', 'ò', 'ú'] + return carac in 
nuclitic_chars
+
+def gicf_suf(mot: str, pos: int, mots_voc_ir: typing.List[str]) -> bool:
+    """True if the "i" at position pos starts a suffix whose "i" is a syllable nucleus"""
+
+    # Nominal suffixes: the suffix must make up the whole rest of the word
+    for suf in ("isme", "ista", "ismes", "istes"):
+        if mot[pos:] == suf:
+            return True
+
+    # Infinitive ending "ir": nucleus unless the word is an exception (e.g. "cuir", "vair")
+    if mot.find("ir") == pos and len(mot) - pos == 2:
+        return mot not in mots_voc_ir
+
+    # Gerund, future and conditional endings
+    for suf in ("int", "iré", "iràs", "irà", "irem", "ireu", "iran",
+                "iria", "iries", "iríem", "iríeu", "irien"):
+        if mot.find(suf) == pos and len(mot) - pos == len(suf):
+            return True
+
+    return False
+
+
+class Sillaba:
+
+    def __init__(self, sil: str):
+
+        self.text_ = sil
+        self.tonica_ = False
+        self.grafnuc_ = -1
+        self.fonnuc_ = -1
+        self.fons_ = deque()
+
+    def grafnuc(self, nuc: int):
+        self.grafnuc_ = nuc
+
+    def get_grafnuc(self) -> int:
+        return self.grafnuc_
+
+    def get_text(self) -> str:
+        return self.text_
+
+    def get_text_at_index(self, idx: int) -> str:
+        return self.text_[idx]
+
+    def sizetext(self) -> int:
+        return len(self.text_)
+
+    def tonica(self) -> None:
+        self.tonica_ = True
+
+    def asktonica(self) -> bool:
+        return self.tonica_
+
+    def es_sil_tonica(self) -> str:
+
+        if self.tonica_:
+            return "sí"
+        else:
+            return "no"
+
+    def numfons(self) -> int:
+        return len(self.fons_)
+
+    def allofon(self, fonidx: int) -> str:
+        return self.fons_[fonidx]
+
+    def allofons(self) -> deque:
+        return self.fons_
+
+    def push_back(self, fon: str):
+        self.fons_.append(fon)
+
+    def push_front(self, fon: str):
+        self.fons_.appendleft(fon)
+
+    def pop_front(self):
+        self.fons_.popleft()
+
+    def pop_back(self):
+        self.fons_.pop()
+
+    def empty(self) -> bool:
+        return len(self.fons_) == 0
+
+    def fonnuc(self, fnuc: int):
+        self.fonnuc_ = fnuc
+
+    def get_fonnuc(self) -> int:
+        return self.fonnuc_
+
+
+class Part:
+
+    def __init__(self, tros: str):
+        self.text_ = tros
+        self.transsil_ = deque()  # A deque of Sillaba instances
+
+    def push_back(self, sil: Sillaba):
+        self.transsil_.append(sil)
+
+    def pop_back(self):
+        self.transsil_.pop()
+
+    def pop_front(self):
+        self.transsil_.popleft()
+
+    def empty(self) -> bool:
+        return len(self.transsil_) == 0
+
+    def size(self) -> int:
+        return len(self.transsil_)
+
+    def tonica(self, silidx: int) -> bool:
+        # self.transsil_[silidx] is a Sillaba instance, which has the attribute tonica_
+        return self.transsil_[silidx].tonica_
+
+    def idxgrafnucli(self, silidx: int) -> int:
+        # self.transsil_[silidx] is a Sillaba instance, which has the attribute grafnuc_
+        return self.transsil_[silidx].grafnuc_
+
+    def grafnucli(self, silidx:
int) -> str: + # self.transsil_[silidx] is an Sillaba instance, which has an attributes text_ and grafnuc_ + return self.transsil_[silidx].text_[self.transsil_[silidx].grafnuc_] + + def sil(self, silnum: int) -> Sillaba: + return self.transsil_[silnum] + + def sils(self) -> deque: + return self.transsil_ + + def text(self) -> str: + return self.text_ + + def textinici(self, silindex: int, charindex: int) -> str: + + # Gives the text of the previous syllables, and from the syllable silindex to charindex not included + + mot = "" + for i in range(silindex): + mot += self.transsil_[i].text_ + if charindex: + mot += self.transsil_[silindex].text_[:charindex] + return mot + + def textfinal(self, silindex: int, charindex: int) -> str: + + # Gives the text starting from the syllable silindex and the character charindex (included) and up to the end of the word + + mot = self.transsil_[silindex].text_[charindex:] + for i in range(silindex + 1, len(self.transsil_)): + mot += self.transsil_[i].text_ + return mot + + def textsilini(self, silindex: int, charindex: int) -> str: + + # gives the text of the syllable silindex, from the beginning to the character charindex not included + return self.transsil_[silindex].text_[:charindex] + + def textsilfinal(self, silindex: int, charindex: int) -> str: + + # Gives the text of the syllable silindex, from charindex inclusive to the end + return self.transsil_[silindex].text_[charindex:] + + def charidxsilini(self, silindex: int) -> int: + + car = self.transsil_[silindex].text_[0] + if car == "'" or car == '-': + return 1 + else: + return 0 + + def charidxsilfinal(self, silindex: int) -> int: + + siltxt = self.transsil_[silindex].text_ + car = siltxt[-1] + if car == "'" or car == '-': + return len(siltxt) - 2 + else: + return len(siltxt) - 1 + + +class MotNuclis: + + def __init__(self, mot: str, es_adverbi: bool): + + self.adverbi_ = es_adverbi + self.el_mot = mot + self.pos_nuclis = [] + + self.load_insep() + + + def load_insep(self): + + # Set self.insep_ and self.mots_voc_ir_ + + self.insep_ = [ + 'bh', 'bl', 'br', 'ch', 'cl', 'cr', 'dh', 'dj', 'dr', 'fh', 'fh', 'fl', 'fr', \ + 'gh', 'gl', 'gr', 'gu', 'gü', 'jh', 'kh', 'kl', 'kr', 'lh', 'll', 'mh', \ + 'nh', 'ny', 'ph', 'pl', 'pr', 'qu', 'qü', 'rh', 'sh', 'th', 'th', 'tr', \ + 'vh', 'wh', 'xh', 'xh', 'yh', 'zh', + ] + self.mots_voc_ir_ = ["cuir", "vair"] + + + def troba_nuclis_mot(self): + + mida = len(self.el_mot) + adjectiu = "" + + if self.adverbi_: + adjectiu = self.el_mot[0:mida - 4] + self.el_mot = adjectiu + mida = len(self.el_mot) + + gr = 0 + while gr < mida: + + car = self.el_mot[gr] + + if nuclitica(car): + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + elif car == 'i': + # gicf o sufix + if gicf_suf(self.el_mot, gr, self.mots_voc_ir_): + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + else: + abans = self.el_mot[0:gr] + premida = len(abans) + + if (premida == 0) or (premida == 1 and abans == "h"): + # casos iode o hiena, i, hi + if gr == mida - 1: + # i, hi + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + elif vocal(self.el_mot[gr+1]): + # hiena iode + gr = gr + 1 + continue + # hissar, ira + else: + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + elif (premida == 1) and (abans == "u"): + + if gr == mida - 1 or self.el_mot[gr + 1] == 'x': + gr = gr + 1 + continue + else: + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + elif (premida == 2) and (abans == "hu"): + + if gr == mida - 1: + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + if self.el_mot[gr + 
1] == 'x': + gr = gr + 1 + continue + else: + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + elif self.el_mot[gr - 1] == 'u': + # tres vocals seguides vocal+u+i, la u es consonant i la "i" es nucli + if (premida > 1) and vocal(self.el_mot[gr - 2]): + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + elif (premida > 1) and (self.el_mot[gr - 2] == 'q' or self.el_mot[gr - 2] == 'g'): + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + else: + # ui tot sol + gr = gr + 1 + continue + + elif self.el_mot[gr - 1] == 'ü': + + if (premida > 1) and (self.el_mot[gr - 2] == 'q' or self.el_mot[gr - 2] == 'g'): + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + else: + # üi no precedit de g,q + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + elif vocal(self.el_mot[gr - 1]): + # vocal + i, la i no es nucli + gr = gr + 1 + continue + + else: + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + elif car == 'u': + + abans = self.el_mot[0:gr] + premida = len(abans) + + if (premida == 0) or (premida == 1 and abans == "h"): + # casos uadi o hu+vocal, u, hu + if gr == mida - 1: + # u, hu + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + elif self.el_mot == "ui" or self.el_mot == "uix": + # potser se n'han d'afegir mes + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + elif (pos := self.el_mot.find("ix")) != -1 and pos == gr + 1: + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + elif vocal(self.el_mot[gr+1]): + # uadi hu+vocal + gr = gr + 1 + continue + else: + # huns, una + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + elif (premida == 1) and (abans == "i"): + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + elif self.el_mot[gr - 1] == 'i': + # tres vocals seguides vocal+i+u, la i es consonant i la "u" es nucli + if premida > 2: + boci = self.el_mot[gr - 3 : gr - 1] + + if boci == "gu" or boci == "qu": + gr = gr + 1 + continue + + elif vocal(self.el_mot[gr - 2]): + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + else: + gr = gr + 1 + continue + + elif premida == 2: + if vocal(self.el_mot[gr - 2]): + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + else: + gr = gr + 1 + continue + else: + gr = gr + 1 + continue + + elif self.el_mot[gr - 1] == 'g' or self.el_mot[gr - 1] == 'q': + if gr == mida - 1: + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + elif vocal(self.el_mot[gr + 1]): + gr = gr + 1 + continue + + else: + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + elif self.el_mot[gr - 1] == 'ü': + if (premida > 1) and (self.el_mot[gr - 2] == 'q' or self.el_mot[gr - 2] == 'g'): + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + else: + # üu no precedit de g,q + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + elif vocal(self.el_mot[gr - 1]): + # vocal + u, la u no es nucli + gr = gr + 1 + continue + + else: + # tancara l'else de quan no es sufix + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + elif car == 'ü': + + pos = 0 + + if (pos := self.el_mot.find("argü")) != -1: + if pos + 3 == gr: + self.pos_nuclis.append(gr) + self.pos_nuclis.append(gr + 1) + gr += 1 + gr = gr + 1 + continue + else: + gr = gr + 1 + continue + elif gr > 0: + if self.el_mot[gr - 1] == 'g' or self.el_mot[gr - 1] == 'q': + gr = gr + 1 + continue + else: + self.pos_nuclis.append(gr) + gr = gr + 1 + continue + + else: + gr = gr + 1 + continue + + if self.adverbi_: + self.el_mot += "ment" + mida = len(self.el_mot) + self.pos_nuclis.append(mida - 3) + + + def inseparable(self, tros: str) -> bool: + return tros in self.insep_ + + + 
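+    # A quick usage sketch of how the two steps above combine (values traced by
+    # hand from this code, so treat as illustrative):
+    #
+    #     mn = MotNuclis("paraula", es_adverbi=False)
+    #     mn.troba_nuclis_mot()
+    #     # mn.pos_nuclis == [1, 3, 6]: the three "a"s are nuclei; the "u"
+    #     # after a vowel is a glide, so it is skipped
+    #     sils, nuclis = mn.separa_sillabes([], [])
+    #     # sils == ["pa", "rau", "la"], nuclis == [1, 1, 1]
+    #     # (nucleus index within each syllable)
+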
def separa_sillabes(self, vec_sil: typing.List[str], els_nuclis: typing.List[int]) -> typing.Tuple[typing.List[str], typing.List[int]]: + + fronteres = [] + + if len(self.pos_nuclis) == 1: + + vec_sil.append(self.el_mot) + els_nuclis.append(self.pos_nuclis[0]) + + return vec_sil, els_nuclis + + # Set the fronteres vector + for i in range(len(self.pos_nuclis) - 1): + + longi = self.pos_nuclis[i + 1] - self.pos_nuclis[i] - 1 + tros = self.el_mot[self.pos_nuclis[i] + 1: self.pos_nuclis[i] + 1 + longi] + + # vocals contigues + if longi == 0: + fronteres.append(self.pos_nuclis[i]) + + elif longi == 1: + fronteres.append(self.pos_nuclis[i]) + + elif longi == 2: + if self.inseparable(self.el_mot[self.pos_nuclis[i] + 1: self.pos_nuclis[i] + 1 + 2]): + fronteres.append(self.pos_nuclis[i]) + elif self.el_mot[self.pos_nuclis[i] + 2] == 'h': + fronteres.append(self.pos_nuclis[i]) + else: + fronteres.append(self.pos_nuclis[i] + 1) + + elif longi == 3: + if self.inseparable(self.el_mot[self.pos_nuclis[i] + 2: self.pos_nuclis[i] + 2 + 2]): + if self.el_mot[self.pos_nuclis[i] + 1] == '-': + fronteres.append(self.pos_nuclis[i]) + else: + fronteres.append(self.pos_nuclis[i] + 1) + else: + if self.el_mot[self.pos_nuclis[i] + 3] == '-': + fronteres.append(self.pos_nuclis[i] + 1) + else: + fronteres.append(self.pos_nuclis[i] + 2) + + elif longi == 4: + pos = 0 + + if (pos := tros.find("s")) != -1: + fronteres.append(self.pos_nuclis[i] + pos + 1) + else: + fronteres.append(self.pos_nuclis[i] + 2) + + elif longi == 5: + fronteres.append(self.pos_nuclis[i] + 3) + + else: + _LOGGER.info(f"No puc separar en sillabes el mot {self.el_mot}, cluster massa gran, de longitud {longi}") + exit(1) + + numsil = len(fronteres) + for i in range(numsil): + if i == 0: + if fronteres[i] != 0: + esta_sil = self.el_mot[0:fronteres[i] + 1] + vec_sil.append(esta_sil) + else: + esta_sil = self.el_mot[0] + vec_sil.append(esta_sil) + else: + esta_sil = self.el_mot[fronteres[i - 1] + 1 : fronteres[i] + 1] + vec_sil.append(esta_sil) + + esta_sil = self.el_mot[fronteres[numsil - 1] + 1:] + vec_sil.append(esta_sil) + + els_nuclis.append(self.pos_nuclis[0]) + longitud = len(vec_sil[0]) + + for i in range(1, len(self.pos_nuclis)): + this_nucli = self.pos_nuclis[i] - longitud + els_nuclis.append(this_nucli) + longitud += len(vec_sil[i]) + + return vec_sil, els_nuclis + + + def empty(self) -> bool: + return len(self.pos_nuclis) == 0 + + + def mot(self) -> str: + return self.el_mot + + + def nucli(self, i: int) -> typing.Union[int, None]: + if 0 <= i < len(self.pos_nuclis): + return self.pos_nuclis[i] + return None + + + def size(self) -> int: + return len(self.pos_nuclis) + + + def nuclis(self) -> typing.List[int]: + return self.pos_nuclis + + +class Transcripcio: + + def __init__(self, mot: str): + + self.motorig_ = mot + + self.prefixos_ = [] + self.pref_atons = [] + self.excepcions_prefs = {} + self.excepcions_gen = set() + self.einesgram_ = set() + self.excep_acc = {} + self.trossos_ = [] + self.transpart_ = [] + + self.carrega_einesgram() + self.carrega_exc_accent() + + + def carrega_einesgram(self): + + # Set self.einesgram_ + self.einesgram_ = [ + '-de-', '-en', '-hi', '-ho', '-i', '-i-', '-la', '-les', '-li', '-lo', '-los', '-me', '-ne', '-nos', \ + '-se', '-te', '-us', '-vos', 'a', 'a-', 'al', 'als', 'amb', 'bi-', 'co', 'de', 'de-', 'del', 'dels', \ + 'el', 'els', 'em', 'en', 'ens', 'es', 'et', 'hi', 'ho', 'i', 'i-', 'la', 'les', 'li', 'lo', 'ma', \ + 'me', 'mon', 'na', 'pel', 'pels', 'per', 'que', 're', 'sa', 'se', 'ses', 'si', 'sos', 
'sub', \ + 'ta', 'te', 'tes', 'ton', 'un', 'uns', 'us', + ] + + + def carrega_exc_accent(self): + + # Set self.excep_acc (excepcions d'accentuacio) + self.excep_acc = { + 'antropologico': 'antropològico', 'arterio': 'artèrio', 'artistico': 'artístico', 'basquet': 'bàsquet', 'cardio': 'càrdio', \ + 'catolico': 'catòlico', 'cientifico': 'científico', 'circum': 'círcum', 'civico': 'cívico', 'democrata': 'demòcrata', \ + 'democratico': 'democràtico', 'dumping': 'dúmping', 'economico': 'econòmico', 'edgar': 'èdgar', 'fenicio': 'fenício', \ + 'filosofico': 'filosòfico', 'fisico': 'físico', 'fisio': 'físio', 'geografico': 'geogràfico', 'hetero': 'hétero', \ + 'higenico': 'higènico', 'higienico': 'higiènico', 'hiper': 'híper', 'historico': 'històrico', 'ibero': 'íbero', \ + 'ideologico': 'ideològico', 'input': 'ínput', 'inter': 'ínter', 'jonatan': 'jònatan', 'juridico': 'jurídico', 'labio': 'làbio', \ + 'linguo': 'línguo', 'literario': 'literàrio', 'logico': 'lògico', 'magico': 'màgico', 'maniaco': 'maníaco', 'marketing': 'màrketing', \ + 'oxido': 'òxido', 'petroleo': 'petròleo', 'politico': 'político', 'quantum': 'quàntum', 'quimico': 'químico', 'quimio': 'químio', \ + 'radio': 'ràdio', 'romanico': 'romànico', 'simbolico': 'simbòlico', 'socio': 'sòcio', 'super': 'súper', 'tecnico': 'tècnico', \ + 'teorico': 'teòrico', 'tragico': 'tràgico', 'traqueo': 'tràqueo', + } + + + def normalize_word(self, word: str) -> str: + + word = word.lower() + + return word + + + def segmenta(self, mot: str, final: typing.List[str]) -> typing.List[str]: + + # Word with prefixes segmentation + no_te_prefix = True + for prefix in self.prefixos_: + lon = len(prefix) + pos = mot.find(prefix) + if pos != -1 and pos == 0: + no_te_prefix = False + + if lon == len(mot): + final.append(mot) + return final + elif lon == len(mot) - 1 and mot[lon] == '-': + final.append(mot) + return final + else: + # If there are no exceptions split it + if prefix not in self.excepcions_prefs: + final.append(prefix) + resta = mot[lon:] + self.segmenta(resta, final) + return final + # If there are exceptions check that it is not part of it + else: + if mot not in self.excepcions_prefs[prefix]: + final.append(prefix) + resta = mot[lon:] + self.segmenta(resta, final) + return final + else: + final.append(mot) + return final + + for prefix in self.pref_atons: + lon = len(prefix) + pos = mot.find(prefix) + if pos != -1 and pos == 0: + no_te_prefix = False + + if lon == len(mot): + final.append(mot) + return final + elif lon == len(mot) - 1 and mot[lon] == '-': + final.append(mot) + return final + else: + # It should only be started if: + # if the prefix ends in a vowel + # only if the word continues with i, u, -r+vowel, -s+vowel + # if the prefix always ends in a consonant + # except in both cases + # if it is part of the exceptions, if there are any + if acaba_en_vocal(prefix): + resta = mot[lon:] + if post_prefix_ok(resta): + if prefix not in self.excepcions_prefs: + final.append(prefix) + self.segmenta(resta, final) + return final + else: + if mot not in self.excepcions_prefs[prefix]: + final.append(prefix) + self.segmenta(resta, final) + return final + else: + final.append(mot) + return final + else: + final.append(mot) + return final + # It is not an exception + else: + if prefix not in self.excepcions_prefs: + final.append(prefix) + queda = mot[lon:] + self.segmenta(queda, final) + return final + else: + if mot not in self.excepcions_prefs[prefix]: + final.append(prefix) + queda = mot[lon:] + self.segmenta(queda, final) + return final + else: 
+ final.append(mot) + return final + + if no_te_prefix: + final.append(mot) + return final + + + def tracta_prefixos(self, inici: typing.List[str], final: typing.List[str]) -> typing.List[str]: + + # For each start word, + # if there is a prefix at the beginning and the word is not part of the exception list, + # split it after the prefix, unless after the prefix there is a hyphen + + for mot in inici: + final = self.segmenta(mot, final) + + return final + + + def parteix_mot(self): + + # Set parts + parts = [self.motnorm_] + + self.trossos_ = self.tracta_prefixos(parts, self.trossos_) + + for tros in self.trossos_: + partmot = Part(tros) + self.transpart_.append(partmot) + + + def no_es_nom_ment(self, mot: str) -> bool: + + if mot not in self.excepcions_gen: + return True + else: + return False + + + def es_adverbi(self, mot: str) -> bool: + + pos = 0 + tros = "ment" + pos = mot.rfind(tros) + if pos != -1: + if pos == len(mot) - len(tros): + if self.no_es_nom_ment(mot): + return True + else: + return False + else: + return False + else: + return False + + + def es_exc_accent(self, mot: str) -> str: + + if mot in self.excep_acc: + mot = self.excep_acc[mot] + + return mot + + + def troba_nuclis_mot(self): + + for i in range(len(self.trossos_)): + + self.trossos_[i] = self.es_exc_accent(self.trossos_[i]) + + # Determine if it's an adverb and pass the information to mot_amb_nuclis + is_adverb = self.es_adverbi(self.trossos_[i]) + + mot_amb_nuclis = MotNuclis( + mot = self.trossos_[i], + es_adverbi = is_adverb, + ) + + mot_amb_nuclis.troba_nuclis_mot() + + sillabes, nuclis = [], [] + if not mot_amb_nuclis.empty(): + sillabes, nuclis = mot_amb_nuclis.separa_sillabes(sillabes, nuclis) + for sil in range(len(sillabes)): + sillab = Sillaba(sillabes[sil]) + sillab.grafnuc(nuclis[sil]) + self.transpart_[i].push_back(sillab) + else: + sillab = Sillaba(self.trossos_[i]) + self.transpart_[i].push_back(sillab) + + + def dotze_term(self, pnum: int) -> bool: + + # retorna cert quan es mot pla (paroxiton) ja sigui per les dotze terminacions o per ser un diftong decreixent + + dift_decr = ["au", "ai", "eu", "ei", "ou", "oi", "iu", "àu", "ui"] + voc_sola = ["a", "e", "i", "o", "u", "ï", "ü"] + voc_mes_s = ["as", "es", "is", "os", "us", "às", "ès"] + en_in = ["en", "in", "àn"] + + numsil = self.transpart_[pnum].size() + darsil = self.transpart_[pnum].transsil_[numsil - 1].get_text() + darsil = darsil.lower() # Convert to lowercase for case-insensitive comparison + + mida = len(darsil) + + # mida de la sillaba 2 o + + if mida >= 2: + last_dos = darsil[-2:] + + # diftong decreixent, inclou gui, qui + for dift in dift_decr: + es_dift_decr = last_dos == dift + # diftong decreixent i nucli -> agut + # diftong decreixent i no es nucli (ex: preui)-> pla + if es_dift_decr and (self.transpart_[pnum].transsil_[numsil - 1].grafnuc_ == mida - 2): + return False + elif es_dift_decr: + return True + + # vocal sola + last_voc = darsil[-1:] + if last_voc in voc_sola: + return True + + # si la dar sil acaba en s (mida 2 o + encara) + if darsil[-1:] == 's': + if mida >= 3: + last_dos = darsil[-3:-1] + for dift in dift_decr: + es_dift_decr = last_dos == dift + if es_dift_decr and (self.transpart_[pnum].transsil_[numsil - 1].grafnuc_ == mida - 3): + return False + elif es_dift_decr: + return True + + last_dos = darsil[-2:] + if last_dos in voc_mes_s: + return True + + last_dos = darsil[-2:] + if last_dos in en_in: + return True + + last_voc = darsil[-1:] + if last_voc in voc_sola: + return True + + return False + + + def 
accentua_mot(self, pnum: int): + + numsil = self.transpart_[pnum].size() + + if self.dotze_term(pnum): + # If it ends with a vowel or vowel+s, or with o or in, it's flat (plana) + # Vowels are aeiouàèéíòóúü + self.transpart_[pnum].transsil_[numsil - 2].tonica() + else: + # Otherwise, it's acute (aguda) + self.transpart_[pnum].transsil_[numsil - 1].tonica() + + + def einagram(self, mot: str) -> bool: + + if mot not in self.einesgram_: + return False + else: + return True + + + def troba_accent_tonic_mot(self): + + vocaccent = ['à', 'é', 'è', 'í', 'ó', 'ò', 'ú'] + + for pnum in range(len(self.trossos_)): + + if not self.transpart_[pnum]: + # es una particula sense vocal + continue + + numsil = self.transpart_[pnum].size() + accent_grafic = False + # bucle sobre les sillabes per veure si hi ha accent grafic + for snum in range(numsil): + sillaba = self.transpart_[pnum].transsil_[snum].get_text() + pos = 0 + if any(accented_vowel in sillaba for accented_vowel in vocaccent): + + last_sil = self.transpart_[pnum].transsil_[numsil - 1].get_text() + accent_grafic = True + + if last_sil == "ment": + self.transpart_[pnum].transsil_[snum].tonica() + self.transpart_[pnum].transsil_[numsil - 1].tonica() + else: + self.transpart_[pnum].transsil_[snum].tonica() + + break + + if not accent_grafic: + + # si es monosillab es tonic a menys que sigui eina gramatical + # tonic car es morfema lexematic d'una sillaba + # si te mes d'una sillaba, estudiar la terminacio, descartant abans + # un guio que hi pugui haver al final + # prefixos que poden ser d'una o dues sillabes tenen nomes + # accent secundari si son tonics i funcionen realment com a prefix + + if numsil == 1: + sillaba = self.transpart_[pnum].transsil_[0].get_text() + if (self.transpart_[pnum].transsil_[0].grafnuc_ == -1): + # es part de mot sense nucli + continue + elif self.einagram(sillaba): + #amb les parts de mot + continue + else: + # soliem mirar si era un prefix tonic o un lexema, ja no cal + self.transpart_[pnum].transsil_[0].tonica() + else: + # no es monosillab + + last_sil = self.transpart_[pnum].transsil_[numsil - 1].get_text() + # no es referencia, last_sil, car volem guardar el valor + + if last_sil == "ment": + # no cal tractar diferent els prefixos tonics + if self.no_es_nom_ment(self.trossos_[pnum]) and self.no_es_nom_ment(self.motnorm_): + if numsil - 1 > 1: + self.transpart_[pnum].pop_back() # Remove the last syllable + self.accentua_mot(pnum) # Accentuate from the syllables + darsil = Sillaba(last_sil) # Create a syllable like before + self.transpart_[pnum].push_back(darsil) # Add it and make it tonic + self.transpart_[pnum].transsil_[numsil - 1].tonica() + self.transpart_[pnum].transsil_[numsil - 1].grafnuc_ = 1 + # # es la e de ment + else: + self.transpart_[pnum].transsil_[0].tonica() + self.transpart_[pnum].transsil_[numsil - 1].tonica() + else: + self.accentua_mot(pnum) + + + def sillaba_accentua_mot(self): + + self.parteix_mot() + self.troba_nuclis_mot() + self.troba_accent_tonic_mot() + + + def stress_tonic(self) -> str: + + accent_changes = { + "a" : "à", + "e" : "é", + "i" : "í", + "ï" : "í", + "o" : "ó", + "u" : "ú", + "ü" : "ú", + } + + all_vowels = ['a', 'à', 'e', 'é', 'è', 'i', 'í', 'ï', 'o', 'ó', 'ò', 'u', 'ü', 'ú'] + accented_vowels = ['à', 'é', 'è', 'í', 'ó', 'ò', 'ú'] + unaccented_vowels = ['a', 'e', 'i', 'ï', 'o', 'u', 'ü'] + + original_word = "" + stressed_word = "" + + for i in range(len(self.transpart_)): + + word = self.transpart_[i].text_ + + if any(ext in word for ext in accented_vowels): + stressed_word = 
stressed_word + word
+            else:
+                for j in range(self.transpart_[i].size()):
+                    sil = self.transpart_[i].transsil_[j]
+                    sillaba_text = sil.get_text()
+                    idxgrafnucli = sil.get_grafnuc()
+                    is_tonic = sil.es_sil_tonica()
+
+                    if is_tonic == "sí":
+                        sillaba_list = list(sillaba_text)
+                        if sillaba_list[idxgrafnucli] in unaccented_vowels:
+                            if sillaba_list[idxgrafnucli] == "e":
+                                # whatever the position of the stressed syllable,
+                                # "è" is almost always the correct accented e
+                                sillaba_list[idxgrafnucli] = "è"
+                            elif sillaba_list[idxgrafnucli] == "o":
+                                if j == self.transpart_[i].size() - 1:
+                                    # stress on the last syllable: almost always
+                                    # "ó" is the correct accented o
+                                    sillaba_list[idxgrafnucli] = "ó"
+                                else:
+                                    # stress on the penultimate syllable or earlier:
+                                    # almost always "ò" is the correct accented o
+                                    sillaba_list[idxgrafnucli] = "ò"
+                            else:
+                                sillaba_list[idxgrafnucli] = accent_changes[sillaba_list[idxgrafnucli]]
+
+                        sillaba_text = "".join(sillaba_list)
+
+                    stressed_word = stressed_word + sillaba_text
+
+            original_word = original_word + word
+
+        return stressed_word
+
+
+    def stress_word(self) -> str:
+
+        self.motnorm_ = self.normalize_word(self.motorig_)
+
+        self.sillaba_accentua_mot()
+
+        self.stressed_word = self.stress_tonic()
+
+        return self.stressed_word
+
+
+class CatalanPreProcessText:
+    """Pre-processes text: adds a written accent to the stressed vowel of words not in the lexicon"""
+
+    def __init__(self, lookup_phonemes, settings_values: dict, lang: str):
+
+        self.lookup_phonemes = lookup_phonemes
+        self.settings_values = settings_values
+        self.lang = lang
+
+    def __call__(self, text: str) -> str:
+
+        breaks = [" "]
+        breaks = breaks + list(self.settings_values["major_breaks"])
+        breaks = breaks + list(self.settings_values["minor_breaks"])
+        breaks = breaks + list(self.settings_values["word_breaks"])
+        breaks = breaks + list(self.settings_values["begin_punctuations"])
+        breaks = breaks + list(self.settings_values["end_punctuations"])
+
+        tokens = [text.strip()]
+        for char_break in breaks:
+            # re.escape so that break strings such as "." or "..." are matched literally
+            tokens = [re.split(f"({re.escape(char_break)})", item) for item in tokens]
+            tokens = [item for sublist in tokens for item in sublist if item != ""]
+
+        preprocessed_tokens = []
+        for token in tokens:
+
+            if token in breaks:
+                processed_token = token
+            else:
+                is_in_lexicon = self.lookup_phonemes(token) is not None
+                if is_in_lexicon:
+                    processed_token = token
+                else:
+                    tr = Transcripcio(token)
+                    processed_token = tr.stress_word()
+
+            preprocessed_tokens.append(processed_token)
+
+        processed_text = "".join(preprocessed_tokens)
+
+        return processed_text
+
+
+# Post-Process functions and classes
+
+from gruut.text_processor import DATA_PROP, WordNode, BreakWordNode, BreakNode, PunctuationWordNode
+from gruut.utils import sliding_window
+
+def identify_lang(nodes: typing.List[typing.Union[WordNode, BreakWordNode, BreakNode, PunctuationWordNode]]) -> str:
+    """Language of the first WordNode in nodes, falling back to "ca" if there is none"""
+
+    lang = "ca"
+    for node in nodes:
+        if isinstance(node, WordNode):
+            lang = node.lang
+            break
+
+    return lang
+
+def phoneme_is_vowel(phoneme: str) -> bool:
+    vowels = ["'a", "'ɛ", "'ɔ", "'e", "'i", "'o", "'u", "ə", "i", "u"]
+    return phoneme in vowels
+
+def phoneme_is_stressed_vowel(phoneme: str) -> bool:
+    stressed_vowels = ["'a", "'ɛ", "'ɔ", "'e", "'i", "'o", "'u"]
+    return phoneme in stressed_vowels
+
+def phoneme_is_unstressed_vowel(phoneme: str) -> bool:
+    return phoneme_is_vowel(phoneme) and not phoneme_is_stressed_vowel(phoneme)
+
+def phoneme_is_high_vowel(phoneme: str) -> bool:
+    high_vowels = ["i", "u", "'i", "'u"]
+    return phoneme in high_vowels
+
+def phoneme_is_high_stressed_vowel(phoneme: str) -> bool:
+    return phoneme_is_high_vowel(phoneme) and phoneme_is_stressed_vowel(phoneme)
+
+def phoneme_is_high_unstressed_vowel(phoneme: str) -> bool:
+    return phoneme_is_high_vowel(phoneme) and phoneme_is_unstressed_vowel(phoneme)
+
+def phoneme_is_neutral_vowel(phoneme: str) -> bool:
+    neutral_vowels = ["ə"]
+    return phoneme in neutral_vowels
+
+def fusion_if_needed(node_1: WordNode, node_2: WordNode, lang: str):
+
+    if len(node_1.phonemes) == 0 or len(node_2.phonemes) == 0:
+        return
+
+    last_phoneme_word_1 = node_1.phonemes[-1]
+    first_phoneme_word_2 = node_2.phonemes[0]
+
+    # Case 1: high unstressed vowel + stressed vowel of the same timbre
+    if phoneme_is_high_unstressed_vowel(last_phoneme_word_1) and phoneme_is_high_stressed_vowel(first_phoneme_word_2) \
+            and last_phoneme_word_1 == first_phoneme_word_2.replace("'", ""):
+        # Case [i] + ['i] = ['i] or [u] + ['u] = ['u]
+        node_1.phonemes.pop()
+        _LOGGER.debug(f"FUSION CASE 1 {node_1.text} {node_2.text}: {node_1.phonemes} {node_2.phonemes}")
+
+    # Case 2: high unstressed vowel + high unstressed vowel of the same timbre
+    elif phoneme_is_high_unstressed_vowel(last_phoneme_word_1) and phoneme_is_high_unstressed_vowel(first_phoneme_word_2) \
+            and last_phoneme_word_1 == first_phoneme_word_2:
+        # Case [i] + [i] = [i] or [u] + [u] = [u]
+        node_1.phonemes.pop()
+        _LOGGER.debug(f"FUSION CASE 2 {node_1.text} {node_2.text}: {node_1.phonemes} {node_2.phonemes}")
+
+    # Case 3: neutral vowel + neutral vowel (except if either word is the preposition "a")
+    elif phoneme_is_neutral_vowel(last_phoneme_word_1) and phoneme_is_neutral_vowel(first_phoneme_word_2) \
+            and node_1.text != "a" and node_2.text != "a":
+        node_1.phonemes.pop()
+        _LOGGER.debug(f"FUSION CASE 3 {node_1.text} {node_2.text}: {node_1.phonemes} {node_2.phonemes}")
+
+def elision_if_needed(node_1: WordNode, node_2: WordNode, lang: str):
+
+    if len(node_1.phonemes) == 0 or len(node_2.phonemes) == 0:
+        return
+
+    last_phoneme_word_1 = node_1.phonemes[-1]
+    first_phoneme_word_2 = node_2.phonemes[0]
+
+    # Case 1: stressed non-high vowel (['a], ['ɛ], ['ɔ], ...) + neutral vowel
+    # (except if the second word is the preposition "a")
+    if (phoneme_is_stressed_vowel(last_phoneme_word_1) and not phoneme_is_high_vowel(last_phoneme_word_1)) \
+            and (phoneme_is_neutral_vowel(first_phoneme_word_2) and node_2.text != "a"):
+        node_2.phonemes.pop(0)
+        _LOGGER.debug(f"ELISION CASE 1 {node_1.text} {node_2.text}: {node_1.phonemes} {node_2.phonemes}")
+
+def diphthong_if_needed(node_1: WordNode, node_2: WordNode, lang: str):
+
+    if len(node_1.phonemes) == 0 or len(node_2.phonemes) == 0:
+        return
+
+    last_phoneme_word_1 = node_1.phonemes[-1]
+    first_phoneme_word_2 = node_2.phonemes[0]
+
+    # Case 1: stressed vowel + high unstressed vowel
+    if (phoneme_is_stressed_vowel(last_phoneme_word_1) and not phoneme_is_high_vowel(last_phoneme_word_1)) \
+            and phoneme_is_high_unstressed_vowel(first_phoneme_word_2):
+        if first_phoneme_word_2 == "i":
+            # Case [stressed vowel] + [i] = [stressed vowel + j], stressed vowel not 'i or 'u
+            node_2.phonemes[0] = "j"
+            _LOGGER.debug(f"DIPHTHONG CASE 1 {node_1.text} {node_2.text}: {node_1.phonemes} {node_2.phonemes}")
+
+        elif first_phoneme_word_2 == "u":
+            # Case [stressed vowel] + [u] = [stressed vowel + uw], stressed vowel not 'i or 'u
+            node_2.phonemes[0] = "uw"
+            _LOGGER.debug(f"DIPHTHONG CASE 1 {node_1.text} {node_2.text}: {node_1.phonemes} {node_2.phonemes}")
+
+    # Case 2: high unstressed vowel + stressed vowel
+    elif phoneme_is_high_unstressed_vowel(last_phoneme_word_1) and phoneme_is_stressed_vowel(first_phoneme_word_2):
+        if last_phoneme_word_1 == "i" and first_phoneme_word_2 not in ["'i"] and node_1.text in ["hi", "ho", "i"]:
+            # Case [i] + [stressed vowel] = [y + stressed vowel], i only from "hi", "ho" or "i"
+            node_1.phonemes[-1] = "y"
+            _LOGGER.debug(f"DIPHTHONG CASE 2 {node_1.text} {node_2.text}: {node_1.phonemes} {node_2.phonemes}")
+
+        elif last_phoneme_word_1 == "u" and first_phoneme_word_2 not in ["'u"] and node_1.text in ["hi", "ho", "i"]:
+            # Case [u] + [stressed vowel]: currently left unchanged
+            pass
+
+    # Case 3: neutral vowel + high unstressed vowel
+    elif phoneme_is_neutral_vowel(last_phoneme_word_1) and phoneme_is_high_unstressed_vowel(first_phoneme_word_2):
+        if first_phoneme_word_2 == "i":
+            # Case [neutral vowel] + [i] = [neutral vowel + j]
+            node_2.phonemes[0] = "j"
+            _LOGGER.debug(f"DIPHTHONG CASE 3 {node_1.text} {node_2.text}: {node_1.phonemes} {node_2.phonemes}")
+
+        elif first_phoneme_word_2 == "u":
+            # Case [neutral vowel] + [u] = [neutral vowel + uw]
+            node_2.phonemes[0] = "uw"
+            _LOGGER.debug(f"DIPHTHONG CASE 3 {node_1.text} {node_2.text}: {node_1.phonemes} {node_2.phonemes}")
+
+    # Case 4: high unstressed vowel + neutral vowel: currently left unchanged
+    elif phoneme_is_high_unstressed_vowel(last_phoneme_word_1) and phoneme_is_neutral_vowel(first_phoneme_word_2):
+        pass
+
+def ca_post_process_sentence(
+    graph: GraphType, sent_node: SentenceNode, settings: TextProcessorSettings
+):
+    """Apply Catalan cross-word phonological rules (diphthongization, fusion, elision)"""
+
+    nodes = []
+    for dfs_node in nx.dfs_preorder_nodes(graph, sent_node.node):
+
+        if graph.out_degree(dfs_node) != 0:
+            # Only process leaf nodes
+            continue
+
+        node = graph.nodes[dfs_node][DATA_PROP]
+        if isinstance(node, WordNode):
+            nodes.append(typing.cast(WordNode, node))
+        if isinstance(node, BreakWordNode):
+            nodes.append(typing.cast(BreakWordNode, node))
+        if isinstance(node, BreakNode):
+            nodes.append(typing.cast(BreakNode, node))
+        if isinstance(node, PunctuationWordNode):
+            nodes.append(typing.cast(PunctuationWordNode, node))
+
+    lang = identify_lang(nodes)
+
+    contiguous_word_nodes = []
+    for node_1, node_2 in sliding_window(nodes, 2):
+
+        if node_1 is None or node_2 is None:
+            continue
+
+        if isinstance(node_1, WordNode) and isinstance(node_2, WordNode):
+            if not (node_1.text and node_1.phonemes and node_2.text and node_2.phonemes):
+                continue
+            contiguous_word_nodes.append([node_1, node_2])
+
+    for (node_1, node_2) in contiguous_word_nodes:
+
+        diphthong_if_needed(node_1, node_2, lang)
+        fusion_if_needed(node_1, node_2, lang)
+        elision_if_needed(node_1, node_2, lang)
+
+
+# Settings
+
+def get_ca_settings(lang_dir=None, **settings_args) -> TextProcessorSettings:
+    """Create settings for Catalan"""
+
+    # Resolve the dialect (e.g. "ca-CE", "ca-BA") from the language directory name
+    try:
+        lang = str(lang_dir).split("/")[-1]
+        main_lang, lang_version = lang.split("-")
+        lang = f"{main_lang.lower()}-{lang_version.upper()}"
+    except Exception:
+        lang = "ca"
+
+    lookup_phonemes = settings_args["lookup_phonemes"]
+
+    settings_values
= { + "major_breaks": {".", "?", "!"}, + "minor_breaks": {",", ";", ":", "..."}, + "word_breaks": {"_"}, + "begin_punctuations": {'"', "“", "«", "[", "(", "<", "¡", "¿"}, + "end_punctuations": {'"', "”", "»", "]", ")", ">", "!", "?"}, + "default_currency": "EUR", + "default_date_format": InterpretAsFormat.DATE_DMY, + "replacements": [ + ("’", "'"), # normalize apostrophe + ("'", ""), # remove orthographic apostrophe + ("-", ""), + ("l·l", "l"), + ], + } + + settings_args = { + **settings_values, + "pre_process_text": CatalanPreProcessText(lookup_phonemes, settings_values, lang), + "post_process_sentence": ca_post_process_sentence, + **settings_args, + } + + return TextProcessorSettings(lang="ca", **settings_args) + +# ----------------------------------------------------------------------------- class DelayedGraphemesToPhonemes: """Grapheme to phoneme guesser that loads on first use""" @@ -909,31 +2292,3 @@ def __call__( return self.phonemizer(word, role=role, do_transforms=do_transforms) -# ----------------------------------------------------------------------------- -# Catalan (ca, Catalan) -# ----------------------------------------------------------------------------- - - -def get_ca_settings(lang_dir=None, **settings_args) -> TextProcessorSettings: - - """Create settings for Catalan""" - - settings_args = { - "major_breaks": {".", "?", "!"}, - "minor_breaks": {",", ";", ":", "..."}, - "word_breaks": {"_"}, - "begin_punctuations": {'"', "“", "«", "[", "(", "<", "¡", "¿"}, - "end_punctuations": {'"', "”", "»", "]", ")", ">", "!", "?"}, - "default_currency": "EUR", - "default_date_format": InterpretAsFormat.DATE_DMY, - "replacements": [ - ("’", "'"), # normalize apostrophe - ("'", ""), # remove orthographic apostrophe - ("-", ""), - ("l·l", "l"), - ], - **settings_args, - } - - return TextProcessorSettings(lang="ca", **settings_args) - diff --git a/gruut/text_processor.py b/gruut/text_processor.py index 10f9914..10349b4 100644 --- a/gruut/text_processor.py +++ b/gruut/text_processor.py @@ -80,9 +80,6 @@ def __init__( ] = None, **kwargs, ): - - #_LOGGER.debug(f"[TEST] Entered __init__ method.") - self.default_lang = default_lang self.default_settings_kwargs = kwargs @@ -116,8 +113,6 @@ def sentences( ) -> typing.Iterable[Sentence]: """Processes text and returns each sentence""" - #_LOGGER.debug(f"[TEST] Entered sentences method.") - def get_lang(lang: str) -> str: if explicit_lang or (lang != self.default_lang): return lang @@ -397,34 +392,21 @@ def get_lang(lang: str) -> str: def words(self, graph: GraphType, root: Node, **kwargs) -> typing.Iterable[Word]: """Processes text and returns each word""" - - #_LOGGER.debug(f"[TEST] Entered words method.") - for sent in self.sentences(graph, root, **kwargs): for word in sent: yield word def get_settings(self, lang: typing.Optional[str] = None) -> TextProcessorSettings: """Gets or creates settings for a language""" - - #_LOGGER.debug(f"[TEST] Entered get_settings method.") - lang = lang or self.default_lang lang_settings = self.settings.get(lang) - #_LOGGER.debug(f"[TEST] lang: {lang}") - #_LOGGER.debug(f"[TEST] self.settings: {self.settings}") - #_LOGGER.debug(f"[TEST] lang_settings: {lang_settings}") - + if lang_settings is not None: return lang_settings # Try again with resolved language resolved_lang = resolve_lang(lang) lang_settings = self.settings.get(resolved_lang) - #_LOGGER.debug(f"[TEST] Try again with resolved language.") - #_LOGGER.debug(f"[TEST] resolved_lang: {resolved_lang}") - #_LOGGER.debug(f"[TEST] lang_settings: 
{lang_settings}") - if lang_settings is not None: # Patch for the future self.settings[lang] = self.settings[resolved_lang] @@ -437,12 +419,7 @@ def get_settings(self, lang: typing.Optional[str] = None) -> TextProcessorSettin ) # Create default settings for language - #_LOGGER.debug(f"[TEST] Create default settings for language.") lang_dir = self.lang_dirs.get(lang) - #_LOGGER.debug(f"[TEST] lang_dir: {lang_dir}") - #_LOGGER.debug(f"[TEST] self.model_prefix: {self.model_prefix}") - #_LOGGER.debug(f"[TEST] self.search_dirs: {self.search_dirs}") - #_LOGGER.debug(f"[TEST] self.default_settings_kwargs: {self.default_settings_kwargs}") lang_settings = get_settings( lang, lang_dir=lang_dir, @@ -453,10 +430,6 @@ def get_settings(self, lang: typing.Optional[str] = None) -> TextProcessorSettin self.settings[lang] = lang_settings self.settings[resolved_lang] = lang_settings - #_LOGGER.debug(f"[TEST] lang_settings: {lang_settings}") - - #_LOGGER.debug(f"[TEST] Exit get_settings method.") - return lang_settings # ------------------------------------------------------------------------- @@ -465,7 +438,6 @@ def get_settings(self, lang: typing.Optional[str] = None) -> TextProcessorSettin def __call__(self, *args, **kwargs): """Processes text or SSML""" - #_LOGGER.debug(f"[TEST] entered __call__ method.") return self.process(*args, **kwargs) def process( @@ -511,9 +483,6 @@ def process( graph, root: text graph and root node """ - - #_LOGGER.debug(f"[TEST] entered process method.") - if ssml: try: root_element = etree.fromstring(text) @@ -1039,18 +1008,14 @@ def in_inline_lexicon( # Do replacements before minor/major breaks if pipeline_split(self._split_replacements, graph, root): was_changed = True - ##_LOGGER.debug(f"[TEST] Do replacements before minor/major breaks.") # Split punctuations (quotes, etc.) before breaks if pipeline_split(self._split_punctuations, graph, root): was_changed = True - ##_LOGGER.debug(f"[TEST] Split punctuations (quotes, etc.) before breaks.") # Split on minor breaks (commas, etc.) - ###_LOGGER.debug(f"[TEST] self._split_minor_breaks: {self._split_minor_breaks}") if pipeline_split(self._split_minor_breaks, graph, root): was_changed = True - ##_LOGGER.debug(f"[TEST] Split on minor breaks (commas, etc.).") # Expand abbrevations before major breaks if pipeline_split(self._split_abbreviations, graph, root): @@ -1063,7 +1028,6 @@ def in_inline_lexicon( # Split on major breaks (periods, etc.) 
if pipeline_split(self._split_major_breaks, graph, root): was_changed = True - ##_LOGGER.debug(f"[TEST] Split on major breaks (periods, etc.).") # Break apart sentences using BreakWordNodes if self._break_sentences(graph, root): @@ -1233,15 +1197,10 @@ def process_sentence(words: typing.List[WordNode]): # Post process entire graph self.post_process_graph(graph, root) - #_LOGGER.debug(f"[TEST] exit process method.") - return graph, root def post_process_graph(self, graph: GraphType, root: Node): """User-defined post-processing of entire graph""" - - #_LOGGER.debug(f"[TEST] Entered post_process_graph method.") - pass # ------------------------------------------------------------------------- @@ -1250,9 +1209,6 @@ def post_process_graph(self, graph: GraphType, root: Node): def _break_sentences(self, graph: GraphType, root: Node) -> bool: """Break sentences apart at BreakWordNode(break_type="major") nodes.""" - - #_LOGGER.debug(f"[TEST] Entered _break_sentences method.") - was_changed = False # This involves: @@ -1330,9 +1286,6 @@ def _break_sentences(self, graph: GraphType, root: Node) -> bool: def _break_words(self, graph: GraphType, node: Node): """Break apart words according to work breaks pattern""" - - #_LOGGER.debug(f"[TEST] Entered _break_words method.") - if not isinstance(node, WordNode): return @@ -1380,7 +1333,6 @@ def _break_words(self, graph: GraphType, node: Node): } def _split_punctuations(self, graph: GraphType, node: Node): - #_LOGGER.debug(f"[TEST] Entered _split_punctuations method.") if not isinstance(node, WordNode): return @@ -1493,7 +1445,6 @@ def _split_punctuations(self, graph: GraphType, node: Node): } def _split_major_breaks(self, graph: GraphType, node: Node): - #_LOGGER.debug(f"[TEST] Entered _split_major_breaks method.") if not isinstance(node, WordNode): return @@ -1540,11 +1491,7 @@ def _split_major_breaks(self, graph: GraphType, node: Node): } def _split_minor_breaks(self, graph: GraphType, node: Node): - - #_LOGGER.debug(f"[TEST] Entered _split_minor_breaks method") - if not isinstance(node, WordNode): - ##_LOGGER.debug(f"[TEST] Entered if not isinstance(node, WordNode)") return word = typing.cast(WordNode, node) @@ -1553,10 +1500,7 @@ def _split_minor_breaks(self, graph: GraphType, node: Node): return settings = self.get_settings(word.lang) - ##_LOGGER.debug(f"[TEST] word.lang: {word.lang}") - ##_LOGGER.debug(f"[TEST] settings: {settings}") if settings.minor_breaks_pattern is None: - ##_LOGGER.debug(f"[TEST] Entered if settings.minor_breaks_pattern is None") # No pattern set for this language return @@ -1590,7 +1534,6 @@ def _split_minor_breaks(self, graph: GraphType, node: Node): def _find_parent(self, graph, node, *classes): """Tries to find a node whose type is in classes in the tree above node""" - #_LOGGER.debug(f"[TEST] Entered _find_parent method.") parents = [] for parent_node in graph.predecessors(node.node): parent = graph.nodes[parent_node][DATA_PROP] @@ -1612,7 +1555,6 @@ def _phonemes_for_break( break_type: typing.Union[str, BreakType], lang: typing.Optional[str] = None, ) -> typing.Optional[PHONEMES_TYPE]: - #_LOGGER.debug(f"[TEST] Entered _phonemes_for_break method.") if break_type == BreakType.MAJOR: return [IPA.BREAK_MAJOR.value] @@ -1635,7 +1577,6 @@ def _pipeline_tokenize( ] = None, ): """Splits text into word nodes""" - #_LOGGER.debug(f"[TEST] Entered _pipeline_tokenize method.") if scope_kwargs is None: scope_kwargs = {} @@ -2150,7 +2091,6 @@ def _is_word_in_lexicon( self, word: str, settings: TextProcessorSettings ) -> 
typing.Optional[bool]: """True if word is in the lexicon""" - #_LOGGER.debug(f"[TEST] Entered _is_word_in_lexicon method.") if settings.lookup_phonemes is None: return None diff --git a/gruut/utils.py b/gruut/utils.py index 9654f89..c41af45 100644 --- a/gruut/utils.py +++ b/gruut/utils.py @@ -44,9 +44,6 @@ def resolve_lang(lang: str) -> str: Returns: Resolved language name """ - - _LOGGER.debug(f"Entered resolve_lang method.") - lang = lang.lower().replace("_", "-") return LANG_ALIASES.get(lang, lang) @@ -75,8 +72,6 @@ def find_lang_dir( Returns: Path to the language model directory or None if it can't be found """ - _LOGGER.debug(f"Entered find_lang_dir method.") - base_lang = LANG_SPLIT_PATTERN.split(lang)[0].lower() lang_module_name = f"gruut_lang_{base_lang}" diff --git a/requirements.txt b/requirements.txt index 3e62390..547e430 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,7 @@ Babel>=2.8.0,<3.0.0 dateparser~=1.1.0 +gruut-ipa>=0.12.0,<1.0 +gruut_lang_en~=2.0.0 jsonlines~=1.2.0 networkx>=2.5.0,<3.0.0 num2words>=0.5.10,<1.0.0
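A minimal end-to-end sketch of what this change enables (illustrative only: it
assumes a Catalan lexicon/g2p data package is installed for the new "ca-ce" and
"ca-ba" codes; the package name gruut_lang_ca is hypothetical):

    from gruut import sentences

    # "ca" now resolves to "ca-ce" (see LANG_ALIASES in gruut/const.py)
    for sent in sentences("Hola, com estàs?", lang="ca"):
        for word in sent:
            if word.phonemes:
                print(word.text, word.phonemes)

During processing, CatalanPreProcessText accent-marks out-of-lexicon words
before phonemization, and ca_post_process_sentence then applies the cross-word
fusion, elision and diphthong rules to the resulting phoneme sequences.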