diff --git a/docs/opus-trainer.md b/docs/opus-trainer.md index 6ccb61e6c..22a7fc3a7 100644 --- a/docs/opus-trainer.md +++ b/docs/opus-trainer.md @@ -92,9 +92,10 @@ modifiers: max_word_length: 5 # Maximum word length for each word in the noisy sentence max_words: 6 # Maximum number of words in each noisy sentence - Tags: 0.05 - custom_detok_src: {src} - custom_detok_trg: {trg} + custom_detok_src: "icu:{src}" + custom_detok_trg: "icu:{trg}" augment: 1 + tag: 0 spm_vocab: {vocab} seed: 1111 @@ -104,13 +105,14 @@ num_fields: 3 #### Tokenization and alignments -`Tags` modifiers requires whitespace or Moses tokenized alignments as input. +The `Tags` modifier requires whitespace-, Moses- or ICU-tokenized alignments as input. Marian requires Sentencepiece tokenized alignments and raw text input. To make them compatible `Tags` modifier can remap the alignments in the end using the passed Sentencepiece model `spm_vocab: vocab.spm` (student model use case). If the `spm_vocab` argument is missing `Tags` modifier will remove alignments and output only the parallel sentences (teacher model use case). -Currently, Moses-tokenized text and its alignments are passed to OpusTrainer (to work around CJK languages where whitespace-based tokenization doesn't make sense). -`custom_detok_{src,trg}` OpusTrainer modifiers are applied to detokenize text after inline noise is added. +Currently, ICU-tokenized text and its alignments are passed to OpusTrainer (to work around CJK languages where whitespace-based tokenization doesn't make sense). +Whitespace characters are represented with the special symbol "▁" to allow lossless text reconstruction on the OpusTrainer side. +The `custom_detok_{src,trg}` OpusTrainer settings (set to `icu:{src}` and `icu:{trg}`) are applied to detokenize text after inline noise is added. Then the detokenized text is passed to Marian together with the alignments remapped to SentencePiece tokenization. 
## Models diff --git a/pipeline/alignments/align.py b/pipeline/alignments/align.py index bfc5c2ac1..9fee8d390 100755 --- a/pipeline/alignments/align.py +++ b/pipeline/alignments/align.py @@ -32,7 +32,7 @@ import zstandard from tqdm import tqdm -from pipeline.alignments.tokenizer import tokenize_moses +from pipeline.alignments.tokenizer import tokenize, TokenizerType from pipeline.common.datasets import decompress from pipeline.common.logging import get_logger @@ -42,6 +42,7 @@ class Tokenization(Enum): spaces = "spaces" moses = "moses" + icu = "icu" def run( @@ -64,25 +65,29 @@ def run( corpus_src = maybe_decompress(corpus_src) corpus_trg = maybe_decompress(corpus_trg) - if tokenization == Tokenization.moses: + if tokenization == Tokenization.spaces: + tokenized_src, tokenized_trg = corpus_src, corpus_trg + output_aln = output_path + else: + ext = f".tok-{tokenization.value}" tokenized_src = ( - corpus_src[: corpus_src.rfind(".")] - + ".tok-moses" - + corpus_src[corpus_src.rfind(".") :] + corpus_src[: corpus_src.rfind(".")] + ext + corpus_src[corpus_src.rfind(".") :] ) tokenized_trg = ( - corpus_trg[: corpus_trg.rfind(".")] - + ".tok-moses" - + corpus_trg[corpus_trg.rfind(".") :] + corpus_trg[: corpus_trg.rfind(".")] + ext + corpus_trg[corpus_trg.rfind(".") :] ) output_aln = os.path.join(tmp_dir, "aln") + + if tokenization == Tokenization.moses: + tokenizer = TokenizerType.fast_moses + elif tokenization == Tokenization.icu: + tokenizer = TokenizerType.icu + else: + raise ValueError(f"Unrecognized tokenization type {tokenization}") # C++ tokenizer can process 100k sentences per second on a single core, # so the chunks to parallelize things should be large enough to increase throughput - tokenize_moses(corpus_src, tokenized_src, src, sentences_per_chunk=500000) - tokenize_moses(corpus_trg, tokenized_trg, trg, sentences_per_chunk=500000) - else: - tokenized_src, tokenized_trg = corpus_src, corpus_trg - output_aln = output_path + tokenize(corpus_src, tokenized_src, 
src, sentences_per_chunk=500000, tokenizer=tokenizer) + tokenize(corpus_trg, tokenized_trg, trg, sentences_per_chunk=500000, tokenizer=tokenizer) fwd_path, rev_path = align( corpus_src=tokenized_src, @@ -102,7 +107,7 @@ def run( priors_output_path=priors_output_path, ) - if tokenization == Tokenization.moses: + if tokenization != Tokenization.spaces: if output_tokenized: logger.info("Saving tokenized corpus") # Copy tokenized corpus to output directory @@ -260,12 +265,12 @@ def remap( output_aln_path: str, ) -> None: """ - Remaps alignments that were calculated for Moses-tokenized corpus to whitespace-tokenized ones. + Remaps alignments that were calculated for tokenized corpus to whitespace-tokenized ones. :param src_path: path to whitespace-tokenized sentences in source language :param trg_path: path to whitespace-tokenized sentences in target language - :param tok_src_path: path to Moses-tokenized sentences in source language - :param tok_trg_path: path to Moses-tokenized sentences in target language - :param aln_path: path to the alignments calculated for Moses-tokenized corpus + :param tok_src_path: path to tokenized sentences in source language + :param tok_trg_path: path to tokenized sentences in target language + :param aln_path: path to the alignments calculated for tokenized corpus :param output_aln_path: path to output alignments file remapped to whitespace-tokenized corpus """ logger.info("Remapping alignments to whitespace tokenization") @@ -389,7 +394,7 @@ def main() -> None: choices=list(Tokenization), default=Tokenization.spaces, help="Use the specified tokenization method. Default is `spaces` which means no tokenization will be applied. 
" - "It remaps the alignments back to whitespace tokenized ones if the `moses` tokenization is used.", + "It remaps the alignments back to whitespace tokenized ones if another tokenization method is used.", ) parser.add_argument( "--output_tokenized", diff --git a/pipeline/alignments/requirements/alignments.in b/pipeline/alignments/requirements/alignments.in index f6413a8ed..05bd3e23d 100644 --- a/pipeline/alignments/requirements/alignments.in +++ b/pipeline/alignments/requirements/alignments.in @@ -3,3 +3,4 @@ opus-fast-mosestokenizer==0.0.8.5 tqdm requests==2.31.0 zstandard +PyICU==2.8.1 diff --git a/pipeline/alignments/requirements/alignments.txt b/pipeline/alignments/requirements/alignments.txt index b7b40eaa1..4136cdfa5 100644 --- a/pipeline/alignments/requirements/alignments.txt +++ b/pipeline/alignments/requirements/alignments.txt @@ -18,6 +18,8 @@ numpy==1.26.4 # via eflomal opus-fast-mosestokenizer==0.0.8.5 # via -r pipeline/alignments/requirements/alignments.in +pyicu==2.8.1 + # via -r pipeline/alignments/requirements/alignments.in requests==2.31.0 # via -r pipeline/alignments/requirements/alignments.in tqdm==4.66.4 diff --git a/pipeline/alignments/tokenizer.py b/pipeline/alignments/tokenizer.py index d9a0aaace..f4465e277 100644 --- a/pipeline/alignments/tokenizer.py +++ b/pipeline/alignments/tokenizer.py @@ -4,15 +4,23 @@ Example: python pipeline/alignments/tokenizer.py --input_path=data/datasets/news.2023.en.shuffled.deduped \ - --output_path=data/datasets/news.2023.en.shuffled.deduped.tok-moses --lang=en --chunk_size=500000 + --output_path=data/datasets/news.2023.en.shuffled.deduped.tok-icu --lang=en --chunk_size=500000 --tokenizer=icu Using C++ opus-fast-mosestokenizer sometimes requires specifying LD_LIBRARY_PATH before starting the Python process see https://github.com/Helsinki-NLP/opus-fast-mosestokenizer/issues/6 export LD_LIBRARY_PATH=...//lib/python3.10/site-packages/mosestokenizer/lib +Using ICU tokenizer requires installing it with `apt-get 
install python3-icu`, +see more installation instructions here: https://pypi.org/project/PyICU/ + +Whitespaces are ignored by Moses based tokenizers and preserved and replaced with a special token "▁" by ICU tokenizer +which allows lossless reconstruction of the original text on detokenization. + """ import argparse import multiprocessing +from abc import ABC, abstractmethod +from enum import Enum from typing import List from tqdm import tqdm @@ -22,28 +30,121 @@ logger = get_logger("tokenizer") +class TokenizerType(Enum): + fast_moses = "fast_moses" + sacre_moses = "sacre_moses" + icu = "icu" + + +class Tokenizer(ABC): + def __init__(self, lang: str): + self.lang = lang + + @abstractmethod + def tokenize(self, text: str) -> List[str]: + pass + + @abstractmethod + def detokenize(self, tokens: List[str]) -> str: + pass + + +class FastMosesTokenizer(Tokenizer): + """ + Uses Moses tokenizer https://github.com/Helsinki-NLP/opus-fast-mosestokenizer + """ + + def __init__(self, lang): + super().__init__(lang) + from mosestokenizer import MosesTokenizer + + try: + self.tokenizer = MosesTokenizer(lang) + except RuntimeError as err: + msg = str(err) + if "No known abbreviations for language" in msg: + # Fall-back to English if the language is not found + self.tokenizer = MosesTokenizer("en") + else: + raise err + + def tokenize(self, text: str) -> List[str]: + return self.tokenizer.tokenize(text) + + def detokenize(self, tokens: List[str]) -> str: + return self.tokenizer.detokenize(tokens) + + +class SacreMosesTokenizer(Tokenizer): + """ + Uses Moses tokenizer https://github.com/hplt-project/sacremoses + """ + + def __init__(self, lang): + super().__init__(lang) + import sacremoses + + self.tokenizer = sacremoses.MosesTokenizer(lang) + self.detokenizer = sacremoses.MosesDetokenizer(lang) + + def tokenize(self, text: str) -> List[str]: + return self.tokenizer.tokenize(text) + + def detokenize(self, tokens: List[str]) -> str: + return self.detokenizer.detokenize(tokens) + + 
+class IcuTokenizer(Tokenizer): + """ + Uses ICU based word segmenter https://pypi.org/project/PyICU/ + Preserves whitespaces as tokens by replacing them with a special character "▁". + Allows lossless reconstruction of the original text on detokenization. + """ + + # Same character is used by SentencePiece + SPACE_TOKEN = "▁" + + def tokenize(self, text: str) -> List[str]: + from icu import BreakIterator, Locale + + bi = BreakIterator.createWordInstance(Locale(self.lang)) + bi.setText(text) + + tokens = [] + start = bi.first() + for end in bi: + token = text[start:end] + if ( + token and token != "\n" + ): # exclude empty tokens, but leave whitespaces and replace them with a special token + tokens.append(token.replace(" ", self.SPACE_TOKEN)) + start = end + return tokens + + def detokenize(self, tokens: List[str]) -> str: + return "".join(tokens).replace(self.SPACE_TOKEN, " ") + + def _read_file_in_chunks(file_path, chunk_size): with open(file_path, "r", encoding="utf-8") as file: while True: lines = file.readlines(chunk_size) if not lines: break - yield lines + yield [line.rstrip() for line in lines] def _tokenize_lines(params) -> List[str]: - lines, lang = params - from mosestokenizer import MosesTokenizer - - try: - tokenizer = MosesTokenizer(lang) - except RuntimeError as err: - msg = str(err) - if "No known abbreviations for language" in msg: - # Fall-back to English if the language is not found - tokenizer = MosesTokenizer("en") - else: - raise err + lines, lang, tok_type = params + + if tok_type == TokenizerType.fast_moses: + tokenizer = FastMosesTokenizer(lang) + elif tok_type == TokenizerType.sacre_moses: + tokenizer = SacreMosesTokenizer(lang) + elif tok_type == TokenizerType.icu: + tokenizer = IcuTokenizer(lang) + else: + raise ValueError(f"Unknown tokenizer type: {tok_type}") tokenized = [] for line in lines: @@ -52,8 +153,12 @@ def _tokenize_lines(params) -> List[str]: return tokenized -def tokenize_moses( - input_path: str, output_path: str, lang: 
str, sentences_per_chunk: int = 100000 +def tokenize( + input_path: str, + output_path: str, + lang: str, + tokenizer: TokenizerType, + sentences_per_chunk: int = 100000, ) -> None: logger.info(f"Tokenizing {input_path} with Moses tokenizer") @@ -65,7 +170,7 @@ def tokenize_moses( # ~100K sentences per second on a single core for tokenized_chunk in pool.imap( _tokenize_lines, - ((ch, lang) for ch in chunks), + ((ch, lang, tokenizer) for ch in chunks), ): output_file.write("\n".join(tokenized_chunk) + "\n") pbar.update(len(tokenized_chunk)) @@ -104,5 +209,19 @@ def tokenize_moses( default=None, help="Number of lines to process per chunk", ) + parser.add_argument( + "--tokenizer", + metavar="TOKENIZER", + type=TokenizerType, + choices=TokenizerType, + default=TokenizerType.icu, + help="Tokenization method", + ) args = parser.parse_args() - tokenize_moses(args.input_path, args.output_path, args.lang, args.chunk_size) + tokenize( + input_path=args.input_path, + output_path=args.output_path, + lang=args.lang, + sentences_per_chunk=args.chunk_size, + tokenizer=args.tokenizer, + ) diff --git a/pipeline/data/requirements/data.in b/pipeline/data/requirements/data.in index d09287692..b8c17ed5e 100644 --- a/pipeline/data/requirements/data.in +++ b/pipeline/data/requirements/data.in @@ -1,5 +1,5 @@ -# use the latest main, switch to PyPi when released -git+https://github.com/hplt-project/OpusTrainer.git@c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21 +# TODO: ICU tokenizer commit, replace with the PyPI version when released. 
Issue: https://github.com/mozilla/translations/issues/967 +git+https://github.com/mozilla/OpusTrainer.git@554b7202cecbb2eaae38819aebb6c5020685f670 simalign==0.4 mtdata==0.4.1 psutil==6.0.0 diff --git a/pipeline/data/requirements/data.txt b/pipeline/data/requirements/data.txt index fdf43babc..034e61704 100644 --- a/pipeline/data/requirements/data.txt +++ b/pipeline/data/requirements/data.txt @@ -62,7 +62,7 @@ numpy==1.26.4 # transformers opencc==1.1.9 # via -r pipeline/data/requirements/data.in -opustrainer @ git+https://github.com/hplt-project/OpusTrainer.git@c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21 +opustrainer @ git+https://github.com/mozilla/OpusTrainer.git@554b7202cecbb2eaae38819aebb6c5020685f670 # via -r pipeline/data/requirements/data.in packaging==24.1 # via diff --git a/pipeline/train/configs/opustrainer/student.cjk.yml b/pipeline/train/configs/opustrainer/student.cjk.yml index af6936d77..fe05bddba 100644 --- a/pipeline/train/configs/opustrainer/student.cjk.yml +++ b/pipeline/train/configs/opustrainer/student.cjk.yml @@ -21,8 +21,9 @@ modifiers: # Tags modifier has to be the last one to retokenize the alignments - Tags: 0.005 augment: 1 - custom_detok_src: {src} - custom_detok_trg: {trg} + tag: 0 + custom_detok_src: "icu:{src}" + custom_detok_trg: "icu:{trg}" spm_vocab: {vocab} seed: 1111 diff --git a/pipeline/train/configs/opustrainer/student.yml b/pipeline/train/configs/opustrainer/student.yml index c3a3df12a..6cc7d67c5 100644 --- a/pipeline/train/configs/opustrainer/student.yml +++ b/pipeline/train/configs/opustrainer/student.yml @@ -26,8 +26,9 @@ modifiers: # Tags modifier has to be the last one to retokenize the alignments - Tags: 0.005 augment: 1 - custom_detok_src: {src} - custom_detok_trg: {trg} + tag: 0 + custom_detok_src: "icu:{src}" + custom_detok_trg: "icu:{trg}" spm_vocab: {vocab} seed: 1111 diff --git a/pipeline/train/configs/opustrainer/teacher.one-stage.yml b/pipeline/train/configs/opustrainer/teacher.one-stage.yml index e4a3d586d..2ec4f69bf 
100644 --- a/pipeline/train/configs/opustrainer/teacher.one-stage.yml +++ b/pipeline/train/configs/opustrainer/teacher.one-stage.yml @@ -30,9 +30,10 @@ modifiers: # we don't use alignments for teacher training # Tags modifier has to be the last one to remove the alignments - Tags: 0.005 - custom_detok_src: {src} - custom_detok_trg: {trg} + custom_detok_src: "icu:{src}" + custom_detok_trg: "icu:{trg}" augment: 1 + tag: 0 # random seed should be different for different teacher models diff --git a/pipeline/train/configs/opustrainer/teacher.two-stage.cjk.yml b/pipeline/train/configs/opustrainer/teacher.two-stage.cjk.yml index a1fa599ec..ae059fe2a 100644 --- a/pipeline/train/configs/opustrainer/teacher.two-stage.cjk.yml +++ b/pipeline/train/configs/opustrainer/teacher.two-stage.cjk.yml @@ -30,9 +30,10 @@ modifiers: # we don't use alignments for teacher training # Tags modifier has to be the last one to remove the alignments - Tags: 0.005 - custom_detok_src: {src} - custom_detok_trg: {trg} + custom_detok_src: "icu:{src}" + custom_detok_trg: "icu:{trg}" augment: 1 + tag: 0 # random seed should be different for different teacher models diff --git a/pipeline/train/configs/opustrainer/teacher.two-stage.yml b/pipeline/train/configs/opustrainer/teacher.two-stage.yml index 0f33402d3..83fc26a2f 100644 --- a/pipeline/train/configs/opustrainer/teacher.two-stage.yml +++ b/pipeline/train/configs/opustrainer/teacher.two-stage.yml @@ -35,9 +35,10 @@ modifiers: # we don't use alignments for teacher training # Tags modifier has to be the last one to remove the alignments - Tags: 0.005 - custom_detok_src: {src} - custom_detok_trg: {trg} + custom_detok_src: "icu:{src}" + custom_detok_trg: "icu:{trg}" augment: 1 + tag: 0 # random seed should be different for different teacher models diff --git a/pipeline/train/requirements/train.in b/pipeline/train/requirements/train.in index 5f461a0c2..0f2d3f6d9 100644 --- a/pipeline/train/requirements/train.in +++ b/pipeline/train/requirements/train.in 
@@ -1,3 +1,3 @@ -# use the latest main, switch to PyPi when released -git+https://github.com/hplt-project/OpusTrainer.git@c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21 +# TODO: ICU tokenizer commit, replace with the PyPI version when released. Issue: https://github.com/mozilla/translations/issues/967 +git+https://github.com/mozilla/OpusTrainer.git@554b7202cecbb2eaae38819aebb6c5020685f670 gpustat==1.1.1 diff --git a/pipeline/train/requirements/train.txt b/pipeline/train/requirements/train.txt index 34ad968ad..f31898c29 100644 --- a/pipeline/train/requirements/train.txt +++ b/pipeline/train/requirements/train.txt @@ -14,7 +14,7 @@ joblib==1.3.2 # via sacremoses nvidia-ml-py==12.560.30 # via gpustat -opustrainer @ git+https://github.com/hplt-project/OpusTrainer.git@c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21 +opustrainer @ git+https://github.com/mozilla/OpusTrainer.git@554b7202cecbb2eaae38819aebb6c5020685f670 # via -r pipeline/train/requirements/train.in psutil==6.1.0 # via gpustat diff --git a/poetry.lock b/poetry.lock index 5497ea803..b11184f6f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. 
[[package]] name = "absl-py" @@ -41,87 +41,87 @@ files = [ [[package]] name = "aiohttp" -version = "3.11.10" +version = "3.11.11" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.9" files = [ - {file = "aiohttp-3.11.10-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cbad88a61fa743c5d283ad501b01c153820734118b65aee2bd7dbb735475ce0d"}, - {file = "aiohttp-3.11.10-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:80886dac673ceaef499de2f393fc80bb4481a129e6cb29e624a12e3296cc088f"}, - {file = "aiohttp-3.11.10-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:61b9bae80ed1f338c42f57c16918853dc51775fb5cb61da70d590de14d8b5fb4"}, - {file = "aiohttp-3.11.10-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9e2e576caec5c6a6b93f41626c9c02fc87cd91538b81a3670b2e04452a63def6"}, - {file = "aiohttp-3.11.10-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:02c13415b5732fb6ee7ff64583a5e6ed1c57aa68f17d2bda79c04888dfdc2769"}, - {file = "aiohttp-3.11.10-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4cfce37f31f20800a6a6620ce2cdd6737b82e42e06e6e9bd1b36f546feb3c44f"}, - {file = "aiohttp-3.11.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3bbbfff4c679c64e6e23cb213f57cc2c9165c9a65d63717108a644eb5a7398df"}, - {file = "aiohttp-3.11.10-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:49c7dbbc1a559ae14fc48387a115b7d4bbc84b4a2c3b9299c31696953c2a5219"}, - {file = "aiohttp-3.11.10-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:68386d78743e6570f054fe7949d6cb37ef2b672b4d3405ce91fafa996f7d9b4d"}, - {file = "aiohttp-3.11.10-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:9ef405356ba989fb57f84cac66f7b0260772836191ccefbb987f414bcd2979d9"}, - {file = "aiohttp-3.11.10-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = 
"sha256:5d6958671b296febe7f5f859bea581a21c1d05430d1bbdcf2b393599b1cdce77"}, - {file = "aiohttp-3.11.10-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:99b7920e7165be5a9e9a3a7f1b680f06f68ff0d0328ff4079e5163990d046767"}, - {file = "aiohttp-3.11.10-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:0dc49f42422163efb7e6f1df2636fe3db72713f6cd94688e339dbe33fe06d61d"}, - {file = "aiohttp-3.11.10-cp310-cp310-win32.whl", hash = "sha256:40d1c7a7f750b5648642586ba7206999650208dbe5afbcc5284bcec6579c9b91"}, - {file = "aiohttp-3.11.10-cp310-cp310-win_amd64.whl", hash = "sha256:68ff6f48b51bd78ea92b31079817aff539f6c8fc80b6b8d6ca347d7c02384e33"}, - {file = "aiohttp-3.11.10-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:77c4aa15a89847b9891abf97f3d4048f3c2d667e00f8a623c89ad2dccee6771b"}, - {file = "aiohttp-3.11.10-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:909af95a72cedbefe5596f0bdf3055740f96c1a4baa0dd11fd74ca4de0b4e3f1"}, - {file = "aiohttp-3.11.10-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:386fbe79863eb564e9f3615b959e28b222259da0c48fd1be5929ac838bc65683"}, - {file = "aiohttp-3.11.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3de34936eb1a647aa919655ff8d38b618e9f6b7f250cc19a57a4bf7fd2062b6d"}, - {file = "aiohttp-3.11.10-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0c9527819b29cd2b9f52033e7fb9ff08073df49b4799c89cb5754624ecd98299"}, - {file = "aiohttp-3.11.10-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65a96e3e03300b41f261bbfd40dfdbf1c301e87eab7cd61c054b1f2e7c89b9e8"}, - {file = "aiohttp-3.11.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98f5635f7b74bcd4f6f72fcd85bea2154b323a9f05226a80bc7398d0c90763b0"}, - {file = "aiohttp-3.11.10-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:03b6002e20938fc6ee0918c81d9e776bebccc84690e2b03ed132331cca065ee5"}, - {file = 
"aiohttp-3.11.10-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6362cc6c23c08d18ddbf0e8c4d5159b5df74fea1a5278ff4f2c79aed3f4e9f46"}, - {file = "aiohttp-3.11.10-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:3691ed7726fef54e928fe26344d930c0c8575bc968c3e239c2e1a04bd8cf7838"}, - {file = "aiohttp-3.11.10-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:31d5093d3acd02b31c649d3a69bb072d539d4c7659b87caa4f6d2bcf57c2fa2b"}, - {file = "aiohttp-3.11.10-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:8b3cf2dc0f0690a33f2d2b2cb15db87a65f1c609f53c37e226f84edb08d10f52"}, - {file = "aiohttp-3.11.10-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:fbbaea811a2bba171197b08eea288b9402faa2bab2ba0858eecdd0a4105753a3"}, - {file = "aiohttp-3.11.10-cp311-cp311-win32.whl", hash = "sha256:4b2c7ac59c5698a7a8207ba72d9e9c15b0fc484a560be0788b31312c2c5504e4"}, - {file = "aiohttp-3.11.10-cp311-cp311-win_amd64.whl", hash = "sha256:974d3a2cce5fcfa32f06b13ccc8f20c6ad9c51802bb7f829eae8a1845c4019ec"}, - {file = "aiohttp-3.11.10-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:b78f053a7ecfc35f0451d961dacdc671f4bcbc2f58241a7c820e9d82559844cf"}, - {file = "aiohttp-3.11.10-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ab7485222db0959a87fbe8125e233b5a6f01f4400785b36e8a7878170d8c3138"}, - {file = "aiohttp-3.11.10-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:cf14627232dfa8730453752e9cdc210966490992234d77ff90bc8dc0dce361d5"}, - {file = "aiohttp-3.11.10-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:076bc454a7e6fd646bc82ea7f98296be0b1219b5e3ef8a488afbdd8e81fbac50"}, - {file = "aiohttp-3.11.10-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:482cafb7dc886bebeb6c9ba7925e03591a62ab34298ee70d3dd47ba966370d2c"}, - {file = "aiohttp-3.11.10-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bf3d1a519a324af764a46da4115bdbd566b3c73fb793ffb97f9111dbc684fc4d"}, - {file = 
"aiohttp-3.11.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:24213ba85a419103e641e55c27dc7ff03536c4873470c2478cce3311ba1eee7b"}, - {file = "aiohttp-3.11.10-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b99acd4730ad1b196bfb03ee0803e4adac371ae8efa7e1cbc820200fc5ded109"}, - {file = "aiohttp-3.11.10-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:14cdb5a9570be5a04eec2ace174a48ae85833c2aadc86de68f55541f66ce42ab"}, - {file = "aiohttp-3.11.10-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:7e97d622cb083e86f18317282084bc9fbf261801b0192c34fe4b1febd9f7ae69"}, - {file = "aiohttp-3.11.10-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:012f176945af138abc10c4a48743327a92b4ca9adc7a0e078077cdb5dbab7be0"}, - {file = "aiohttp-3.11.10-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:44224d815853962f48fe124748227773acd9686eba6dc102578defd6fc99e8d9"}, - {file = "aiohttp-3.11.10-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c87bf31b7fdab94ae3adbe4a48e711bfc5f89d21cf4c197e75561def39e223bc"}, - {file = "aiohttp-3.11.10-cp312-cp312-win32.whl", hash = "sha256:06a8e2ee1cbac16fe61e51e0b0c269400e781b13bcfc33f5425912391a542985"}, - {file = "aiohttp-3.11.10-cp312-cp312-win_amd64.whl", hash = "sha256:be2b516f56ea883a3e14dda17059716593526e10fb6303189aaf5503937db408"}, - {file = "aiohttp-3.11.10-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8cc5203b817b748adccb07f36390feb730b1bc5f56683445bfe924fc270b8816"}, - {file = "aiohttp-3.11.10-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5ef359ebc6949e3a34c65ce20230fae70920714367c63afd80ea0c2702902ccf"}, - {file = "aiohttp-3.11.10-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9bca390cb247dbfaec3c664326e034ef23882c3f3bfa5fbf0b56cad0320aaca5"}, - {file = "aiohttp-3.11.10-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:811f23b3351ca532af598405db1093f018edf81368e689d1b508c57dcc6b6a32"}, - 
{file = "aiohttp-3.11.10-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ddf5f7d877615f6a1e75971bfa5ac88609af3b74796ff3e06879e8422729fd01"}, - {file = "aiohttp-3.11.10-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6ab29b8a0beb6f8eaf1e5049252cfe74adbaafd39ba91e10f18caeb0e99ffb34"}, - {file = "aiohttp-3.11.10-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c49a76c1038c2dd116fa443eba26bbb8e6c37e924e2513574856de3b6516be99"}, - {file = "aiohttp-3.11.10-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7f3dc0e330575f5b134918976a645e79adf333c0a1439dcf6899a80776c9ab39"}, - {file = "aiohttp-3.11.10-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:efb15a17a12497685304b2d976cb4939e55137df7b09fa53f1b6a023f01fcb4e"}, - {file = "aiohttp-3.11.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:db1d0b28fcb7f1d35600150c3e4b490775251dea70f894bf15c678fdd84eda6a"}, - {file = "aiohttp-3.11.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:15fccaf62a4889527539ecb86834084ecf6e9ea70588efde86e8bc775e0e7542"}, - {file = "aiohttp-3.11.10-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:593c114a2221444f30749cc5e5f4012488f56bd14de2af44fe23e1e9894a9c60"}, - {file = "aiohttp-3.11.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7852bbcb4d0d2f0c4d583f40c3bc750ee033265d80598d0f9cb6f372baa6b836"}, - {file = "aiohttp-3.11.10-cp313-cp313-win32.whl", hash = "sha256:65e55ca7debae8faaffee0ebb4b47a51b4075f01e9b641c31e554fd376595c6c"}, - {file = "aiohttp-3.11.10-cp313-cp313-win_amd64.whl", hash = "sha256:beb39a6d60a709ae3fb3516a1581777e7e8b76933bb88c8f4420d875bb0267c6"}, - {file = "aiohttp-3.11.10-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:0580f2e12de2138f34debcd5d88894786453a76e98febaf3e8fe5db62d01c9bf"}, - {file = "aiohttp-3.11.10-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:a55d2ad345684e7c3dd2c20d2f9572e9e1d5446d57200ff630e6ede7612e307f"}, - {file = "aiohttp-3.11.10-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:04814571cb72d65a6899db6099e377ed00710bf2e3eafd2985166f2918beaf59"}, - {file = "aiohttp-3.11.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e44a9a3c053b90c6f09b1bb4edd880959f5328cf63052503f892c41ea786d99f"}, - {file = "aiohttp-3.11.10-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:502a1464ccbc800b4b1995b302efaf426e8763fadf185e933c2931df7db9a199"}, - {file = "aiohttp-3.11.10-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:613e5169f8ae77b1933e42e418a95931fb4867b2991fc311430b15901ed67079"}, - {file = "aiohttp-3.11.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4cca22a61b7fe45da8fc73c3443150c3608750bbe27641fc7558ec5117b27fdf"}, - {file = "aiohttp-3.11.10-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:86a5dfcc39309470bd7b68c591d84056d195428d5d2e0b5ccadfbaf25b026ebc"}, - {file = "aiohttp-3.11.10-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:77ae58586930ee6b2b6f696c82cf8e78c8016ec4795c53e36718365f6959dc82"}, - {file = "aiohttp-3.11.10-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:78153314f26d5abef3239b4a9af20c229c6f3ecb97d4c1c01b22c4f87669820c"}, - {file = "aiohttp-3.11.10-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:98283b94cc0e11c73acaf1c9698dea80c830ca476492c0fe2622bd931f34b487"}, - {file = "aiohttp-3.11.10-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:53bf2097e05c2accc166c142a2090e4c6fd86581bde3fd9b2d3f9e93dda66ac1"}, - {file = "aiohttp-3.11.10-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c5532f0441fc09c119e1dca18fbc0687e64fbeb45aa4d6a87211ceaee50a74c4"}, - {file = "aiohttp-3.11.10-cp39-cp39-win32.whl", hash = "sha256:47ad15a65fb41c570cd0ad9a9ff8012489e68176e7207ec7b82a0940dddfd8be"}, - {file = 
"aiohttp-3.11.10-cp39-cp39-win_amd64.whl", hash = "sha256:c6b9e6d7e41656d78e37ce754813fa44b455c3d0d0dced2a047def7dc5570b74"}, - {file = "aiohttp-3.11.10.tar.gz", hash = "sha256:b1fc6b45010a8d0ff9e88f9f2418c6fd408c99c211257334aff41597ebece42e"}, + {file = "aiohttp-3.11.11-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a60804bff28662cbcf340a4d61598891f12eea3a66af48ecfdc975ceec21e3c8"}, + {file = "aiohttp-3.11.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4b4fa1cb5f270fb3eab079536b764ad740bb749ce69a94d4ec30ceee1b5940d5"}, + {file = "aiohttp-3.11.11-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:731468f555656767cda219ab42e033355fe48c85fbe3ba83a349631541715ba2"}, + {file = "aiohttp-3.11.11-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb23d8bb86282b342481cad4370ea0853a39e4a32a0042bb52ca6bdde132df43"}, + {file = "aiohttp-3.11.11-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f047569d655f81cb70ea5be942ee5d4421b6219c3f05d131f64088c73bb0917f"}, + {file = "aiohttp-3.11.11-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dd7659baae9ccf94ae5fe8bfaa2c7bc2e94d24611528395ce88d009107e00c6d"}, + {file = "aiohttp-3.11.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af01e42ad87ae24932138f154105e88da13ce7d202a6de93fafdafb2883a00ef"}, + {file = "aiohttp-3.11.11-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5854be2f3e5a729800bac57a8d76af464e160f19676ab6aea74bde18ad19d438"}, + {file = "aiohttp-3.11.11-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:6526e5fb4e14f4bbf30411216780c9967c20c5a55f2f51d3abd6de68320cc2f3"}, + {file = "aiohttp-3.11.11-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:85992ee30a31835fc482468637b3e5bd085fa8fe9392ba0bdcbdc1ef5e9e3c55"}, + {file = "aiohttp-3.11.11-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = 
"sha256:88a12ad8ccf325a8a5ed80e6d7c3bdc247d66175afedbe104ee2aaca72960d8e"}, + {file = "aiohttp-3.11.11-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:0a6d3fbf2232e3a08c41eca81ae4f1dff3d8f1a30bae415ebe0af2d2458b8a33"}, + {file = "aiohttp-3.11.11-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:84a585799c58b795573c7fa9b84c455adf3e1d72f19a2bf498b54a95ae0d194c"}, + {file = "aiohttp-3.11.11-cp310-cp310-win32.whl", hash = "sha256:bfde76a8f430cf5c5584553adf9926534352251d379dcb266ad2b93c54a29745"}, + {file = "aiohttp-3.11.11-cp310-cp310-win_amd64.whl", hash = "sha256:0fd82b8e9c383af11d2b26f27a478640b6b83d669440c0a71481f7c865a51da9"}, + {file = "aiohttp-3.11.11-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ba74ec819177af1ef7f59063c6d35a214a8fde6f987f7661f4f0eecc468a8f76"}, + {file = "aiohttp-3.11.11-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4af57160800b7a815f3fe0eba9b46bf28aafc195555f1824555fa2cfab6c1538"}, + {file = "aiohttp-3.11.11-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ffa336210cf9cd8ed117011085817d00abe4c08f99968deef0013ea283547204"}, + {file = "aiohttp-3.11.11-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:81b8fe282183e4a3c7a1b72f5ade1094ed1c6345a8f153506d114af5bf8accd9"}, + {file = "aiohttp-3.11.11-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3af41686ccec6a0f2bdc66686dc0f403c41ac2089f80e2214a0f82d001052c03"}, + {file = "aiohttp-3.11.11-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:70d1f9dde0e5dd9e292a6d4d00058737052b01f3532f69c0c65818dac26dc287"}, + {file = "aiohttp-3.11.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:249cc6912405917344192b9f9ea5cd5b139d49e0d2f5c7f70bdfaf6b4dbf3a2e"}, + {file = "aiohttp-3.11.11-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0eb98d90b6690827dcc84c246811feeb4e1eea683c0eac6caed7549be9c84665"}, + {file = 
"aiohttp-3.11.11-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ec82bf1fda6cecce7f7b915f9196601a1bd1a3079796b76d16ae4cce6d0ef89b"}, + {file = "aiohttp-3.11.11-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:9fd46ce0845cfe28f108888b3ab17abff84ff695e01e73657eec3f96d72eef34"}, + {file = "aiohttp-3.11.11-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:bd176afcf8f5d2aed50c3647d4925d0db0579d96f75a31e77cbaf67d8a87742d"}, + {file = "aiohttp-3.11.11-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:ec2aa89305006fba9ffb98970db6c8221541be7bee4c1d027421d6f6df7d1ce2"}, + {file = "aiohttp-3.11.11-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:92cde43018a2e17d48bb09c79e4d4cb0e236de5063ce897a5e40ac7cb4878773"}, + {file = "aiohttp-3.11.11-cp311-cp311-win32.whl", hash = "sha256:aba807f9569455cba566882c8938f1a549f205ee43c27b126e5450dc9f83cc62"}, + {file = "aiohttp-3.11.11-cp311-cp311-win_amd64.whl", hash = "sha256:ae545f31489548c87b0cced5755cfe5a5308d00407000e72c4fa30b19c3220ac"}, + {file = "aiohttp-3.11.11-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e595c591a48bbc295ebf47cb91aebf9bd32f3ff76749ecf282ea7f9f6bb73886"}, + {file = "aiohttp-3.11.11-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3ea1b59dc06396b0b424740a10a0a63974c725b1c64736ff788a3689d36c02d2"}, + {file = "aiohttp-3.11.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8811f3f098a78ffa16e0ea36dffd577eb031aea797cbdba81be039a4169e242c"}, + {file = "aiohttp-3.11.11-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd7227b87a355ce1f4bf83bfae4399b1f5bb42e0259cb9405824bd03d2f4336a"}, + {file = "aiohttp-3.11.11-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d40f9da8cabbf295d3a9dae1295c69975b86d941bc20f0a087f0477fa0a66231"}, + {file = "aiohttp-3.11.11-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ffb3dc385f6bb1568aa974fe65da84723210e5d9707e360e9ecb51f59406cd2e"}, + {file = 
"aiohttp-3.11.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8f5f7515f3552d899c61202d99dcb17d6e3b0de777900405611cd747cecd1b8"}, + {file = "aiohttp-3.11.11-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3499c7ffbfd9c6a3d8d6a2b01c26639da7e43d47c7b4f788016226b1e711caa8"}, + {file = "aiohttp-3.11.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8e2bf8029dbf0810c7bfbc3e594b51c4cc9101fbffb583a3923aea184724203c"}, + {file = "aiohttp-3.11.11-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b6212a60e5c482ef90f2d788835387070a88d52cf6241d3916733c9176d39eab"}, + {file = "aiohttp-3.11.11-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:d119fafe7b634dbfa25a8c597718e69a930e4847f0b88e172744be24515140da"}, + {file = "aiohttp-3.11.11-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:6fba278063559acc730abf49845d0e9a9e1ba74f85f0ee6efd5803f08b285853"}, + {file = "aiohttp-3.11.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:92fc484e34b733704ad77210c7957679c5c3877bd1e6b6d74b185e9320cc716e"}, + {file = "aiohttp-3.11.11-cp312-cp312-win32.whl", hash = "sha256:9f5b3c1ed63c8fa937a920b6c1bec78b74ee09593b3f5b979ab2ae5ef60d7600"}, + {file = "aiohttp-3.11.11-cp312-cp312-win_amd64.whl", hash = "sha256:1e69966ea6ef0c14ee53ef7a3d68b564cc408121ea56c0caa2dc918c1b2f553d"}, + {file = "aiohttp-3.11.11-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:541d823548ab69d13d23730a06f97460f4238ad2e5ed966aaf850d7c369782d9"}, + {file = "aiohttp-3.11.11-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:929f3ed33743a49ab127c58c3e0a827de0664bfcda566108989a14068f820194"}, + {file = "aiohttp-3.11.11-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0882c2820fd0132240edbb4a51eb8ceb6eef8181db9ad5291ab3332e0d71df5f"}, + {file = "aiohttp-3.11.11-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b63de12e44935d5aca7ed7ed98a255a11e5cb47f83a9fded7a5e41c40277d104"}, + 
{file = "aiohttp-3.11.11-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aa54f8ef31d23c506910c21163f22b124facb573bff73930735cf9fe38bf7dff"}, + {file = "aiohttp-3.11.11-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a344d5dc18074e3872777b62f5f7d584ae4344cd6006c17ba12103759d407af3"}, + {file = "aiohttp-3.11.11-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b7fb429ab1aafa1f48578eb315ca45bd46e9c37de11fe45c7f5f4138091e2f1"}, + {file = "aiohttp-3.11.11-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c341c7d868750e31961d6d8e60ff040fb9d3d3a46d77fd85e1ab8e76c3e9a5c4"}, + {file = "aiohttp-3.11.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ed9ee95614a71e87f1a70bc81603f6c6760128b140bc4030abe6abaa988f1c3d"}, + {file = "aiohttp-3.11.11-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:de8d38f1c2810fa2a4f1d995a2e9c70bb8737b18da04ac2afbf3971f65781d87"}, + {file = "aiohttp-3.11.11-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:a9b7371665d4f00deb8f32208c7c5e652059b0fda41cf6dbcac6114a041f1cc2"}, + {file = "aiohttp-3.11.11-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:620598717fce1b3bd14dd09947ea53e1ad510317c85dda2c9c65b622edc96b12"}, + {file = "aiohttp-3.11.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:bf8d9bfee991d8acc72d060d53860f356e07a50f0e0d09a8dfedea1c554dd0d5"}, + {file = "aiohttp-3.11.11-cp313-cp313-win32.whl", hash = "sha256:9d73ee3725b7a737ad86c2eac5c57a4a97793d9f442599bea5ec67ac9f4bdc3d"}, + {file = "aiohttp-3.11.11-cp313-cp313-win_amd64.whl", hash = "sha256:c7a06301c2fb096bdb0bd25fe2011531c1453b9f2c163c8031600ec73af1cc99"}, + {file = "aiohttp-3.11.11-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:3e23419d832d969f659c208557de4a123e30a10d26e1e14b73431d3c13444c2e"}, + {file = "aiohttp-3.11.11-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:21fef42317cf02e05d3b09c028712e1d73a9606f02467fd803f7c1f39cc59add"}, + {file = "aiohttp-3.11.11-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1f21bb8d0235fc10c09ce1d11ffbd40fc50d3f08a89e4cf3a0c503dc2562247a"}, + {file = "aiohttp-3.11.11-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1642eceeaa5ab6c9b6dfeaaa626ae314d808188ab23ae196a34c9d97efb68350"}, + {file = "aiohttp-3.11.11-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2170816e34e10f2fd120f603e951630f8a112e1be3b60963a1f159f5699059a6"}, + {file = "aiohttp-3.11.11-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8be8508d110d93061197fd2d6a74f7401f73b6d12f8822bbcd6d74f2b55d71b1"}, + {file = "aiohttp-3.11.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4eed954b161e6b9b65f6be446ed448ed3921763cc432053ceb606f89d793927e"}, + {file = "aiohttp-3.11.11-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6c9af134da4bc9b3bd3e6a70072509f295d10ee60c697826225b60b9959acdd"}, + {file = "aiohttp-3.11.11-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:44167fc6a763d534a6908bdb2592269b4bf30a03239bcb1654781adf5e49caf1"}, + {file = "aiohttp-3.11.11-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:479b8c6ebd12aedfe64563b85920525d05d394b85f166b7873c8bde6da612f9c"}, + {file = "aiohttp-3.11.11-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:10b4ff0ad793d98605958089fabfa350e8e62bd5d40aa65cdc69d6785859f94e"}, + {file = "aiohttp-3.11.11-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:b540bd67cfb54e6f0865ceccd9979687210d7ed1a1cc8c01f8e67e2f1e883d28"}, + {file = "aiohttp-3.11.11-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1dac54e8ce2ed83b1f6b1a54005c87dfed139cf3f777fdc8afc76e7841101226"}, + {file = "aiohttp-3.11.11-cp39-cp39-win32.whl", hash = "sha256:568c1236b2fde93b7720f95a890741854c1200fba4a3471ff48b2934d2d93fd3"}, + {file = 
"aiohttp-3.11.11-cp39-cp39-win_amd64.whl", hash = "sha256:943a8b052e54dfd6439fd7989f67fc6a7f2138d0a2cf0a7de5f18aa4fe7eb3b1"}, + {file = "aiohttp-3.11.11.tar.gz", hash = "sha256:bb49c7f1e6ebf3821a42d81d494f538107610c3a705987f53068546b0e90303e"}, ] [package.dependencies] @@ -139,13 +139,13 @@ speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"] [[package]] name = "aiosignal" -version = "1.3.1" +version = "1.3.2" description = "aiosignal: a list of registered asynchronous callbacks" optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" files = [ - {file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"}, - {file = "aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"}, + {file = "aiosignal-1.3.2-py2.py3-none-any.whl", hash = "sha256:45cde58e409a301715980c2b01d0c28bdde3770d8290b5eb2173759d9acb31a5"}, + {file = "aiosignal-1.3.2.tar.gz", hash = "sha256:a8c255c66fafb1e499c9351d0bf32ff2d8a0321595ebac3b93713656d2436f54"}, ] [package.dependencies] @@ -197,13 +197,13 @@ files = [ [[package]] name = "argcomplete" -version = "3.5.1" +version = "3.5.2" description = "Bash tab completion for argparse" optional = false python-versions = ">=3.8" files = [ - {file = "argcomplete-3.5.1-py3-none-any.whl", hash = "sha256:1a1d148bdaa3e3b93454900163403df41448a248af01b6e849edc5ac08e6c363"}, - {file = "argcomplete-3.5.1.tar.gz", hash = "sha256:eb1ee355aa2557bd3d0145de7b06b2a45b0ce461e1e7813f5d066039ab4177b4"}, + {file = "argcomplete-3.5.2-py3-none-any.whl", hash = "sha256:036d020d79048a5d525bc63880d7a4b8d1668566b8a76daf1144c0bbe0f63472"}, + {file = "argcomplete-3.5.2.tar.gz", hash = "sha256:23146ed7ac4403b70bd6026402468942ceba34a6732255b9edf5b7354f68a6bb"}, ] [package.extras] @@ -241,19 +241,19 @@ files = [ [[package]] name = "attrs" -version = "24.2.0" +version = "24.3.0" description = "Classes Without Boilerplate" optional = false -python-versions 
= ">=3.7" +python-versions = ">=3.8" files = [ - {file = "attrs-24.2.0-py3-none-any.whl", hash = "sha256:81921eb96de3191c8258c199618104dd27ac608d9366f5e35d011eae1867ede2"}, - {file = "attrs-24.2.0.tar.gz", hash = "sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346"}, + {file = "attrs-24.3.0-py3-none-any.whl", hash = "sha256:ac96cd038792094f438ad1f6ff80837353805ac950cd2aa0e0625ef19850c308"}, + {file = "attrs-24.3.0.tar.gz", hash = "sha256:8f5c07333d543103541ba7be0e2ce16eeee8130cb0b3f9238ab904ce1e85baff"}, ] [package.extras] benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"] cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit-uv", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier (<24.7)"] tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] @@ -419,12 +419,12 @@ isodate = ">=0.6.1" [[package]] name = "azureml-core" -version = "1.58.0.post1" +version = "1.59.0" description = "Azure Machine Learning core packages, modules, and classes" optional = false python-versions = "<4.0,>=3.8" files = [ - {file = "azureml_core-1.58.0.post1-py3-none-any.whl", hash = "sha256:02af855badd3229656749a1af970a0ca2015fdbe3a9c706e56b64752b372a7dd"}, + {file = "azureml_core-1.59.0-py3-none-any.whl", hash = 
"sha256:c7bde8b4a54ffc470e6342a089f62bfa07dbe855fd3f9dc90217b08f110daa7b"}, ] [package.dependencies] @@ -436,7 +436,7 @@ azure-graphrbac = ">=0.40.0,<1.0.0" azure-mgmt-authorization = ">=0.40.0,<5" azure-mgmt-containerregistry = ">=8.2.0,<11" azure-mgmt-keyvault = ">=0.40.0,<11.0.0" -azure-mgmt-network = "<=28.0.0" +azure-mgmt-network = "<=29.0.0" azure-mgmt-resource = ">=15.0.0,<=24.0.0" azure-mgmt-storage = ">=16.0.0,<=22.0.0" "backports.tempfile" = "*" @@ -444,7 +444,7 @@ contextlib2 = "<22.0.0" docker = "<8.0.0" humanfriendly = ">=4.7,<11.0" jmespath = "<2.0.0" -jsonpickle = "<4.0.0" +jsonpickle = "<5.0.0" knack = "<0.13.0" msal = ">=1.15.0,<2.0.0" msal-extensions = ">=0.3.0,<=2.0.0" @@ -640,13 +640,13 @@ files = [ [[package]] name = "certifi" -version = "2024.8.30" +version = "2024.12.14" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8"}, - {file = "certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9"}, + {file = "certifi-2024.12.14-py3-none-any.whl", hash = "sha256:1275f7a45be9464efc1173084eaa30f866fe2e47d389406136d332ed4967ec56"}, + {file = "certifi-2024.12.14.tar.gz", hash = "sha256:b650d30f370c2b724812bee08008be0c4163b163ddaec3f2546c1caf65f191db"}, ] [[package]] @@ -923,7 +923,6 @@ files = [ {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:761817a3377ef15ac23cd7834715081791d4ec77f9297ee694ca1ee9c2c7e5eb"}, {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3c672a53c0fb4725a29c303be906d3c1fa99c32f58abe008a82705f9ee96f40b"}, {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:4ac4c9f37eba52cb6fbeaf5b59c152ea976726b865bd4cf87883a7e7006cc543"}, - {file = "cryptography-44.0.0-cp37-abi3-manylinux_2_34_x86_64.whl", hash 
= "sha256:60eb32934076fa07e4316b7b2742fa52cbb190b42c2df2863dbc4230a0a9b385"}, {file = "cryptography-44.0.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ed3534eb1090483c96178fcb0f8893719d96d5274dfde98aa6add34614e97c8e"}, {file = "cryptography-44.0.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:f3f6fdfa89ee2d9d496e2c087cebef9d4fcbb0ad63c40e821b39f74bf48d9c5e"}, {file = "cryptography-44.0.0-cp37-abi3-win32.whl", hash = "sha256:eb33480f1bad5b78233b0ad3e1b0be21e8ef1da745d8d2aecbb20671658b9053"}, @@ -934,7 +933,6 @@ files = [ {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:c5eb858beed7835e5ad1faba59e865109f3e52b3783b9ac21e7e47dc5554e289"}, {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f53c2c87e0fb4b0c00fa9571082a057e37690a8f12233306161c8f4b819960b7"}, {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:9e6fc8a08e116fb7c7dd1f040074c9d7b51d74a8ea40d4df2fc7aa08b76b9e6c"}, - {file = "cryptography-44.0.0-cp39-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:9abcc2e083cbe8dde89124a47e5e53ec38751f0d7dfd36801008f316a127d7ba"}, {file = "cryptography-44.0.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:d2436114e46b36d00f8b72ff57e598978b37399d2786fd39793c36c6d5cb1c64"}, {file = "cryptography-44.0.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a01956ddfa0a6790d594f5b34fc1bfa6098aca434696a03cfdbe469b8ed79285"}, {file = "cryptography-44.0.0-cp39-abi3-win32.whl", hash = "sha256:eca27345e1214d1b9f9490d200f9db5a874479be914199194e746c893788d417"}, @@ -1265,13 +1263,13 @@ files = [ [[package]] name = "google-auth" -version = "2.36.0" +version = "2.37.0" description = "Google Authentication Library" optional = false python-versions = ">=3.7" files = [ - {file = "google_auth-2.36.0-py2.py3-none-any.whl", hash = "sha256:51a15d47028b66fd36e5c64a82d2d57480075bccc7da37cde257fc94177a61fb"}, - {file = "google_auth-2.36.0.tar.gz", hash = 
"sha256:545e9618f2df0bcbb7dcbc45a546485b1212624716975a1ea5ae8149ce769ab1"}, + {file = "google_auth-2.37.0-py2.py3-none-any.whl", hash = "sha256:42664f18290a6be591be5329a96fe30184be1a1badb7292a7f686a9659de9ca0"}, + {file = "google_auth-2.37.0.tar.gz", hash = "sha256:0054623abf1f9c83492c63d3f47e77f0a544caa3d40b2d98e099a611c2dd5d00"}, ] [package.dependencies] @@ -1282,6 +1280,7 @@ rsa = ">=3.1.4,<5" [package.extras] aiohttp = ["aiohttp (>=3.6.2,<4.0.0.dev0)", "requests (>=2.20.0,<3.0.0.dev0)"] enterprise-cert = ["cryptography", "pyopenssl"] +pyjwt = ["cryptography (>=38.0.3)", "pyjwt (>=2.0)"] pyopenssl = ["cryptography (>=38.0.3)", "pyopenssl (>=20.0.0)"] reauth = ["pyu2f (>=0.1.5)"] requests = ["requests (>=2.20.0,<3.0.0.dev0)"] @@ -1599,18 +1598,18 @@ release = ["towncrier"] [[package]] name = "jsonpickle" -version = "3.4.2" +version = "4.0.1" description = "jsonpickle encodes/decodes any Python object to/from JSON" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "jsonpickle-3.4.2-py3-none-any.whl", hash = "sha256:fd6c273278a02b3b66e3405db3dd2f4dbc8f4a4a3123bfcab3045177c6feb9c3"}, - {file = "jsonpickle-3.4.2.tar.gz", hash = "sha256:2efa2778859b6397d5804b0a98d52cd2a7d9a70fcb873bc5a3ca5acca8f499ba"}, + {file = "jsonpickle-4.0.1-py3-none-any.whl", hash = "sha256:2973c0b0d988c6792ed6c446fa582c48352e79c2880fa2c013f1abde15905555"}, + {file = "jsonpickle-4.0.1.tar.gz", hash = "sha256:b5336144d902958b92cb08bc1e76bfa47199b8afd454303693894defd2fa50c5"}, ] [package.extras] cov = ["pytest-cov"] -dev = ["black"] +dev = ["black", "pyupgrade"] docs = ["furo", "rst.linker (>=1.9)", "sphinx (>=3.5)"] packaging = ["build", "setuptools (>=61.2)", "setuptools-scm[toml] (>=6.0)", "twine"] testing = ["PyYAML", "atheris (>=2.3.0,<2.4.0)", "bson", "ecdsa", "feedparser", "gmpy2", "numpy", "pandas", "pymongo", "pytest (>=6.0,!=8.1.*)", "pytest-benchmark", "pytest-benchmark[histogram]", "pytest-checkdocs (>=1.2.3)", "pytest-enabler (>=1.0.1)", 
"pytest-ruff (>=0.2.1)", "scikit-learn", "scipy", "scipy (>=1.9.3)", "simplejson", "sqlalchemy", "ujson"] @@ -2318,9 +2317,9 @@ typo = "0.1.5" [package.source] type = "git" -url = "https://github.com/hplt-project/OpusTrainer.git" -reference = "c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21" -resolved_reference = "c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21" +url = "https://github.com/mozilla/OpusTrainer.git" +reference = "554b7202cecbb2eaae38819aebb6c5020685f670" +resolved_reference = "554b7202cecbb2eaae38819aebb6c5020685f670" [[package]] name = "packaging" @@ -2776,6 +2775,16 @@ files = [ [package.extras] windows-terminal = ["colorama (>=0.4.6)"] +[[package]] +name = "pyicu" +version = "2.8.1" +description = "Python extension wrapping the ICU C++ API" +optional = false +python-versions = "*" +files = [ + {file = "PyICU-2.8.1.tar.gz", hash = "sha256:f0b9549a87f87ba7c413f13679d137271e0b37f1f39b0109ace38257d4d148d6"}, +] + [[package]] name = "pyjwt" version = "2.10.1" @@ -2877,13 +2886,13 @@ dev = ["build", "flake8", "mypy", "pytest", "twine"] [[package]] name = "pyright" -version = "1.1.390" +version = "1.1.391" description = "Command line wrapper for pyright" optional = false python-versions = ">=3.7" files = [ - {file = "pyright-1.1.390-py3-none-any.whl", hash = "sha256:ecebfba5b6b50af7c1a44c2ba144ba2ab542c227eb49bc1f16984ff714e0e110"}, - {file = "pyright-1.1.390.tar.gz", hash = "sha256:aad7f160c49e0fbf8209507a15e17b781f63a86a1facb69ca877c71ef2e9538d"}, + {file = "pyright-1.1.391-py3-none-any.whl", hash = "sha256:54fa186f8b3e8a55a44ebfa842636635688670c6896dcf6cf4a7fc75062f4d15"}, + {file = "pyright-1.1.391.tar.gz", hash = "sha256:66b2d42cdf5c3cbab05f2f4b76e8bec8aa78e679bfa0b6ad7b923d9e027cadb2"}, ] [package.dependencies] @@ -4502,4 +4511,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "c2579dfe6fad8f2cf8c5071ba2933c80db7f127289394c27bb5e2937de74791f" +content-hash = 
"0e1861eb058f93071adf9cd4e1d4e6ecb6f09e3a2b294ed27b19468c222665d6" diff --git a/pyproject.toml b/pyproject.toml index e05b3398f..5d29b5b0a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,8 +54,8 @@ sacrebleu="2.4.2" mtdata="0.4.1" requests="^2.26.0" pytest="7.4.3" -# use the latest main, switch to PyPi when released -opustrainer = {git = "https://github.com/hplt-project/OpusTrainer.git", rev="c966d7b353d6b3c6a09d9573f1ab6ba3221c1d21"} +# TODO: ICU tokenizer commit, replace to PyPi version when released. Issue: https://github.com/mozilla/translations/issues/967 +opustrainer = {git = "https://github.com/mozilla/OpusTrainer.git", rev="554b7202cecbb2eaae38819aebb6c5020685f670"} requests-mock = "^1.11.0" sh = "^2.0.6" zstandard = "^0.22.0" @@ -65,6 +65,7 @@ translations_taskgraph = {path="./taskcluster/", develop=true} sacremoses = "0.1.1" hanzidentifier = "1.2.0" OpenCC = "1.1.9" +PyICU = "2.8.1" [tool.black] extend-exclude= "/3rd_party" diff --git a/taskcluster/docker/test/Dockerfile b/taskcluster/docker/test/Dockerfile index 5257423e1..9a75824ac 100644 --- a/taskcluster/docker/test/Dockerfile +++ b/taskcluster/docker/test/Dockerfile @@ -15,6 +15,8 @@ RUN apt-get update -qq \ libboost-filesystem1.74.0 \ libboost-iostreams1.74.0 \ python3.10-venv \ + pkg-config \ + libicu-dev \ && apt-get clean diff --git a/taskcluster/docker/train/Dockerfile b/taskcluster/docker/train/Dockerfile index 2156d5080..f7d3864b5 100644 --- a/taskcluster/docker/train/Dockerfile +++ b/taskcluster/docker/train/Dockerfile @@ -14,6 +14,8 @@ RUN apt-get update -qq \ pigz \ curl \ wget \ + pkg-config \ + libicu-dev \ && apt-get clean diff --git a/taskcluster/kinds/alignments-backtranslated/kind.yml b/taskcluster/kinds/alignments-backtranslated/kind.yml index 218e230c2..ab32f4737 100644 --- a/taskcluster/kinds/alignments-backtranslated/kind.yml +++ b/taskcluster/kinds/alignments-backtranslated/kind.yml @@ -79,7 +79,7 @@ tasks: --corpus_trg=$MOZ_FETCHES_DIR/mono.{trg_locale}.zst 
--output_path=$TASK_WORKDIR/artifacts/mono.aln.zst --output_tokenized - --tokenization=moses + --tokenization=icu --priors_input_path=$MOZ_FETCHES_DIR/corpus.priors dependencies: diff --git a/taskcluster/kinds/alignments-original/kind.yml b/taskcluster/kinds/alignments-original/kind.yml index 2eeddf77d..b59e6db3b 100644 --- a/taskcluster/kinds/alignments-original/kind.yml +++ b/taskcluster/kinds/alignments-original/kind.yml @@ -79,7 +79,7 @@ tasks: --output_path=$TASK_WORKDIR/artifacts/corpus.aln.zst --output_tokenized --priors_output_path=$TASK_WORKDIR/artifacts/corpus.priors - --tokenization=moses + --tokenization=icu dependencies: merge-corpus: merge-corpus-{src_locale}-{trg_locale} diff --git a/taskcluster/kinds/alignments-student/kind.yml b/taskcluster/kinds/alignments-student/kind.yml index ed22eaa9d..7da677231 100644 --- a/taskcluster/kinds/alignments-student/kind.yml +++ b/taskcluster/kinds/alignments-student/kind.yml @@ -77,7 +77,7 @@ tasks: --corpus_trg=$MOZ_FETCHES_DIR/corpus.{trg_locale}.zst --output_path=$TASK_WORKDIR/artifacts/corpus.aln.zst --output_tokenized - --tokenization=moses + --tokenization=icu --priors_input_path=$MOZ_FETCHES_DIR/corpus.priors dependencies: diff --git a/taskcluster/kinds/finetune-student/kind.yml b/taskcluster/kinds/finetune-student/kind.yml index a1d7bc001..7ba921dab 100644 --- a/taskcluster/kinds/finetune-student/kind.yml +++ b/taskcluster/kinds/finetune-student/kind.yml @@ -104,7 +104,7 @@ tasks: finetune {src_locale} {trg_locale} - $MOZ_FETCHES_DIR/corpus.tok-moses + $MOZ_FETCHES_DIR/corpus.tok-icu $MOZ_FETCHES_DIR/devset $TASK_WORKDIR/artifacts {best_model} @@ -140,7 +140,7 @@ tasks: extract: false alignments: - artifact: corpus.aln.zst - - artifact: corpus.tok-moses.{src_locale}.zst + - artifact: corpus.tok-icu.{src_locale}.zst extract: false - - artifact: corpus.tok-moses.{trg_locale}.zst + - artifact: corpus.tok-icu.{trg_locale}.zst extract: false diff --git a/taskcluster/kinds/train-student/kind.yml 
b/taskcluster/kinds/train-student/kind.yml index 5256f993c..e3bbb1f35 100644 --- a/taskcluster/kinds/train-student/kind.yml +++ b/taskcluster/kinds/train-student/kind.yml @@ -105,7 +105,7 @@ tasks: train {src_locale} {trg_locale} - $MOZ_FETCHES_DIR/corpus.tok-moses + $MOZ_FETCHES_DIR/corpus.tok-icu $MOZ_FETCHES_DIR/devset $TASK_WORKDIR/artifacts {best_model} @@ -135,7 +135,7 @@ tasks: extract: false alignments: - artifact: corpus.aln.zst - - artifact: corpus.tok-moses.{src_locale}.zst + - artifact: corpus.tok-icu.{src_locale}.zst extract: false - - artifact: corpus.tok-moses.{trg_locale}.zst + - artifact: corpus.tok-icu.{trg_locale}.zst extract: false diff --git a/taskcluster/kinds/train-teacher/kind.yml b/taskcluster/kinds/train-teacher/kind.yml index 298d5573b..a14d64771 100644 --- a/taskcluster/kinds/train-teacher/kind.yml +++ b/taskcluster/kinds/train-teacher/kind.yml @@ -128,7 +128,7 @@ tasks: train {src_locale} {trg_locale} - $MOZ_FETCHES_DIR/corpus.tok-moses,$MOZ_FETCHES_DIR/mono.tok-moses + $MOZ_FETCHES_DIR/corpus.tok-icu,$MOZ_FETCHES_DIR/mono.tok-icu $MOZ_FETCHES_DIR/devset $TASK_WORKDIR/artifacts {best_model} @@ -159,13 +159,13 @@ tasks: extract: false alignments-original: - artifact: corpus.aln.zst - - artifact: corpus.tok-moses.{src_locale}.zst + - artifact: corpus.tok-icu.{src_locale}.zst extract: false - - artifact: corpus.tok-moses.{trg_locale}.zst + - artifact: corpus.tok-icu.{trg_locale}.zst extract: false alignments-backtranslated: - artifact: mono.aln.zst - - artifact: mono.tok-moses.{trg_locale}.zst + - artifact: mono.tok-icu.{trg_locale}.zst extract: false - - artifact: mono.tok-moses.{src_locale}.zst + - artifact: mono.tok-icu.{src_locale}.zst extract: false diff --git a/tests/test_alignments.py b/tests/test_alignments.py index 474e3115a..1f770b1b1 100644 --- a/tests/test_alignments.py +++ b/tests/test_alignments.py @@ -3,7 +3,7 @@ import pytest import sh -from fixtures import DataDir +from fixtures import DataDir, en_sample, zh_sample, 
FIXTURES_PATH TRG = "ru" @@ -22,7 +22,7 @@ ) # "|||" in the text can cause issues if joint fast_align style input is used -en_sample = """The little girl, seeing she had lost one of her pretty shoes, grew angry, and said to the Witch, “Give me back my shoe!” ||| one +en_sample_with_separator = """The little girl, seeing she had lost one of her pretty shoes, grew angry, and said to the Witch, “Give me back my shoe!” ||| one “I will not,” retorted the Witch, “for it is now my shoe, and not yours.” “You are a wicked creature!” cried Dorothy. “You have no right to take my shoe from me.” “I shall keep it, just the same,” said the Witch, laughing at her, “and someday I shall get the other one from you, too.” @@ -32,7 +32,7 @@ “I’m very sorry, indeed,” said Dorothy, who was truly frightened to see the Witch actually melting away like brown sugar before her very eyes. """ -ru_sample = """Маленькая девочка, увидев, что потеряла одну из своих красивых туфелек, рассердилась и сказала Ведьме: «Верни мне мою туфельку!» ||| один +ru_sample_with_separator = """Маленькая девочка, увидев, что потеряла одну из своих красивых туфелек, рассердилась и сказала Ведьме: «Верни мне мою туфельку!» ||| один «Я не буду, — парировала Ведьма, — потому что теперь это моя туфля, а не твоя». «Ты злое существо!» - воскликнула Дороти. «Ты не имеешь права забирать у меня туфлю». «Я все равно сохраню его, — сказала Ведьма, смеясь над ней, — и когда-нибудь я получу от тебя и другой». 
@@ -43,43 +43,52 @@ """ -def verify_alignments(data_dir, dataset, src_corpus, trg_corpus): +def verify_alignments(data_dir, dataset, src, trg): aln_path = os.path.join(data_dir.path, "artifacts", f"{dataset}.aln.zst") assert os.path.exists(aln_path) sh.zstd("-d", aln_path) with open(aln_path[:-4], "r") as f: - aln_lines = f.readlines() + aln_lines = f.read().splitlines() - src_tokenized_path = os.path.join(data_dir.path, "artifacts", f"{dataset}.tok-moses.{SRC}.zst") - trg_tokenized_path = os.path.join(data_dir.path, "artifacts", f"{dataset}.tok-moses.{TRG}.zst") + src_tokenized_path = os.path.join(data_dir.path, "artifacts", f"{dataset}.tok-icu.{src}.zst") + trg_tokenized_path = os.path.join(data_dir.path, "artifacts", f"{dataset}.tok-icu.{trg}.zst") sh.zstd("-d", src_tokenized_path, trg_tokenized_path) with open(src_tokenized_path[:-4], "r") as f: - src_lines = f.readlines() + src_lines = f.read().splitlines() with open(trg_tokenized_path[:-4], "r") as f: - trg_lines = f.readlines() + trg_lines = f.read().splitlines() assert len(aln_lines) == len(src_lines) assert len(aln_lines) == len(trg_lines) # verify alignment indices - for aln_line, src_line, trg_line in zip(aln_lines, src_lines, trg_lines): - alns = [pair.split("-") for pair in aln_line.split()] - src_tokens_num = len(src_line.split()) - trg_tokens_num = len(trg_line.split()) - - assert all( - int(src_idx) < src_tokens_num and int(trg_idx) < trg_tokens_num - for src_idx, trg_idx in alns - ) + with open(aln_path[:-4] + ".debug", "w") as f: + for aln_line, src_line, trg_line in zip(aln_lines, src_lines, trg_lines): + alns = [pair.split("-") for pair in aln_line.split()] + src_tokens = src_line.split(" ") + trg_tokens = trg_line.split(" ") + src_tokens_num = len(src_tokens) + trg_tokens_num = len(trg_tokens) + + assert all( + int(src_idx) < src_tokens_num and int(trg_idx) < trg_tokens_num + for src_idx, trg_idx in alns + ) + + aligned = [] + for src_idx, trg_idx in alns: + 
aligned.append((src_tokens[int(src_idx)], trg_tokens[int(trg_idx)])) + f.write(str(aligned)) + f.write("\n") def test_teacher_original_alignments(): data_dir = DataDir("test_alignments") - data_dir.create_zst("corpus.en.zst", en_sample) - data_dir.create_zst("corpus.ru.zst", ru_sample) + data_dir.create_zst("corpus.en.zst", en_sample_with_separator) + data_dir.create_zst("corpus.ru.zst", ru_sample_with_separator) env = { "TEST_ARTIFACTS": data_dir.path, "BIN": bin_dir, @@ -90,15 +99,36 @@ def test_teacher_original_alignments(): data_dir.run_task("alignments-original-en-ru", env=env) - verify_alignments(data_dir, "corpus", en_sample, ru_sample) + verify_alignments(data_dir, "corpus", SRC, TRG) -def test_teacher_backtranslated_alignments(): +def test_teacher_original_alignments_zh(): data_dir = DataDir("test_alignments") data_dir.create_zst("corpus.en.zst", en_sample) - data_dir.create_zst("mono.en.zst", en_sample) - data_dir.create_zst("corpus.ru.zst", ru_sample) - data_dir.create_zst("mono.ru.zst", ru_sample) + data_dir.create_zst("corpus.zh.zst", zh_sample) + env = { + "TEST_ARTIFACTS": data_dir.path, + "BIN": bin_dir, + "SRC": "en", + "TRG": "zh", + "ALN_CHUNK_LINES": "3", + } + + data_dir.run_task( + "alignments-original-en-zh", + env=env, + config=os.path.abspath(os.path.join(FIXTURES_PATH, "config.pytest.enzh.yml")), + ) + + verify_alignments(data_dir, "corpus", "en", "zh") + + +def test_teacher_backtranslated_alignments(): + data_dir = DataDir("test_alignments") + data_dir.create_zst("corpus.en.zst", en_sample_with_separator) + data_dir.create_zst("mono.en.zst", en_sample_with_separator) + data_dir.create_zst("corpus.ru.zst", ru_sample_with_separator) + data_dir.create_zst("mono.ru.zst", ru_sample_with_separator) env = { "TEST_ARTIFACTS": data_dir.path, "BIN": bin_dir, @@ -115,13 +145,13 @@ def test_teacher_backtranslated_alignments(): data_dir.run_task("alignments-backtranslated-en-ru", env=env) - verify_alignments(data_dir, "mono", en_sample, ru_sample) + 
verify_alignments(data_dir, "mono", SRC, TRG) def test_student_alignments(): data_dir = DataDir("test_alignments") - data_dir.create_zst("corpus.en.zst", en_sample) - data_dir.create_zst("corpus.ru.zst", ru_sample) + data_dir.create_zst("corpus.en.zst", en_sample_with_separator) + data_dir.create_zst("corpus.ru.zst", ru_sample_with_separator) env = { "TEST_ARTIFACTS": data_dir.path, "BIN": bin_dir, @@ -136,18 +166,18 @@ def test_student_alignments(): os.path.join(data_dir.path, "corpus.priors"), ) os.remove(os.path.join(data_dir.path, "artifacts", "corpus.aln.zst")) - data_dir.create_zst("corpus.en.zst", en_sample) - data_dir.create_zst("corpus.ru.zst", ru_sample) + data_dir.create_zst("corpus.en.zst", en_sample_with_separator) + data_dir.create_zst("corpus.ru.zst", ru_sample_with_separator) data_dir.run_task("alignments-student-en-ru", env=env) - verify_alignments(data_dir, "corpus", en_sample, ru_sample) + verify_alignments(data_dir, "corpus", SRC, TRG) def test_shortlist(): data_dir = DataDir("test_shortlist") - data_dir.create_zst("corpus.en.zst", en_sample) - data_dir.create_zst("corpus.ru.zst", ru_sample) + data_dir.create_zst("corpus.en.zst", en_sample_with_separator) + data_dir.create_zst("corpus.ru.zst", ru_sample_with_separator) env = { "TEST_ARTIFACTS": data_dir.path, "BIN": bin_dir, diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py new file mode 100644 index 000000000..6608f6e19 --- /dev/null +++ b/tests/test_tokenizer.py @@ -0,0 +1,172 @@ +import pytest + +from pipeline.alignments.tokenizer import IcuTokenizer, TokenizerType, tokenize +from fixtures import zh_sample, en_sample, ru_sample, DataDir + +tokenized_first_lines = { + "en": "The ▁ little ▁ girl , ▁ seeing ▁ she ▁ had ▁ lost ▁ one ▁ of ▁ her ▁ pretty ▁ shoes , ▁ grew ▁ angry , ▁ and ▁ said ▁ to ▁ the ▁ Witch , ▁ “ Give ▁ me ▁ back ▁ my ▁ shoe ! 
”", + "ru": "Маленькая ▁ девочка , ▁ увидев , ▁ что ▁ потеряла ▁ одну ▁ из ▁ своих ▁ красивых ▁ туфелек , ▁ рассердилась ▁ и ▁ сказала ▁ Ведьме : ▁ « Верни ▁ мне ▁ мою ▁ туфельку ! »", + "zh": "小 女孩 看到 自己 丢 了 一只 漂亮 的 鞋子 , 生气 了 , 对 女巫 说 : “ 把 我的 鞋子 还给 我 ! ”", +} + + +@pytest.mark.parametrize( + "lang,sample,first_line", + [ + ("en", en_sample, tokenized_first_lines["en"]), + ("ru", ru_sample, tokenized_first_lines["ru"]), + ("zh", zh_sample, tokenized_first_lines["zh"]), + ("zh", "这是一个简单的测试语句 🤣 。", "这 是 一个 简单 的 测试 语 句 ▁ 🤣▁ 。"), + ], + ids=["en", "ru", "zh", "zh2"], +) +def test_icu_tokenize_detokenize(lang, sample, first_line): + lines = sample.splitlines() + tokenizer = IcuTokenizer + icu_tokenizer = tokenizer(lang) + tok_lines = [] + detok_lines = [] + + for line in lines: + tokens = icu_tokenizer.tokenize(line) + detokenized = icu_tokenizer.detokenize(tokens) + tok_lines.append(" ".join(tokens)) + detok_lines.append(detokenized) + + assert lines == detok_lines + assert tok_lines[0] == first_line + + +@pytest.mark.parametrize( + "lang,sample", + [ + ( + "en", + en_sample, + ), + ( + "ru", + ru_sample, + ), + ("zh", zh_sample), + ], + ids=["en", "ru", "zh"], +) +def test_tokenizer(lang, sample): + data_dir = DataDir("test_tokenizer") + input_path = data_dir.create_file(f"input.{lang}.txt", sample) + output_path = data_dir.join(f"output.{lang}.txt") + + tokenize( + input_path=input_path, + output_path=output_path, + lang=lang, + tokenizer=TokenizerType.icu, + sentences_per_chunk=3, + ) + + with open(output_path) as f: + lines = f.read().splitlines() + + assert len(lines) == len(sample.splitlines()) + assert lines[0] == tokenized_first_lines[lang] + + +@pytest.mark.parametrize( + "lang,text,expected_tokenized", + [ + ( + "en", + "This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf", + "This , ▁ is ▁ a ▁ sentence ▁ with ▁ weird » ▁ symbols … ▁ appearing ▁ everywhere ¿", + ), + ("en", "abc def.", "abc ▁ def ."), + ("en", "2016, pp.", "2016 , ▁ pp 
."), + ( + "en", + "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [ ] & You're gonna shake it off? Don't?", + "This ▁ ain't ▁ funny . ▁ It's ▁ actually ▁ hillarious , ▁ yet ▁ double ▁ Ls . ▁ | ▁ [ ] ▁ < ▁ > ▁ [ ▁ ] ▁ & ▁ You're ▁ gonna ▁ shake ▁ it ▁ off ? ▁ Don't ?", + ), + ("en", "this 'is' the thing", "this ▁ ' is ' ▁ the ▁ thing"), + ( + "en", + "By the mid 1990s a version of the game became a Latvian television series (with a parliamentary setting, and played by Latvian celebrities).", + "By ▁ the ▁ mid ▁ 1990s ▁ a ▁ version ▁ of ▁ the ▁ game ▁ became ▁ a ▁ Latvian ▁ television ▁ series ▁ ( with ▁ a ▁ parliamentary ▁ setting , ▁ and ▁ played ▁ by ▁ Latvian ▁ celebrities ) .", + ), + ( + "en", + "The meeting will take place at 11:00 a.m. Tuesday.", + "The ▁ meeting ▁ will ▁ take ▁ place ▁ at ▁ 11 : 00 ▁ a.m . ▁ Tuesday .", + ), + ("en", "'Hello.'", "' Hello . '"), + ("en", "'So am I.", "' So ▁ am ▁ I ."), + ( + "fr", + "Des gens admirent une œuvre d'art.", + "Des ▁ gens ▁ admirent ▁ une ▁ œuvre ▁ d'art .", + ), + ("de", "...schwer wie ein iPhone 5.", ". . . schwer ▁ wie ▁ ein ▁ iPhone ▁ 5 ."), + ("cz", "Dvě děti, které běží bez bot.", "Dvě ▁ děti , ▁ které ▁ běží ▁ bez ▁ bot ."), + ( + "en", + "this is a webpage https://stackoverflow.com/questions/6181381/how-to-print-variables-in-perl that kicks ass", + "this ▁ is ▁ a ▁ webpage ▁ https : / / stackoverflow.com / questions / 6181381 / how - to - print - variables - in - perl ▁ that ▁ kicks ▁ ass", + ), + ( + "en", + "What about a this,type,of-s-thingy?", + "What ▁ about ▁ a ▁ this , type , of - s - thingy ?", + ), + ( + "de", + "Sie sollten vor dem Upgrade eine Sicherung dieser Daten erstellen (wie unter Abschnitt 4.1.1, „Sichern aller Daten und Konfigurationsinformationen“ beschrieben). 
", + "Sie ▁ sollten ▁ vor ▁ dem ▁ Upgrade ▁ eine ▁ Sicherung ▁ dieser ▁ Daten ▁ erstellen ▁ ( wie ▁ unter ▁ Abschnitt ▁ 4.1.1 , ▁ „ Sichern ▁ aller ▁ Daten ▁ und ▁ Konfigurationsinformationen “ ▁ beschrieben ) . ▁", + ), + ( + "fr", + "L'amitié nous a fait forts d'esprit", + "L'amitié ▁ nous ▁ a ▁ fait ▁ forts ▁ d'esprit", + ), + ("zh", "记者 应谦 美国", "记者 ▁ 应 谦 ▁ 美国"), + ("ko", "세계 에서 가장 강력한.", "세계 ▁ 에서 ▁ 가장 ▁ 강력한 ."), + ("ja", "電話でんわの邪魔じゃまをしないでください", "電話 でんわ の 邪魔 じゃ ま を しない で くだ さい"), + ("ja", "Japan is 日本 in Japanese.", "Japan ▁ is ▁ 日本 ▁ in ▁ Japanese ."), + ], + ids=[ + "en_weird_symbols", + "en_fullstop", + "en_numeric_prefix", + "en_braces", + "en_apostrophe", + "en_opening_brackets", + "en_dot_splitting", + "en_trailing_dot_apostrophe", + "en_one_apostrophe", + "fr", + "de", + "cz", + "en_pattern1", + "en_pattern2", + "de_final_comma_split_after_number", + "fr_apostrophes", + "zh", + "ko", + "ja", + "cjk_mix", + ], +) +def test_icu_tokens(lang, text, expected_tokenized): + """ + Tests tokens produced by ICU tokenizer. + + The use cases were copied from https://github.com/hplt-project/sacremoses/blob/master/sacremoses/test/test_tokenizer.py as is. + However, this test is mostly to show how the tokenizer works rather than fixing it because it relies on the underlying ICU tokenizer. + The expected values were just copied from the test run. + Having some mistakes in tokenization is ok because it's used only for the purposes of inline noise augmentation and to produce word alignments. 
+ """ + icu_tokenizer = IcuTokenizer(lang) + + tokens = icu_tokenizer.tokenize(text) + tokenized = " ".join(tokens) + + assert expected_tokenized == tokenized diff --git a/tests/test_training.py b/tests/test_training.py index 68224a382..0e1a7bdbe 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -94,8 +94,8 @@ def alignments(data_dir, vocab, corpus, trg_lang, config): ) for lang in ["en", trg_lang]: shutil.copyfile( - data_dir.join("artifacts", f"{corpus}.tok-moses.{lang}.zst"), - data_dir.join(f"{corpus}.tok-moses.{lang}.zst"), + data_dir.join("artifacts", f"{corpus}.tok-icu.{lang}.zst"), + data_dir.join(f"{corpus}.tok-icu.{lang}.zst"), ) if task == "original": shutil.copyfile(