Commit

Updated all syntax to 0.55.0
htahir1 committed Jan 24, 2024
1 parent 8ff87a5 commit 2f2db2c
Showing 5 changed files with 10 additions and 7 deletions.
5 changes: 4 additions & 1 deletion .typos.toml
@@ -4,12 +4,15 @@ extend-exclude = ["*.csv", "sign-language-detection-yolov5/*", "orbit-user-analy
[default.extend-identifiers]
# HashiCorp = "HashiCorp"
connexion = "connexion"

preprocesser = "preprocesser"
Preprocesser = "Preprocesser"

[default.extend-words]
# aks = "aks"
GOES = "GOES"
lenght = "lenght"
preprocesser = "preprocesser"
Preprocesser = "Preprocesser"

[default]
locale = "en-us"
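
A note on the additions above: in the typos checker's configuration, mapping a spelling to itself (e.g. preprocesser = "preprocesser") tells the tool to accept that spelling instead of flagging it, so these entries whitelist the identifier and word forms of "preprocesser" used in the codebase.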
2 changes: 1 addition & 1 deletion llm-finetuning/README.md
@@ -35,7 +35,7 @@ One of the first jobs of somebody entering MLOps is to convert their manual scri

Frameworks like [ZenML](https://github.com/zenml-io/zenml) go a long way in alleviating this burden by abstracting much of the complexity away. However, recent advancements in Large Language Model-based Copilots offer hope that even more repetitive aspects of this task can be automated.

-Unfortuantely, most open source or proprietary models like GitHub Copilot are often lagging behind the most recent versions of ML libraries, therefore giving errorneous our outdated syntax when asked simple commands.
+Unfortunately, most open source or proprietary models like GitHub Copilot are often lagging behind the most recent versions of ML libraries, therefore giving erroneous our outdated syntax when asked simple commands.

The goal of this project is to fine-tune an open-source LLM that performs better than off-the-shelf solutions at giving the right output for the latest version of ZenML.

2 changes: 1 addition & 1 deletion llm-finetuning/pipelines/finetune.py
@@ -29,7 +29,7 @@
@pipeline
def finetune_starcoder():
    """
-    This pipelin finetunes the starcoder model.
+    This pipeline finetunes the starcoder model.
    """
    # Link all the steps together by calling them and passing the output
    # of one step as the input of the next step.
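
For orientation, here is a minimal sketch of the post-0.40 ZenML syntax this commit targets, where steps are linked simply by calling them and passing outputs along. The step names and bodies are hypothetical placeholders, not the project's real steps:

from zenml import pipeline, step


@step
def prepare_data() -> str:
    """Hypothetical step: return the path of a prepared dataset."""
    return "data/dataset.json"


@step
def finetune(dataset_path: str) -> None:
    """Hypothetical step: fine-tune a model on the prepared dataset."""
    print(f"Fine-tuning on {dataset_path}")


@pipeline
def finetune_starcoder():
    # Calling one step with another step's output is what wires the DAG
    # together, as the comment in the diff above describes.
    dataset_path = prepare_data()
    finetune(dataset_path)


if __name__ == "__main__":
    finetune_starcoder()
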
4 changes: 2 additions & 2 deletions llm-finetuning/steps/prepare_dataset.py
@@ -51,7 +51,7 @@
"pcm",
"opus",
]
ANTI_FOMATS = tuple(IMAGE + VIDEO + DOC + AUDIO + ARCHIVE + OTHERS)
ANTI_FORMATS = tuple(IMAGE + VIDEO + DOC + AUDIO + ARCHIVE + OTHERS)


def upload_to_hub(df: pd.DataFrame, dataset_id: str) -> str:
@@ -114,7 +114,7 @@ def read_repository_files(directory) -> pd.DataFrame:
    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
-            if not file_path.endswith(ANTI_FOMATS) and all(
+            if not file_path.endswith(ANTI_FORMATS) and all(
                k not in file_path for k in [".git", "__pycache__", "xcodeproj"]
            ):
                file_paths.append((os.path.dirname(root), file_path))
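
Worth noting about the hunk above: str.endswith accepts a tuple of suffixes, which is why ANTI_FORMATS is built with tuple(...): a single call filters every excluded extension. A self-contained sketch, with made-up extensions and paths:

# str.endswith checks every suffix in a tuple with a single call.
anti_formats = (".png", ".mp4", ".zip")  # hypothetical subset of the real format lists
paths = ["src/train.py", "assets/logo.png", "data/clip.mp4", "README.md"]
kept = [p for p in paths if not p.endswith(anti_formats)]
print(kept)  # ['src/train.py', 'README.md']
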
4 changes: 2 additions & 2 deletions llm-finetuning/steps/trainer.py
@@ -225,14 +225,14 @@ class ConstantLengthDataset(IterableDataset):
"""
Iterable dataset that returns constant length chunks of tokens from stream of text files.
Args:
tokenizer (Tokenizer): The processor used for proccessing the data.
tokenizer (Tokenizer): The processor used for processing the data.
dataset (dataset.Dataset): Dataset with text files.
infinite (bool): If True the iterator is reset after dataset reaches end else stops.
seq_length (int): Length of token sequences to return.
num_of_sequences (int): Number of token sequences to keep in buffer.
chars_per_token (int): Number of characters per token used to estimate number of tokens in text buffer.
fim_rate (float): Rate (0.0 to 1.0) that sample will be permuted with FIM.
fim_spm_rate (float): Rate (0.0 to 1.0) of FIM permuations that will use SPM.
fim_spm_rate (float): Rate (0.0 to 1.0) of FIM permutations that will use SPM.
seed (int): Seed for random number generator.
"""

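
The docstring above describes token packing; here is a toy sketch of the core idea (not the project's implementation, which also handles tokenization, the infinite flag, and FIM permutation):

def constant_length_chunks(token_streams, seq_length=8):
    """Concatenate token streams and yield fixed-size chunks of seq_length tokens."""
    buffer = []
    for tokens in token_streams:
        buffer.extend(tokens)
        while len(buffer) >= seq_length:
            yield buffer[:seq_length]
            buffer = buffer[seq_length:]


# Two "documents" of token ids pack into a single chunk of 8; the 4 leftover
# tokens are dropped because they cannot fill a complete chunk.
docs = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11, 12]]
print(list(constant_length_chunks(docs)))  # [[1, 2, 3, 4, 5, 6, 7, 8]]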
