Commit 5a6378b

updating all examples, save_tokens now by default doesn't save programs if config.use_programs is False

Natooz committed Aug 17, 2023
1 parent 3f33a12 commit 5a6378b
Showing 5 changed files with 41 additions and 122 deletions.
23 changes: 10 additions & 13 deletions README.md
@@ -31,38 +31,35 @@ The most basic and useful methods are summarized here. And [here](colab-notebook

 ```python
 from miditok import REMI, TokenizerConfig
-from miditok.utils import get_midi_programs
 from miditoolkit import MidiFile
 from pathlib import Path

-# Creating the tokenizer's configuration, read the doc to explore other parameters
-config = TokenizerConfig(nb_velocities=16, use_chords=True)
-
-# Creates the tokenizer and loads a MIDI
+# Creating a multitrack tokenizer configuration, read the doc to explore other parameters
+config = TokenizerConfig(nb_velocities=16, use_chords=True, use_programs=True)
 tokenizer = REMI(config)
-midi = MidiFile('path/to/your_midi.mid')
-
-# Converts MIDI to tokens, and back to a MIDI
-tokens = tokenizer(midi)  # calling it will automatically detect MIDIs, paths and tokens before the conversion
-converted_back_midi = tokenizer(tokens, get_midi_programs(midi))  # PyTorch / Tensorflow / Numpy tensors supported
+# Loads a MIDI, converts to tokens, and back to a MIDI
+midi = MidiFile('path/to/your_midi.mid')
+tokens = tokenizer(midi)  # calling the tokenizer will automatically detect MIDIs, paths and tokens
+converted_back_midi = tokenizer(tokens)  # PyTorch / Tensorflow / Numpy tensors supported

-# Converts MIDI files to tokens saved as JSON files
+# Tokenize a whole dataset and save it as JSON files
 midi_paths = list(Path("path", "to", "dataset").glob("**/*.mid"))
 data_augmentation_offsets = [2, 1, 1]  # data augmentation on 2 pitch octaves, 1 velocity and 1 duration values
 tokenizer.tokenize_midi_dataset(midi_paths, Path("path", "to", "tokens_noBPE"),
                                 data_augment_offsets=data_augmentation_offsets)

-# Constructs the vocabulary with BPE, from the tokenized files
+# Constructs the vocabulary with BPE, from the token files
 tokenizer.learn_bpe(
-    vocab_size=500,
+    vocab_size=10000,
     tokens_paths=list(Path("path", "to", "tokens_noBPE").glob("**/*.json")),
     start_from_empty_voc=False,
 )

 # Saving our tokenizer, to retrieve it back later with the load_params method
 tokenizer.save_params(Path("path", "to", "save", "tokenizer.json"))

-# Converts the tokenized musics into tokens with BPE
+# Applies BPE to the previous tokens
 tokenizer.apply_bpe_to_dataset(Path('path', 'to', 'tokens_noBPE'), Path('path', 'to', 'tokens_BPE'))
 ```

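The updated README example ends by saving the tokenizer with `save_params` so it can be retrieved later. As a minimal sketch (assuming the same save path as above), the saved JSON can be passed to the constructor's `params` argument, which calls `load_params` under the hood:

```python
from pathlib import Path

from miditok import REMI

# Reloads the tokenizer saved above: its configuration, vocabulary and
# learned BPE model are all restored from the single JSON file
tokenizer = REMI(params=Path("path", "to", "save", "tokenizer.json"))
```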
100 changes: 13 additions & 87 deletions colab-notebooks/Full_Example_HuggingFace_GPT2_Transformer.ipynb
@@ -54,86 +54,16 @@
 "from copy import deepcopy\n",
 "import json\n",
 "\n",
-"from torch import Tensor, LongTensor, stack, flip, cat, full, argmax\n",
+"from torch import Tensor, LongTensor, flip, cat, full, argmax\n",
 "from torch.nn.utils.rnn import pad_sequence\n",
-"from torch.utils.data import Dataset, DataLoader\n",
+"from torch.utils.data import DataLoader\n",
 "from torchtoolkit.data import create_subsets\n",
 "from transformers import GPT2LMHeadModel, GPT2Config, Trainer, TrainingArguments, GenerationConfig\n",
-"from transformers.data.data_collator import DataCollatorMixin\n",
 "from evaluate import load as load_metric\n",
 "from miditok import REMI, MIDITokenizer, TokenizerConfig\n",
+"from miditok.pytorch_data import DatasetTok, DataCollator\n",
 "from miditok.constants import CHORD_MAPS\n",
-"from miditoolkit import MidiFile\n",
-"from tqdm import tqdm\n",
-"\n",
-"\n",
-"class MIDIDataset(Dataset):\n",
-"    r\"\"\"Dataset for generator training\n",
-"\n",
-"    :param files_paths: list of paths to files to load.\n",
-"    :param tokenizer: tokenizer object, to use to load MIDIs instead of tokens. (default: None)\n",
-"    \"\"\"\n",
-"\n",
-"    def __init__(self, files_paths: List[Path], min_seq_len: int, max_seq_len: int, tokenizer: MIDITokenizer = None):\n",
-"        samples = []\n",
-"\n",
-"        for file_path in tqdm(files_paths, desc=f'Loading data: {files_paths[0].parent}'):\n",
-"            if file_path.suffix in [\"mid\", \"midi\", \"MID\", \"MIDI\"]:\n",
-"                midi = MidiFile(file_path)\n",
-"                for _ in range(len(midi.instruments) - 1):\n",
-"                    del midi.instruments[1]  # removes all tracks except first one\n",
-"                tokens = tokenizer.midi_to_tokens(midi)[0].ids\n",
-"            else:\n",
-"                with open(file_path) as json_file:\n",
-"                    tokens = json.load(json_file)['ids'][0]  # first track\n",
-"            i = 0\n",
-"            while i < len(tokens):\n",
-"                if i >= len(tokens) - min_seq_len:\n",
-"                    break  # last sample is too short\n",
-"                samples.append(LongTensor(tokens[i:i + max_seq_len]))\n",
-"                i += len(samples[-1])  # could be replaced with max_seq_len\n",
-"\n",
-"        self.samples = samples\n",
-"\n",
-"    def __getitem__(self, idx) -> Dict[str, LongTensor]:\n",
-"        return {\"input_ids\": self.samples[idx], \"labels\": self.samples[idx]}\n",
-"\n",
-"    def __len__(self) -> int: return len(self.samples)\n",
-"\n",
-"    def __repr__(self): return self.__str__()\n",
-"\n",
-"    def __str__(self) -> str: return 'No data loaded' if len(self) == 0 else f'{len(self.samples)} samples'\n",
-"\n",
-"\n",
-"def _pad_batch(examples: List[Dict[str, LongTensor]], pad_token: int) -> LongTensor:\n",
-"    \"\"\"Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary.\"\"\"\n",
-"\n",
-"    length_of_first = examples[0][\"input_ids\"].size(0)\n",
-"\n",
-"    # Check if padding is necessary.\n",
-"    are_tensors_same_length = all(x[\"input_ids\"].size(0) == length_of_first for x in examples)\n",
-"    if are_tensors_same_length:\n",
-"        return stack([e[\"input_ids\"] for e in examples], dim=0).long()\n",
-"\n",
-"    # Creating the full tensor and filling it with our data.\n",
-"    return pad_sequence([e[\"input_ids\"] for e in examples], batch_first=True, padding_value=pad_token).long()\n",
-"\n",
-"\n",
-"class DataCollatorGen(DataCollatorMixin):\n",
-"    def __init__(self, pad_token: int, return_tensors: str = \"pt\"):\n",
-"        \"\"\"Collator that simply pads the input sequences.\n",
-"        Input_ids will be padded with the pad token given, while labels will be\n",
-"        padded with -100.\n",
-"\n",
-"        :param pad_token: pad token\n",
-"        :param return_tensors:\n",
-"        \"\"\"\n",
-"        self.pad_token = pad_token\n",
-"        self.return_tensors = return_tensors\n",
-"\n",
-"    def __call__(self, batch: List[Dict[str, Any]], return_tensors=None) -> Dict[str, LongTensor]:\n",
-"        x, y = _pad_batch(batch, self.pad_token), _pad_batch(batch, -100)\n",
-"        return {\"input_ids\": x, \"labels\": y}  # will be shifted in GPT2LMHead forward"
+"from tqdm import tqdm"
]
},
{
@@ -198,7 +128,7 @@
 "\n",
 "# Loads tokens and create data loaders for training\n",
 "tokens_paths = list(Path('Maestro_tokens_bpe').glob(\"**/*.json\"))\n",
-"dataset = MIDIDataset(\n",
+"dataset = DatasetTok(\n",
 "    tokens_paths, max_seq_len=512, min_seq_len=384,\n",
 ")\n",
 "subset_train, subset_valid = create_subsets(dataset, [0.3])"
@@ -303,10 +233,11 @@
 "    gradient_checkpointing=True,\n",
 ")\n",
 "\n",
+"collator = DataCollator(tokenizer[\"PAD_None\"], tokenizer[\"BOS_None\"], tokenizer[\"EOS_None\"], copy_inputs_as_labels=True)\n",
 "trainer = Trainer(\n",
 "    model=model,\n",
 "    args=training_config,\n",
-"    data_collator=DataCollatorGen(tokenizer[\"PAD_None\"]),\n",
+"    data_collator=collator,\n",
 "    train_dataset=subset_train,\n",
 "    eval_dataset=subset_valid,\n",
 "    compute_metrics=compute_metrics,\n",
@@ -339,15 +270,7 @@
 },
 "outputs": [],
 "source": [
-"def collate_gen_left(batch: List[Dict[str, LongTensor]]) -> LongTensor:\n",
-"    # Here the sequences are padded to the left, so that the last token along the time dimension\n",
-"    # is always the last token of each seq, allowing to efficiently generate by batch\n",
-"    bos_shape = (1,)\n",
-"    batch = [flip(cat([full(bos_shape, tokenizer[\"BOS_None\"]), seq[\"input_ids\"]], dim=0), dims=(0,)) for seq in batch]\n",
-"    batch = pad_sequence(batch, batch_first=True, padding_value=tokenizer[\"PAD_None\"])  # (N,T) or (N,T,Z)\n",
-"    batch = flip(batch, dims=(1,)).long()\n",
-"    return batch  # (N,T)\n",
-"\n",
+"(gen_results_path := Path('gen_res')).mkdir(parents=True, exist_ok=True)\n",
 "generation_config = GenerationConfig(\n",
 "    max_new_tokens=512,  # extends samples by 512 tokens\n",
 "    num_beams=1,  # no beam search\n",
@@ -360,8 +283,11 @@
 "    pad_token_id=config.padding_token_id,\n",
 ")\n",
 "\n",
-"(gen_results_path := Path('gen_res')).mkdir(parents=True, exist_ok=True)\n",
-"dataloader_test = DataLoader(subset_valid, batch_size=16, collate_fn=collate_gen_left)\n",
+"# Here the sequences are padded to the left, so that the last token along the time dimension\n",
+"# is always the last token of each seq, allowing to efficiently generate by batch\n",
+"collator.pad_on_left = True\n",
+"collator.eos_token = None\n",
+"dataloader_test = DataLoader(subset_valid, batch_size=16, collate_fn=collator)\n",
 "model.eval()\n",
 "count = 0\n",
 "for batch in tqdm(dataloader_test, desc='Testing model / Generating results'):  # (N,T)\n",
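The replacement of the hand-written `collate_gen_left` by MidiTok's `DataCollator` with `pad_on_left = True` deserves a standalone illustration. The sketch below uses toy tensors (not from the notebook) to show why left padding makes batched generation convenient:

```python
import torch
from torch.nn.utils.rnn import pad_sequence

PAD = 0
seqs = [torch.tensor([5, 6, 7]), torch.tensor([8, 9])]

# Right padding: sequences end at different time steps, so the last real
# token of each sequence has to be looked up individually when generating
right = pad_sequence(seqs, batch_first=True, padding_value=PAD)
# tensor([[5, 6, 7],
#         [8, 9, 0]])

# Left padding (flip, pad, flip back): every sequence ends at the final
# time step, so new tokens can be appended to the whole batch at once
left = pad_sequence([s.flip(0) for s in seqs], batch_first=True, padding_value=PAD).flip(1)
# tensor([[5, 6, 7],
#         [0, 8, 9]])
```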
14 changes: 5 additions & 9 deletions docs/examples.rst
@@ -10,7 +10,6 @@ A basic example showing how to create a tokenizer, with a selection of custom pa
 .. code-block:: python

     from miditok import REMI, TokenizerConfig  # here we choose to use REMI
-    from miditok.utils import get_midi_programs

     # Our parameters
     TOKENIZER_PARAMS = {
@@ -28,7 +27,7 @@ A basic example showing how to create a tokenizer, with a selection of custom pa
     }
     config = TokenizerConfig(**TOKENIZER_PARAMS)

-    # Creates the tokenizer and loads a MIDI
+    # Creates the tokenizer
     tokenizer = REMI(config)
MIDI - Tokens conversion
@@ -38,17 +37,14 @@ Here we convert a MIDI to tokens, and the other way around.

 .. code-block:: python

-    from miditok.utils import get_midi_programs
     from miditoolkit import MidiFile

     # Tokenize a MIDI file
-    midi_path = "path/to/your_midi.mid"
-    midi = MidiFile(midi_path)
-    tokens = tokenizer(midi_path)  # automatically detects MidiFile, paths or tokens before converting them
+    midi = MidiFile("path/to/your_midi.mid")
+    tokens = tokenizer(midi_path)  # automatically detects MidiFile, paths

     # Convert to MIDI and save it
-    programs = get_midi_programs(midi)
-    generated_midi = tokenizer(tokens, programs=programs)  # MidiTok can handle PyTorch / Tensorflow Tensors
+    generated_midi = tokenizer(tokens)  # MidiTok can handle PyTorch / Tensorflow Tensors
     generated_midi.dump('path/to/save/file.mid')  # could have been done above by giving the path argument
Tokenize a dataset
@@ -93,6 +89,6 @@ Finally, we learn :ref:`Byte Pair Encoding (BPE)` on the tokenized dataset, and
         out_dir=Path('path', 'to', 'tokens_BPE'),
     )

-    # Converts the tokenized musics into tokens with BPE
+    # Applies BPE to the previous tokens
     tokenizer.apply_bpe_to_dataset(Path('path', 'to', 'tokens_noBPE'), Path('path', 'to', 'tokens_BPE'))
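Since `apply_bpe_to_dataset` writes BPE-encoded ids, a sketch of the reverse direction may help. This is an assumption-laden example (hypothetical file path, single-sequence file assumed) built on `load_tokens`, `TokSequence` and `decode_bpe`:

```python
from pathlib import Path

from miditok import TokSequence

# Hypothetical path: load one BPE-encoded token file saved above
sample = tokenizer.load_tokens(Path('path', 'to', 'tokens_BPE', 'file.json'))

# Wrap the first sequence of ids and decode it back to base tokens
seq = TokSequence(ids=sample["ids"][0], ids_bpe_encoded=True)
tokenizer.decode_bpe(seq)  # decodes in place: seq.ids now holds base token ids
```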
25 changes: 12 additions & 13 deletions miditok/midi_tokenizer.py
@@ -1440,7 +1440,7 @@ def apply_bpe_to_dataset(
                 if out_path is not None
                 else path
             )
-            self.save_tokens(seq, out_, sample["programs"])
+            self.save_tokens(seq, out_, sample["programs"] if "programs" in sample else None)

     def _are_ids_bpe_encoded(self, ids: Union[List[int], np.ndarray]) -> bool:
         r"""A small check telling if a sequence of ids are encoded with BPE.
@@ -1484,7 +1484,7 @@ def tokenize_midi_dataset(
         validation_fn: Callable[[MidiFile], bool] = None,
         data_augment_offsets=None,
         apply_bpe: bool = True,
-        save_programs: bool = True,
+        save_programs: bool = None,
         logging: bool = True,
     ):
         r"""Converts a dataset / list of MIDI files, into their token version and save them as json files
@@ -1504,15 +1504,18 @@
             miditok.data_augmentation.data_augmentation_dataset method. Has to be given as a list / tuple
             of offsets pitch octaves, velocities, durations, and finally their directions (up/down). (default: None)
         :param apply_bpe: will apply BPE on the dataset to save, if the vocabulary was learned with. (default: True)
-        :param save_programs: will also save the programs of the tracks of the MIDI. Note that this option is
-            probably unnecessary when using a multitrack tokenizer, as the Program information is present within the
-            tokens, and that the tracks having the same programs are likely to have been merged. (default: True)
+        :param save_programs: will save the programs of the tracks of the MIDI as an entry in the Json file.
+            Note that this option is probably unnecessary when using a multitrack tokenizer (`config.use_programs`),
+            as the Program information is present within the tokens and tracks with the same programs are likely
+            to have been merged. (default: False if `config.use_programs`, else True)
         :param logging: logs progress bar.
         """
         out_dir = Path(out_dir)
         out_dir.mkdir(parents=True, exist_ok=True)
         # Saves the tokenizer so that it can be reloaded
         self.save_params(out_dir / tokenizer_config_file_name)
+        if save_programs is None:
+            save_programs = not self.config.use_programs

         for midi_path in (
             tqdm(
@@ -1674,14 +1677,10 @@ def save_tokens(
         kwargs["ids_bpe_encoded"] = ids_bpe_encoded

         with open(path, "w") as outfile:
-            json.dump(
-                {
-                    "ids": ids,
-                    "programs": programs if programs is not None else [],
-                    **kwargs,
-                },
-                outfile,
-            )
+            dic = {"ids": ids}
+            if programs is not None:
+                dic["programs"] = programs
+            json.dump(dic, outfile)

ilya16 (Contributor) commented on Aug 18, 2023:

    kwargs are not saved with the token data

Natooz (Author, Owner) replied on Aug 18, 2023:

    Indeed, thank you for spotting this! I'll commit a fix, which will be in the next update.
    I can't tell yet when the next update will be released, maybe in a few days or a week,
    along with other changes / fixes (if there are any).


     @staticmethod
     def load_tokens(path: Union[str, Path]) -> Union[List[Any], Dict]:
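Taken together, the changes above mean `tokenize_midi_dataset` now resolves `save_programs` from the config, and `save_tokens` only writes a `programs` entry when programs are actually given. A minimal sketch (hypothetical paths) of the resulting default behavior:

```python
from pathlib import Path

from miditok import REMI, TokenizerConfig

midi_paths = list(Path("path", "to", "dataset").glob("**/*.mid"))

# Multitrack tokenizer: Program tokens are part of the token sequences, so
# save_programs defaults to False and the JSON files only contain {"ids": ...}
multitrack = REMI(TokenizerConfig(use_programs=True))
multitrack.tokenize_midi_dataset(midi_paths, Path("path", "to", "tokens_multitrack"))

# Without use_programs, save_programs defaults to True and each JSON file
# keeps a "programs" entry so the tracks can be reconstructed when decoding
single = REMI(TokenizerConfig())
single.tokenize_midi_dataset(midi_paths, Path("path", "to", "tokens_single"))
```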
1 change: 1 addition & 0 deletions miditok/tokenizations/mumidi.py
@@ -47,6 +47,7 @@ class MuMIDI(MIDITokenizer):
     def _tweak_config_before_creating_voc(self):
         self.config.use_rests = False
         self.config.use_time_signatures = False
+        self.config.use_programs = True
         self.one_token_stream = True

         if "drum_pitch_range" not in self.config.additional_params:
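One consequence of this one-line change: since MuMIDI now forces `use_programs` on, the new `save_programs` default applies to it automatically. A small sketch, assuming a default config:

```python
from miditok import MuMIDI, TokenizerConfig

tokenizer = MuMIDI(TokenizerConfig())
assert tokenizer.config.use_programs  # forced by _tweak_config_before_creating_voc
# tokenize_midi_dataset will therefore default to save_programs=False for MuMIDI
```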
