Commit 5a6378b

updating all examples, save_tokens now by default doesn't save programs if config.use_programs is False

Natooz committed Aug 17, 2023
1 parent 3f33a12 commit 5a6378b
Showing 5 changed files with 41 additions and 122 deletions.
23 changes: 10 additions & 13 deletions README.md
@@ -31,38 +31,35 @@ The most basic and useful methods are summarized here. And [here](colab-notebook

 ```python
 from miditok import REMI, TokenizerConfig
-from miditok.utils import get_midi_programs
 from miditoolkit import MidiFile
 from pathlib import Path

-# Creating the tokenizer's configuration, read the doc to explore other parameters
-config = TokenizerConfig(nb_velocities=16, use_chords=True)
-
-# Creates the tokenizer and loads a MIDI
+# Creating a multitrack tokenizer configuration, read the doc to explore other parameters
+config = TokenizerConfig(nb_velocities=16, use_chords=True, use_programs=True)
 tokenizer = REMI(config)
-midi = MidiFile('path/to/your_midi.mid')
-
-# Converts MIDI to tokens, and back to a MIDI
-tokens = tokenizer(midi)  # calling it will automatically detect MIDIs, paths and tokens before the conversion
-converted_back_midi = tokenizer(tokens, get_midi_programs(midi))  # PyTorch / Tensorflow / Numpy tensors supported
+# Loads a MIDI, converts to tokens, and back to a MIDI
+midi = MidiFile('path/to/your_midi.mid')
+tokens = tokenizer(midi)  # calling the tokenizer will automatically detect MIDIs, paths and tokens
+converted_back_midi = tokenizer(tokens)  # PyTorch / Tensorflow / Numpy tensors supported

-# Converts MIDI files to tokens saved as JSON files
+# Tokenize a whole dataset and save it as JSON files
 midi_paths = list(Path("path", "to", "dataset").glob("**/*.mid"))
 data_augmentation_offsets = [2, 1, 1]  # data augmentation on 2 pitch octaves, 1 velocity and 1 duration values
 tokenizer.tokenize_midi_dataset(midi_paths, Path("path", "to", "tokens_noBPE"),
                                 data_augment_offsets=data_augmentation_offsets)

-# Constructs the vocabulary with BPE, from the tokenized files
+# Constructs the vocabulary with BPE, from the token files
 tokenizer.learn_bpe(
-    vocab_size=500,
+    vocab_size=10000,
     tokens_paths=list(Path("path", "to", "tokens_noBPE").glob("**/*.json")),
     start_from_empty_voc=False,
 )

 # Saving our tokenizer, to retrieve it back later with the load_params method
 tokenizer.save_params(Path("path", "to", "save", "tokenizer.json"))

-# Converts the tokenized musics into tokens with BPE
+# Applies BPE to the previous tokens
 tokenizer.apply_bpe_to_dataset(Path('path', 'to', 'tokens_noBPE'), Path('path', 'to', 'tokens_BPE'))
 ```

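The updated README example ends by saving the tokenizer with `save_params` so it can be retrieved later. As a minimal sketch (assuming the same save path as above), the saved JSON can be passed to the constructor's `params` argument, which calls `load_params` under the hood:

```python
from pathlib import Path

from miditok import REMI

# Reloads the tokenizer saved above: its configuration, vocabulary and
# learned BPE model are all restored from the single JSON file
tokenizer = REMI(params=Path("path", "to", "save", "tokenizer.json"))
```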
100 changes: 13 additions & 87 deletions colab-notebooks/Full_Example_HuggingFace_GPT2_Transformer.ipynb
@@ -54,86 +54,16 @@
 "from copy import deepcopy\n",
 "import json\n",
 "\n",
-"from torch import Tensor, LongTensor, stack, flip, cat, full, argmax\n",
+"from torch import Tensor, LongTensor, flip, cat, full, argmax\n",
 "from torch.nn.utils.rnn import pad_sequence\n",
-"from torch.utils.data import Dataset, DataLoader\n",
+"from torch.utils.data import DataLoader\n",
 "from torchtoolkit.data import create_subsets\n",
 "from transformers import GPT2LMHeadModel, GPT2Config, Trainer, TrainingArguments, GenerationConfig\n",
-"from transformers.data.data_collator import DataCollatorMixin\n",
 "from evaluate import load as load_metric\n",
 "from miditok import REMI, MIDITokenizer, TokenizerConfig\n",
+"from miditok.pytorch_data import DatasetTok, DataCollator\n",
 "from miditok.constants import CHORD_MAPS\n",
-"from miditoolkit import MidiFile\n",
-"from tqdm import tqdm\n",
-"\n",
-"\n",
-"class MIDIDataset(Dataset):\n",
-"    r\"\"\"Dataset for generator training\n",
-"\n",
-"    :param files_paths: list of paths to files to load.\n",
-"    :param tokenizer: tokenizer object, to use to load MIDIs instead of tokens. (default: None)\n",
-"    \"\"\"\n",
-"\n",
-"    def __init__(self, files_paths: List[Path], min_seq_len: int, max_seq_len: int, tokenizer: MIDITokenizer = None):\n",
-"        samples = []\n",
-"\n",
-"        for file_path in tqdm(files_paths, desc=f'Loading data: {files_paths[0].parent}'):\n",
-"            if file_path.suffix in [\"mid\", \"midi\", \"MID\", \"MIDI\"]:\n",
-"                midi = MidiFile(file_path)\n",
-"                for _ in range(len(midi.instruments) - 1):\n",
-"                    del midi.instruments[1]  # removes all tracks except first one\n",
-"                tokens = tokenizer.midi_to_tokens(midi)[0].ids\n",
-"            else:\n",
-"                with open(file_path) as json_file:\n",
-"                    tokens = json.load(json_file)['ids'][0]  # first track\n",
-"            i = 0\n",
-"            while i < len(tokens):\n",
-"                if i >= len(tokens) - min_seq_len:\n",
-"                    break  # last sample is too short\n",
-"                samples.append(LongTensor(tokens[i:i + max_seq_len]))\n",
-"                i += len(samples[-1])  # could be replaced with max_seq_len\n",
-"\n",
-"        self.samples = samples\n",
-"\n",
-"    def __getitem__(self, idx) -> Dict[str, LongTensor]:\n",
-"        return {\"input_ids\": self.samples[idx], \"labels\": self.samples[idx]}\n",
-"\n",
-"    def __len__(self) -> int: return len(self.samples)\n",
-"\n",
-"    def __repr__(self): return self.__str__()\n",
-"\n",
-"    def __str__(self) -> str: return 'No data loaded' if len(self) == 0 else f'{len(self.samples)} samples'\n",
-"\n",
-"\n",
-"def _pad_batch(examples: List[Dict[str, LongTensor]], pad_token: int) -> LongTensor:\n",
-"    \"\"\"Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary.\"\"\"\n",
-"\n",
-"    length_of_first = examples[0][\"input_ids\"].size(0)\n",
-"\n",
-"    # Check if padding is necessary.\n",
-"    are_tensors_same_length = all(x[\"input_ids\"].size(0) == length_of_first for x in examples)\n",
-"    if are_tensors_same_length:\n",
-"        return stack([e[\"input_ids\"] for e in examples], dim=0).long()\n",
-"\n",
-"    # Creating the full tensor and filling it with our data.\n",
-"    return pad_sequence([e[\"input_ids\"] for e in examples], batch_first=True, padding_value=pad_token).long()\n",
-"\n",
-"\n",
-"class DataCollatorGen(DataCollatorMixin):\n",
-"    def __init__(self, pad_token: int, return_tensors: str = \"pt\"):\n",
-"        \"\"\"Collator that simply pads the input sequences.\n",
-"        Input_ids will be padded with the pad token given, while labels will be\n",
-"        padded with -100.\n",
-"\n",
-"        :param pad_token: pad token\n",
-"        :param return_tensors:\n",
-"        \"\"\"\n",
-"        self.pad_token = pad_token\n",
-"        self.return_tensors = return_tensors\n",
-"\n",
-"    def __call__(self, batch: List[Dict[str, Any]], return_tensors=None) -> Dict[str, LongTensor]:\n",
-"        x, y = _pad_batch(batch, self.pad_token), _pad_batch(batch, -100)\n",
-"        return {\"input_ids\": x, \"labels\": y}  # will be shifted in GPT2LMHead forward"
+"from tqdm import tqdm"
]
},
{
@@ -198,7 +128,7 @@
 "\n",
 "# Loads tokens and create data loaders for training\n",
 "tokens_paths = list(Path('Maestro_tokens_bpe').glob(\"**/*.json\"))\n",
-"dataset = MIDIDataset(\n",
+"dataset = DatasetTok(\n",
 "    tokens_paths, max_seq_len=512, min_seq_len=384,\n",
 ")\n",
 "subset_train, subset_valid = create_subsets(dataset, [0.3])"
@@ -303,10 +233,11 @@
 "    gradient_checkpointing=True,\n",
 ")\n",
 "\n",
+"collator = DataCollator(tokenizer[\"PAD_None\"], tokenizer[\"BOS_None\"], tokenizer[\"EOS_None\"], copy_inputs_as_labels=True)\n",
 "trainer = Trainer(\n",
 "    model=model,\n",
 "    args=training_config,\n",
-"    data_collator=DataCollatorGen(tokenizer[\"PAD_None\"]),\n",
+"    data_collator=collator,\n",
 "    train_dataset=subset_train,\n",
 "    eval_dataset=subset_valid,\n",
 "    compute_metrics=compute_metrics,\n",
@@ -339,15 +270,7 @@
 },
 "outputs": [],
 "source": [
-"def collate_gen_left(batch: List[Dict[str, LongTensor]]) -> LongTensor:\n",
-"    # Here the sequences are padded to the left, so that the last token along the time dimension\n",
-"    # is always the last token of each seq, allowing to efficiently generate by batch\n",
-"    bos_shape = (1,)\n",
-"    batch = [flip(cat([full(bos_shape, tokenizer[\"BOS_None\"]), seq[\"input_ids\"]], dim=0), dims=(0,)) for seq in batch]\n",
-"    batch = pad_sequence(batch, batch_first=True, padding_value=tokenizer[\"PAD_None\"])  # (N,T) or (N,T,Z)\n",
-"    batch = flip(batch, dims=(1,)).long()\n",
-"    return batch  # (N,T)\n",
-"\n",
+"(gen_results_path := Path('gen_res')).mkdir(parents=True, exist_ok=True)\n",
 "generation_config = GenerationConfig(\n",
 "    max_new_tokens=512,  # extends samples by 512 tokens\n",
 "    num_beams=1,  # no beam search\n",
@@ -360,8 +283,11 @@
 "    pad_token_id=config.padding_token_id,\n",
 ")\n",
 "\n",
-"(gen_results_path := Path('gen_res')).mkdir(parents=True, exist_ok=True)\n",
-"dataloader_test = DataLoader(subset_valid, batch_size=16, collate_fn=collate_gen_left)\n",
+"# Here the sequences are padded to the left, so that the last token along the time dimension\n",
+"# is always the last token of each seq, allowing to efficiently generate by batch\n",
+"collator.pad_on_left = True\n",
+"collator.eos_token = None\n",
+"dataloader_test = DataLoader(subset_valid, batch_size=16, collate_fn=collator)\n",
 "model.eval()\n",
 "count = 0\n",
 "for batch in tqdm(dataloader_test, desc='Testing model / Generating results'):  # (N,T)\n",
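The replacement of the hand-written `collate_gen_left` by MidiTok's `DataCollator` with `pad_on_left = True` deserves a standalone illustration. The sketch below uses toy tensors (not from the notebook) to show why left padding makes batched generation convenient:

```python
import torch
from torch.nn.utils.rnn import pad_sequence

PAD = 0
seqs = [torch.tensor([5, 6, 7]), torch.tensor([8, 9])]

# Right padding: sequences end at different time steps, so the last real
# token of each sequence has to be looked up individually when generating
right = pad_sequence(seqs, batch_first=True, padding_value=PAD)
# tensor([[5, 6, 7],
#         [8, 9, 0]])

# Left padding (flip, pad, flip back): every sequence ends at the final
# time step, so new tokens can be appended to the whole batch at once
left = pad_sequence([s.flip(0) for s in seqs], batch_first=True, padding_value=PAD).flip(1)
# tensor([[5, 6, 7],
#         [0, 8, 9]])
```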
14 changes: 5 additions & 9 deletions docs/examples.rst
@@ -10,7 +10,6 @@ A basic example showing how to create a tokenizer, with a selection of custom pa
 .. code-block:: python

     from miditok import REMI, TokenizerConfig  # here we choose to use REMI
-    from miditok.utils import get_midi_programs

     # Our parameters
     TOKENIZER_PARAMS = {
@@ -28,7 +27,7 @@ A basic example showing how to create a tokenizer, with a selection of custom pa
     }
     config = TokenizerConfig(**TOKENIZER_PARAMS)

-    # Creates the tokenizer and loads a MIDI
+    # Creates the tokenizer
     tokenizer = REMI(config)
MIDI - Tokens conversion
@@ -38,17 +37,14 @@ Here we convert a MIDI to tokens, and the other way around.

 .. code-block:: python

-    from miditok.utils import get_midi_programs
     from miditoolkit import MidiFile

     # Tokenize a MIDI file
-    midi_path = "path/to/your_midi.mid"
-    midi = MidiFile(midi_path)
-    tokens = tokenizer(midi_path)  # automatically detects MidiFile, paths or tokens before converting them
+    midi = MidiFile("path/to/your_midi.mid")
+    tokens = tokenizer(midi_path)  # automatically detects MidiFile, paths

     # Convert to MIDI and save it
-    programs = get_midi_programs(midi)
-    generated_midi = tokenizer(tokens, programs=programs)  # MidiTok can handle PyTorch / Tensorflow Tensors
+    generated_midi = tokenizer(tokens)  # MidiTok can handle PyTorch / Tensorflow Tensors
     generated_midi.dump('path/to/save/file.mid')  # could have been done above by giving the path argument
Tokenize a dataset
@@ -93,6 +89,6 @@ Finally, we learn :ref:`Byte Pair Encoding (BPE)` on the tokenized dataset, and
         out_dir=Path('path', 'to', 'tokens_BPE'),
     )

-    # Converts the tokenized musics into tokens with BPE
+    # Applies BPE to the previous tokens
     tokenizer.apply_bpe_to_dataset(Path('path', 'to', 'tokens_noBPE'), Path('path', 'to', 'tokens_BPE'))
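Since `apply_bpe_to_dataset` writes BPE-encoded ids, a sketch of the reverse direction may help. This is an assumption-laden example (hypothetical file path, single-sequence file assumed) built on `load_tokens`, `TokSequence` and `decode_bpe`:

```python
from pathlib import Path

from miditok import TokSequence

# Hypothetical path: load one BPE-encoded token file saved above
sample = tokenizer.load_tokens(Path('path', 'to', 'tokens_BPE', 'file.json'))

# Wrap the first sequence of ids and decode it back to base tokens
seq = TokSequence(ids=sample["ids"][0], ids_bpe_encoded=True)
tokenizer.decode_bpe(seq)  # decodes in place: seq.ids now holds base token ids
```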
25 changes: 12 additions & 13 deletions miditok/midi_tokenizer.py
@@ -1440,7 +1440,7 @@ def apply_bpe_to_dataset(
                 if out_path is not None
                 else path
             )
-            self.save_tokens(seq, out_, sample["programs"])
+            self.save_tokens(seq, out_, sample["programs"] if "programs" in sample else None)

     def _are_ids_bpe_encoded(self, ids: Union[List[int], np.ndarray]) -> bool:
         r"""A small check telling if a sequence of ids are encoded with BPE.
@@ -1484,7 +1484,7 @@ def tokenize_midi_dataset(
         validation_fn: Callable[[MidiFile], bool] = None,
         data_augment_offsets=None,
         apply_bpe: bool = True,
-        save_programs: bool = True,
+        save_programs: bool = None,
         logging: bool = True,
     ):
         r"""Converts a dataset / list of MIDI files, into their token version and save them as json files
@@ -1504,15 +1504,18 @@
             miditok.data_augmentation.data_augmentation_dataset method. Has to be given as a list / tuple
             of offsets pitch octaves, velocities, durations, and finally their directions (up/down). (default: None)
         :param apply_bpe: will apply BPE on the dataset to save, if the vocabulary was learned with. (default: True)
-        :param save_programs: will also save the programs of the tracks of the MIDI. Note that this option is
-            probably unnecessary when using a multitrack tokenizer, as the Program information is present within the
-            tokens, and that the tracks having the same programs are likely to have been merged. (default: True)
+        :param save_programs: will save the programs of the tracks of the MIDI as an entry in the Json file.
+            Note that this option is probably unnecessary when using a multitrack tokenizer (`config.use_programs`),
+            as the Program information is present within the tokens and tracks with the same programs are likely
+            to have been merged. (default: False if `config.use_programs`, else True)
         :param logging: logs progress bar.
         """
         out_dir = Path(out_dir)
         out_dir.mkdir(parents=True, exist_ok=True)
         # Saves the tokenizer so that it can be reloaded
         self.save_params(out_dir / tokenizer_config_file_name)
+        if save_programs is None:
+            save_programs = not self.config.use_programs

         for midi_path in (
             tqdm(
@@ -1674,14 +1677,10 @@ def save_tokens(
         kwargs["ids_bpe_encoded"] = ids_bpe_encoded

         with open(path, "w") as outfile:
-            json.dump(
-                {
-                    "ids": ids,
-                    "programs": programs if programs is not None else [],
-                    **kwargs,
-                },
-                outfile,
-            )
+            dic = {"ids": ids}
+            if programs is not None:
+                dic["programs"] = programs
+            json.dump(dic, outfile)

ilya16 (Contributor) commented on Aug 18, 2023:

    kwargs are not saved with the token data

Natooz (Author, Owner) replied on Aug 18, 2023:

    Indeed, thank you for spotting this! I'll commit a fix, which will be in the next update.
    I can't tell yet when the next update will be released, maybe in a few days or a week,
    along with other changes / fixes (if there are any).


     @staticmethod
     def load_tokens(path: Union[str, Path]) -> Union[List[Any], Dict]:
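Taken together, the changes above mean `tokenize_midi_dataset` now resolves `save_programs` from the config, and `save_tokens` only writes a `programs` entry when programs are actually given. A minimal sketch (hypothetical paths) of the resulting default behavior:

```python
from pathlib import Path

from miditok import REMI, TokenizerConfig

midi_paths = list(Path("path", "to", "dataset").glob("**/*.mid"))

# Multitrack tokenizer: Program tokens are part of the token sequences, so
# save_programs defaults to False and the JSON files only contain {"ids": ...}
multitrack = REMI(TokenizerConfig(use_programs=True))
multitrack.tokenize_midi_dataset(midi_paths, Path("path", "to", "tokens_multitrack"))

# Without use_programs, save_programs defaults to True and each JSON file
# keeps a "programs" entry so the tracks can be reconstructed when decoding
single = REMI(TokenizerConfig())
single.tokenize_midi_dataset(midi_paths, Path("path", "to", "tokens_single"))
```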
1 change: 1 addition & 0 deletions miditok/tokenizations/mumidi.py
@@ -47,6 +47,7 @@ class MuMIDI(MIDITokenizer):
     def _tweak_config_before_creating_voc(self):
         self.config.use_rests = False
         self.config.use_time_signatures = False
+        self.config.use_programs = True
         self.one_token_stream = True

         if "drum_pitch_range" not in self.config.additional_params:
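One consequence of this one-line change: since MuMIDI now forces `use_programs` on, the new `save_programs` default applies to it automatically. A small sketch, assuming a default config:

```python
from miditok import MuMIDI, TokenizerConfig

tokenizer = MuMIDI(TokenizerConfig())
assert tokenizer.config.use_programs  # forced by _tweak_config_before_creating_voc
# tokenize_midi_dataset will therefore default to save_programs=False for MuMIDI
```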
