midi_tokenizer.py

import numpy as np
from numba import jit
import pretty_midi
import scipy.interpolate as interp

TOKEN_SPECIAL: int = 0
TOKEN_NOTE: int = 1
TOKEN_VELOCITY: int = 2
TOKEN_TIME: int = 3

DEFAULT_VELOCITY: int = 77

TIE: int = 2
EOS: int = 1
PAD: int = 0


def extrapolate_beat_times(beat_times, n_extend=1):
    beat_times_function = interp.interp1d(
        np.arange(beat_times.size),
        beat_times,
        bounds_error=False,
        fill_value="extrapolate",
    )

    ext_beats = beat_times_function(
        np.linspace(0, beat_times.size + n_extend - 1, beat_times.size + n_extend)
    )

    return ext_beats


@jit(nopython=True, cache=True)
def fast_tokenize(idx, token_type, n_special, n_note, n_velocity):
    if token_type == TOKEN_TIME:
        return n_special + n_note + n_velocity + idx
    elif token_type == TOKEN_VELOCITY:
        return n_special + n_note + idx
    elif token_type == TOKEN_NOTE:
        return n_special + idx
    elif token_type == TOKEN_SPECIAL:
        return idx
    else:
        return -1


@jit(nopython=True, cache=True)
def fast_detokenize(idx, n_special, n_note, n_velocity, time_idx_offset):
    if idx >= n_special + n_note + n_velocity:
        return (TOKEN_TIME, (idx - (n_special + n_note + n_velocity)) + time_idx_offset)
    elif idx >= n_special + n_note:
        return TOKEN_VELOCITY, idx - (n_special + n_note)
    elif idx >= n_special:
        return TOKEN_NOTE, idx - n_special
    else:
        return TOKEN_SPECIAL, idx


class MidiTokenizer:
    def __init__(self, config) -> None:
        self.config = config

    def tokenize_note(self, idx, token_type):
        rt = fast_tokenize(
            idx,
            token_type,
            self.config.vocab_size.special,
            self.config.vocab_size.note,
            self.config.vocab_size.velocity,
        )
        if rt == -1:
            raise ValueError(f"type {type} is not a predefined token type.")
        else:
            return rt

    def notes_to_tokens(self, notes):
        """
        notes : (onset idx, offset idx, pitch, velocity)
        """
        max_time_idx = notes[:, :2].max()

        times = [[] for i in range((max_time_idx + 1))]
        for onset, offset, pitch, velocity in notes:
            times[onset].append([pitch, velocity])
            times[offset].append([pitch, 0])

        tokens = []
        current_velocity = 0
        for i, time in enumerate(times):
            if len(time) == 0:
                continue
            tokens.append(self.tokenize_note(i, TOKEN_TIME))
            for pitch, velocity in time:
                velocity = int(velocity > 0)
                if current_velocity != velocity:
                    current_velocity = velocity
                    tokens.append(self.tokenize_note(velocity, TOKEN_VELOCITY))
                tokens.append(self.tokenize_note(pitch, TOKEN_NOTE))

        return np.array(tokens, dtype=int)

    def detokenize(self, token, time_idx_offset):
        type, value = fast_detokenize(
            token,
            n_special=self.config.vocab_size.special,
            n_note=self.config.vocab_size.note,
            n_velocity=self.config.vocab_size.velocity,
            time_idx_offset=time_idx_offset,
        )
        if type != TOKEN_TIME:
            value = int(value)
        return [type, value]

    def to_string(self, tokens, time_idx_offset=0):
        nums = [
            self.detokenize(token, time_idx_offset=time_idx_offset) for token in tokens
        ]
        strings = []
        for i in range(len(nums)):
            type = nums[i][0]
            value = nums[i][1]

            if type == TOKEN_TIME:
                type = "time"
            elif type == TOKEN_SPECIAL:
                if value == EOS:
                    value = "EOS"
                elif value == PAD:
                    value = "PAD"
                elif value == TIE:
                    value = "TIE"
                else:
                    value = "Unknown Special"
            elif type == TOKEN_NOTE:
                type = "note"
            elif type == TOKEN_VELOCITY:
                type = "velocity"
            strings.append((type, value))
        return strings

    def split_notes(self, notes, beatsteps, time_from, time_to):
        """
        Assumptions
        - notes are sorted by onset time
        - beatsteps are sorted by time
        """
        start_idx = np.searchsorted(beatsteps, time_from)
        start_note = np.searchsorted(notes[:, 0], start_idx)

        end_idx = np.searchsorted(beatsteps, time_to)
        end_note = np.searchsorted(notes[:, 0], end_idx)
        splited_notes = notes[start_note:end_note]

        return splited_notes, (start_idx, end_idx, start_note, end_note)

    def notes_to_relative_tokens(
        self, notes, offset_idx, add_eos=False, add_composer=False, composer_value=None
    ):
        """
        notes : (onset idx, offset idx, pitch, velocity)
        """

        def _add_eos(tokens):
            tokens = np.concatenate((tokens, np.array([EOS], dtype=tokens.dtype)))
            return tokens

        def _add_composer(tokens, composer_value):
            tokens = np.concatenate(
                (np.array([composer_value], dtype=tokens.dtype), tokens)
            )
            return tokens

        if len(notes) == 0:
            tokens = np.array([], dtype=int)
            if add_eos:
                tokens = _add_eos(tokens)
            if add_composer:
                tokens = _add_composer(tokens, composer_value=composer_value)
            return tokens

        max_time_idx = notes[:, :2].max()

        # times[time_idx] = [[pitch, .. ], [pitch, 0], ..]
        times = [[] for i in range((max_time_idx + 1 - offset_idx))]
        for abs_onset, abs_offset, pitch, velocity in notes:
            rel_onset = abs_onset - offset_idx
            rel_offset = abs_offset - offset_idx
            times[rel_onset].append([pitch, velocity])
            times[rel_offset].append([pitch, 0])

        # 여기서부터는 전부 시간 0(offset) 기준
        tokens = []
        current_velocity = 0
        current_time_idx = 0

        for rel_idx, time in enumerate(times):
            if len(time) == 0:
                continue
            time_idx_shift = rel_idx - current_time_idx
            current_time_idx = rel_idx

            tokens.append(self.tokenize_note(time_idx_shift, TOKEN_TIME))
            for pitch, velocity in time:
                velocity = int(velocity > 0)
                if current_velocity != velocity:
                    current_velocity = velocity
                    tokens.append(self.tokenize_note(velocity, TOKEN_VELOCITY))
                tokens.append(self.tokenize_note(pitch, TOKEN_NOTE))

        tokens = np.array(tokens, dtype=int)
        if add_eos:
            tokens = _add_eos(tokens)
        if add_composer:
            tokens = _add_composer(tokens, composer_value=composer_value)
        return tokens

    def relative_batch_tokens_to_midi(
        self,
        tokens,
        beatstep,
        beat_offset_idx=None,
        bars_per_batch=None,
        cutoff_time_idx=None,
    ):
        """
        tokens : (batch, sequence)
        beatstep : (times, )
        """
        beat_offset_idx = 0 if beat_offset_idx is None else beat_offset_idx
        notes = None
        bars_per_batch = 2 if bars_per_batch is None else bars_per_batch

        N = len(tokens)
        for n in range(N):
            _tokens = tokens[n]
            _start_idx = beat_offset_idx + n * bars_per_batch * 4
            _cutoff_time_idx = cutoff_time_idx + _start_idx
            _notes = self.relative_tokens_to_notes(
                _tokens,
                start_idx=_start_idx,
                cutoff_time_idx=_cutoff_time_idx,
            )
            # print(_notes, "\n-------")
            if len(_notes) == 0:
                pass
                # print("_notes zero")
            elif notes is None:
                notes = _notes
            else:
                notes = np.concatenate((notes, _notes), axis=0)

        if notes is None:
            notes = []
        midi = self.notes_to_midi(notes, beatstep, offset_sec=beatstep[beat_offset_idx])
        return midi, notes

    def relative_tokens_to_notes(self, tokens, start_idx, cutoff_time_idx=None):
        # TODO remove legacy
        # decoding 첫토큰이 편곡자인 경우
        if tokens[0] >= sum(self.config.vocab_size.values()):
            tokens = tokens[1:]

        words = [self.detokenize(token, time_idx_offset=0) for token in tokens]

        if hasattr(start_idx, "item"):
            """
            if numpy or torch tensor
            """
            start_idx = start_idx.item()

        current_idx = start_idx
        current_velocity = 0
        note_onsets_ready = [None for i in range(self.config.vocab_size.note + 1)]
        notes = []
        for type, number in words:
            if type == TOKEN_SPECIAL:
                if number == EOS:
                    break
            elif type == TOKEN_TIME:
                current_idx += number
                if cutoff_time_idx is not None:
                    current_idx = min(current_idx, cutoff_time_idx)

            elif type == TOKEN_VELOCITY:
                current_velocity = number
            elif type == TOKEN_NOTE:
                pitch = number
                if current_velocity == 0:
                    # note_offset
                    if note_onsets_ready[pitch] is None:
                        # offset without onset
                        pass
                    else:
                        onset_idx = note_onsets_ready[pitch]
                        if onset_idx >= current_idx:
                            # No time shift after previous note_on
                            pass
                        else:
                            offset_idx = current_idx
                            notes.append(
                                [onset_idx, offset_idx, pitch, DEFAULT_VELOCITY]
                            )
                            note_onsets_ready[pitch] = None
                else:
                    # note_on
                    if note_onsets_ready[pitch] is None:
                        note_onsets_ready[pitch] = current_idx
                    else:
                        # note-on already exists
                        onset_idx = note_onsets_ready[pitch]
                        if onset_idx >= current_idx:
                            # No time shift after previous note_on
                            pass
                        else:
                            offset_idx = current_idx
                            notes.append(
                                [onset_idx, offset_idx, pitch, DEFAULT_VELOCITY]
                            )
                            note_onsets_ready[pitch] = current_idx
            else:
                raise ValueError

        for pitch, note_on in enumerate(note_onsets_ready):
            # force offset if no offset for each pitch
            if note_on is not None:
                if cutoff_time_idx is None:
                    cutoff = note_on + 1
                else:
                    cutoff = max(cutoff_time_idx, note_on + 1)

                offset_idx = max(current_idx, cutoff)
                notes.append([note_on, offset_idx, pitch, DEFAULT_VELOCITY])

        if len(notes) == 0:
            return []
        else:
            notes = np.array(notes)
            note_order = notes[:, 0] * 128 + notes[:, 1]
            notes = notes[note_order.argsort()]
            return notes

    def notes_to_midi(self, notes, beatstep, offset_sec=None):
        new_pm = pretty_midi.PrettyMIDI(resolution=384, initial_tempo=120.0)
        new_inst = pretty_midi.Instrument(program=0)
        new_notes = []
        if offset_sec is None:
            offset_sec = 0.0

        for onset_idx, offset_idx, pitch, velocity in notes:
            new_note = pretty_midi.Note(
                velocity=velocity,
                pitch=pitch,
                start=beatstep[onset_idx] - offset_sec,
                end=beatstep[offset_idx] - offset_sec,
            )
            new_notes.append(new_note)
        new_inst.notes = new_notes
        new_pm.instruments.append(new_inst)
        new_pm.remove_invalid_notes()
        return new_pm


@jit(nopython=True, cache=False)
def fast_notes_to_relative_tokens(
    notes, offset_idx, max_time_idx, n_special, n_note, n_velocity
):
    """
    notes : (onset idx, offset idx, pitch, velocity)
    """

    times_p = [np.array([], dtype=int) for i in range((max_time_idx + 1 - offset_idx))]
    times_v = [np.array([], dtype=int) for i in range((max_time_idx + 1 - offset_idx))]

    for abs_onset, abs_offset, pitch, velocity in notes:
        rel_onset = abs_onset - offset_idx
        rel_offset = abs_offset - offset_idx
        times_p[rel_onset] = np.append(times_p[rel_onset], pitch)
        times_v[rel_onset] = np.append(times_v[rel_onset], velocity)
        times_p[rel_offset] = np.append(times_p[rel_offset], pitch)
        times_v[rel_offset] = np.append(times_v[rel_offset], velocity)

    # 여기서부터는 전부 시간 0(offset) 기준
    tokens = []
    current_velocity = np.array([0])
    current_time_idx = np.array([0])

    # range가 0일 수도 있으니까..
    for i in range(len(times_p)):
        rel_idx = i
        notes_at_time = times_p[i]
        if len(notes_at_time) == 0:
            continue

        time_idx_shift = rel_idx - current_time_idx[0]
        current_time_idx[0] = rel_idx

        token = fast_tokenize(
            time_idx_shift,
            TOKEN_TIME,
            n_special=n_special,
            n_note=n_note,
            n_velocity=n_velocity,
        )
        tokens.append(token)

        for j in range(len(notes_at_time)):
            pitch = times_p[j]
            velocity = times_v[j]
            # for pitch, velocity in time:
            velocity = int(velocity > 0)
            if current_velocity[0] != velocity:
                current_velocity[0] = velocity
                token = fast_tokenize(
                    velocity,
                    TOKEN_VELOCITY,
                    n_special=n_special,
                    n_note=n_note,
                    n_velocity=n_velocity,
                )
                tokens.append(token)
            token = fast_tokenize(
                pitch,
                TOKEN_NOTE,
                n_special=n_special,
                n_note=n_note,
                n_velocity=n_velocity,
            )
            tokens.append(token)

    return np.array(tokens)