Metacreation/GigaMIDI
Viewer • Updated • 3.44M • 3.56k • 43
A Byte-Pair Encoding (BPE) tokenizer trained on REMI (REvamped MIDI-derived events) tokens for efficient symbolic music representation. This tokenizer combines the expressiveness of REMI encoding with the compression benefits of BPE, making it well suited to training large language models on MIDI data.
The tokenizer uses the following REMI configuration:
- Special tokens: `PAD_None`, `BOS_None`, `EOS_None`, `MASK_None`

REMI represents MIDI files using the following event types:
| Token Type | Description | Example |
|---|---|---|
| `Bar_None` | Measure/bar boundary | `Bar_None` |
| `TimeSig_X/Y` | Time signature | `TimeSig_4/4` |
| `Position_N` | Position within measure (ticks) | `Position_16` |
| `Tempo_X` | Tempo in BPM | `Tempo_121.29` |
| `Program_N` | MIDI program/instrument | `Program_0` (Piano) |
| `Pitch_N` | MIDI note pitch (0-127) | `Pitch_69` (A4) |
| `Velocity_N` | Note velocity (dynamics) | `Velocity_63` |
| `Duration_X.Y.Z` | Note duration | `Duration_4.0.4` |
| `Chord_X` | Chord detection | `Chord_C:maj` |
pip install miditok transformers torch
from miditok import MusicTokenizer
from pathlib import Path
# Fetch the pretrained REMI + BPE tokenizer from the HuggingFace Hub
tokenizer = MusicTokenizer.from_pretrained("manoskary/miditok-REMI")

# Encode a single MIDI file into a token sequence
midi_path = Path("your_music.mid")
tok_seq = tokenizer(midi_path)

# Integer IDs: this is the representation fed to a model during training
token_ids = tok_seq.ids
id_preview = token_ids[:10]  # only the first 10 IDs, to keep output short
print(f"Sequence length: {len(token_ids)}")
print(f"Token IDs: {id_preview}...")

# String form of the tokens, for human inspection
token_preview = tok_seq.tokens[:10]
print(f"Token strings: {token_preview}")
from miditok import MusicTokenizer
from pathlib import Path
# Round trip: MIDI file -> tokens -> MIDI file -> tokens, then compare IDs.
tok = MusicTokenizer.from_pretrained("manoskary/miditok-REMI")

# 1. MIDI → Tokens
midi = Path("input.mid")
tok_seq = tok(midi)
first_tokens = tok_seq.tokens[:5]
first_ids = tok_seq.ids[:5]
n_ids = len(tok_seq.ids)
print("Original MIDI tokenized:")
print(f" - Tokens: {first_tokens}...")
print(f" - IDs: {first_ids}...")
print(f" - Length: {n_ids}")

# 2. Tokens → MIDI: decode the IDs and write the result to disk
score = tok.decode(tok_seq.ids)
score.dump_midi("reconstructed.mid")

# 3. Re-tokenize the written file; identical IDs mean nothing was lost
tok_seq_reconstructed = tok("reconstructed.mid")
ids_match = tok_seq.ids == tok_seq_reconstructed.ids
assert ids_match, "Reconstruction failed!"
print("\n✓ Perfect reconstruction verified!")
import torch
from miditok import MusicTokenizer
from transformers import BertForMaskedLM
# Run a masked-LM model over a tokenized MIDI file and take the most likely
# token at every position.

# Load tokenizer and model
tokenizer = MusicTokenizer.from_pretrained("manoskary/miditok-REMI")
model = BertForMaskedLM.from_pretrained("your-musicbert-model")

# Tokenize MIDI
midi_path = "song.mid"
tok_seq = tokenizer(midi_path)

# Truncate to the maximum length (512 here; adjust to your model's limit) and
# wrap in a list to add the batch dimension. dtype is pinned to long so an
# empty ID list still yields an integer tensor.
max_len = 512
input_ids = torch.tensor([tok_seq.ids[:max_len]], dtype=torch.long)

# Forward pass; no_grad() disables gradient tracking for inference
with torch.no_grad():
    outputs = model(input_ids=input_ids)
    logits = outputs.logits

# Generate predictions: argmax over the last (vocabulary) dimension
predictions = logits.argmax(dim=-1)
from miditok import MusicTokenizer
from pathlib import Path
import torch
# Tokenize a folder of MIDI files and assemble a fixed-length padded batch.
tokenizer = MusicTokenizer.from_pretrained("manoskary/miditok-REMI")

# Process multiple MIDI files. glob() order is filesystem-dependent, so sort
# to make "the first 100 files" reproducible across runs and machines.
midi_files = sorted(Path("midi_dataset/").glob("*.mid"))
all_sequences = []
for midi_file in midi_files[:100]:  # Process first 100 files
    try:
        tok_seq = tokenizer(midi_file)
    except Exception as e:
        # Best-effort batch job: report the bad file and keep going.
        print(f"Error processing {midi_file}: {e}")
    else:
        all_sequences.append(tok_seq.ids)

# Pad sequences for batch processing
max_len = 2048
# Ask the tokenizer for its PAD id instead of assuming it is 0.
pad_id = tokenizer.pad_token_id
padded_sequences = []
for seq in all_sequences:
    if len(seq) > max_len:
        seq = seq[:max_len]  # Truncate
    else:
        seq = seq + [pad_id] * (max_len - len(seq))  # Pad with PAD token
    padded_sequences.append(seq)

# dtype is pinned so the tensor holds integer IDs even if no file was tokenized
batch = torch.tensor(padded_sequences, dtype=torch.long)
print(f"Batch shape: {batch.shape}")  # [batch_size, max_len]
For a simple MIDI file with a few notes:
# Example tokenizer output: `tokens` lists the REMI events before BPE, and
# `ids` lists the integer IDs after BPE merging.
TokSequence(
    tokens=[
        'Bar_None', # New measure
        'TimeSig_4/4', # 4/4 time signature
        'Position_0', # Start of measure
        'Tempo_121.29', # Tempo = 121 BPM
        'Program_0', # Piano instrument
        'Pitch_69', # A4 note
        'Velocity_63', # Medium velocity
        # NOTE(review): MidiTok documents Duration_X.Y.Z as X beats plus Y
        # samples at a resolution of Z samples per beat. Under that reading
        # Duration_4.0.4 is 4 beats and Duration_2.0.8 is 2 beats, not a
        # quarter/eighth note as the comments below say — confirm.
        'Duration_4.0.4', # Quarter note duration
        'Position_16', # 16 ticks later
        'Program_0', # Piano
        'Pitch_72', # C5 note
        'Velocity_63', # Medium velocity
        'Duration_2.0.8', # Eighth note duration
        'Program_0', # Piano
        'Pitch_76', # E5 note
        'Velocity_63', # Medium velocity
        'Duration_2.0.8' # Eighth note duration
    ],
    ids=[532, 4, 531, 190, 374, 580, 850, 2595, 33442, 686],
    # BPE compression: 17 REMI tokens → 10 BPE tokens (41% compression!)
)