#!/usr/bin/env python3
"""fix-vocab.py โ regenerate ~~~vocab sections in .model files from tokenizer.json
Usage: python3 analizer/fix-vocab.py ~/llm
python3 analizer/fix-vocab.py ~/llm qwen3-0.6b-abl
Reads name.tokenizer.json alongside name.model, generates correct TOML
escaping for [tokens] and [merges], patches the .model file in-place.
"""
"""Escape a string for TOML double-quoted format."""
=
== :
== :
== :
== :
< 0x20:
return
"""Generate correct vocab.toml content from tokenizer.json."""
=
=
=
=
=
=
# BPE: vocab is {token: id}
=
:
# Unigram/SentencePiece: vocab is [[token, score], ...]
=
# Unknown format
return
, =
:
=
continue
, =
continue
return
"""Replace ~~~vocab section in .model file with correct content."""
=
# Find ~~~vocab and ~~~eval (or ~~~weights) markers
= b
= b
= b
=
return False
= +
# Find the next section after vocab
=
=
=
# Replace vocab content
= + +
return True
=
=
continue
= # strip .model
continue
=
=
continue
=
continue
= - 2 # rough count
# Count tokens and merges
=
analizer/fix-vocab.py
ฯ 0.0%