#!/usr/bin/env nu
# fetch-tokenizers.nu — download tokenizer.json for each .model file
#
# usage: nu analizer/fetch-tokenizers.nu ~/llm
# model name → HuggingFace repo with tokenizer.json
# abliterated models share tokenizer with base model
# Returns a record mapping a local model basename (the stem used for the
# on-disk "<name>.tokenizer.json" file) to the HuggingFace repo ID that
# publishes a compatible tokenizer.json.
# Abliterated ("-abl") variants reuse the tokenizer of their upstream base
# model, so they point at the base repo rather than the abliterated one.
def source_map [] {
{
# Qwen family — abliterated builds share tokenizers with base repos.
"qwen3-0.6b-abl": "Qwen/Qwen3-0.6B"
"qwen2.5-0.5b-abl": "Qwen/Qwen2.5-0.5B"
"qwen2.5-coder-1.5b-abl": "Qwen/Qwen2.5-Coder-1.5B"
"qwen2.5-coder-14b-abl": "Qwen/Qwen2.5-Coder-14B"
# NOTE(review): the "qwen3.5" names map to Qwen3 repos — presumably the
# tokenizer is shared across these versions; confirm against the models.
"qwen3.5-4b-abl": "Qwen/Qwen3-4B"
"qwen3.5-9b-abl": "Qwen/Qwen3-8B"
"deepseek-r1-8b-abl": "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B"
# Small / specialized models.
"smollm2-360m": "HuggingFaceTB/SmolLM2-360M-Instruct"
"bitnet-2b": "microsoft/bitnet-b1.58-2B-4T"
"nuextract-1.5": "numind/NuExtract-1.5"
"mimo-7b-rl": "XiaomiMiMo/MiMo-7B-RL"
# Encoder / classifier / embedding models.
"deberta-zeroshot": "MoritzLaurer/deberta-v3-base-zeroshot-v2.0"
"modernbert": "answerdotai/ModernBERT-base"
"granite-hap-125m": "ibm-granite/granite-guardian-hap-125m"
"granite-hap-38m": "ibm-granite/granite-guardian-hap-38m"
"jina-v5-nano": "jinaai/jina-embeddings-v5-text-nano-retrieval"
# Audio / vision models.
"whisper-small": "openai/whisper-small"
"moondream2": "vikhyatk/moondream2"
"qwen2.5-vl-7b-abl": "Qwen/Qwen2.5-VL-7B-Instruct"
}
}
# Entry point: download tokenizer.json for every model in source_map into
# llm_path, saving each as "<model>.tokenizer.json".
# Idempotent — existing files are skipped. A failed download for one model
# is reported and the loop continues with the next model.
def main [llm_path: string] {
  let models = source_map
  print $"Fetching tokenizer.json files -> ($llm_path)\n"
  for model_name in ($models | columns) {
    let repo = $models | get $model_name
    let dst = ($llm_path | path join $"($model_name).tokenizer.json")
    if ($dst | path exists) {
      # Skip re-downloads so the script can be re-run safely.
      print $"  ($model_name) -> already exists"
      continue
    }
    print $"  ($model_name) -> ($repo)"
    try {
      # hf_hub_download returns the local cache path of the fetched file.
      # Parens are backslash-escaped so nu does not parse them as
      # subexpressions inside the interpolated string; the python code
      # must stay at column 0 to be valid for `python3 -c`.
      let result = (python3 -c $"
from huggingface_hub import hf_hub_download
path = hf_hub_download\('($repo)', 'tokenizer.json'\)
print\(path\)
" | str trim)
      cp $result $dst
      let size = (ls $dst | get size | first)
      print $"    ok: ($size)"
    } catch {
      # Keep going: one missing/failed tokenizer should not abort the rest.
      print $"    download failed"
    }
  }
  print "\ndone."
}
# NOTE(review): the two lines below were stray terminal-paste residue
# (an invocation line and a prompt/status fragment). At top level nu would
# try to execute them as commands — the first re-invokes this script,
# the second is not a valid command — so they are commented out.
# analizer/fetch-tokenizers.nu
# ฯ 0.0%