# Tokenizer backend changes
v5 eliminates the distinction between "slow" (Python-based) and "fast" (Rust-based) tokenizers, consolidating them into a unified backend system with automatic selection.
## Backend architecture

### Available backends (priority order)
- **`TokenizersBackend`** (preferred)
  - Rust-based, from the `tokenizers` library
  - Optimal performance
  - Most features
- **`SentencePieceBackend`**
  - Python wrapper around SentencePiece
  - Used for T5, XLNet, ALBERT, etc.
  - Requires `pip install sentencepiece`
- **`MistralCommonBackend`**
  - Uses the `mistral-common` library
  - Used for the Mistral model family
- **`PythonBackend`**
  - Pure Python implementation
  - Fallback when the other backends are unavailable
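Backend selection happens automatically at load time. A hypothetical check of which backend class was picked, assuming the backend classes above are importable from `transformers` (as the custom backend example later on this page does for `TokenizersBackend`):

```python
from transformers import AutoTokenizer, TokenizersBackend

# no use_fast flag in v5; the best available backend is chosen automatically
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

print(type(tokenizer).__name__)
print(isinstance(tokenizer, TokenizersBackend))  # True when the Rust backend is used
```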
## File consolidation

Before v5:

```
tokenization_<model>.py        # slow (Python)
tokenization_<model>_fast.py   # fast (Rust)
```

After v5:

```
tokenization_<model>.py        # single file, backend selected automatically
```

## API changes
### `encode_plus()` → `__call__()`

```python
# old (v4.x) - deprecated
output = tokenizer.encode_plus("Hello", padding=True)
output = tokenizer.batch_encode_plus(["Hello", "World"])

# new (v5)
output = tokenizer("Hello", padding=True)
output = tokenizer(["Hello", "World"])  # batch auto-detected
```
### Unified `decode()`

```python
# decode() now handles both single and batch
text = tokenizer.decode([101, 2023, 102])
texts = tokenizer.decode([[101, 2023], [101, 2054]])  # batch
```
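A quick round trip, assuming the batch-aware `decode()` described above:

```python
# encode a batch, then decode it back
batch = tokenizer(["Hello world", "How are you?"])
print(tokenizer.decode(batch["input_ids"][0], skip_special_tokens=True))

# per the batch behaviour above, a list of sequences decodes to a list of strings
print(tokenizer.decode(batch["input_ids"], skip_special_tokens=True))
```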
### `apply_chat_template()` behavior

```python
messages = [{"role": "user", "content": "Hello!"}]

# with tokenize=True and return_tensors, returns a tensor
output = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    return_tensors="pt"
)
print(type(output))  # torch.Tensor

# without tokenize, returns a string
output_str = tokenizer.apply_chat_template(messages, tokenize=False)
print(type(output_str))  # str
```
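For generation, the rendered prompt usually needs the assistant turn header appended; `add_generation_prompt` handles that. A sketch, assuming this parameter keeps its v4.x behaviour:

```python
# append the assistant header so the model continues as the assistant
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)  # rendered template string ending with the assistant header
```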
## Training tokenizers from scratch

### Direct initialization
```python
from transformers import LlamaTokenizer

# initialize an empty, trainable tokenizer
tokenizer = LlamaTokenizer()

# train on a corpus
tokenizer.train(corpus)
```
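A minimal end-to-end sketch, assuming `train()` accepts an iterable of raw strings as above; the corpus and output path are placeholders:

```python
from transformers import LlamaTokenizer

# placeholder corpus: any iterable of raw text lines
corpus = ["first training sentence", "second training sentence"]

tokenizer = LlamaTokenizer()
tokenizer.train(corpus)

# persist in the unified v5 format for later from_pretrained() loading
tokenizer.save_pretrained("./my-tokenizer")
```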
### Custom backend example

```python
from transformers import TokenizersBackend
from tokenizers import pre_tokenizers, Tokenizer
from tokenizers.models import BPE


class CustomTokenizer(TokenizersBackend):
    def __init__(self, vocab=None, merges=None, **kwargs):
        if vocab is None:
            self._vocab = {"<unk>": 0, "<s>": 1, "</s>": 2}
        else:
            self._vocab = vocab
        self._merges = merges or []

        self._tokenizer = Tokenizer(
            BPE(vocab=self._vocab, merges=self._merges)
        )
        self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()

        super().__init__(
            tokenizer_object=self._tokenizer,
            unk_token="<unk>",
            **kwargs
        )
```
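Usage is then the same as for any other tokenizer class; a short sketch, assuming `TokenizersBackend` exposes the standard tokenizer interface (`__len__`, `save_pretrained`):

```python
tok = CustomTokenizer()

print(len(tok))       # vocabulary size (3 for the minimal default vocab)
print(tok.unk_token)  # "<unk>"

# saved in the unified v5 format, reloadable with from_pretrained()
tok.save_pretrained("./custom-tokenizer")
```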
### BPE training

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=30000,
    min_frequency=2
)

tokenizer.train(files=["corpus.txt"], trainer=trainer)
tokenizer.save("tokenizer.json")
```
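To sanity-check the result, the saved `tokenizer.json` can be loaded straight back with the `tokenizers` library (the sample sentence is arbitrary):

```python
from tokenizers import Tokenizer

trained = Tokenizer.from_file("tokenizer.json")
encoding = trained.encode("Hello, tokenizers!")
print(encoding.tokens)
print(encoding.ids)
```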
## Configuration consolidation

### Removed files

- `special_tokens_map.json`
- `added_tokens.json`
### New structure

`tokenizer_config.json`:

```json
{
  "name_or_path": "meta-llama/Llama-3.2-3B",
  "tokenizer_class": "LlamaTokenizer",
  "bos_token": "<s>",
  "eos_token": "</s>",
  "added_tokens_decoder": {
    "0": {
      "content": "<unk>",
      "special": true
    }
  }
}
```
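The `added_tokens_decoder` block maps token ids to their definitions and is also exposed on the loaded tokenizer; a sketch, assuming the `added_tokens_decoder` property keeps its v4.x behaviour (the checkpoint name is illustrative and gated):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")

# mirrors the "added_tokens_decoder" section of tokenizer_config.json
for token_id, token in tokenizer.added_tokens_decoder.items():
    print(token_id, token.content, token.special)
```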
### Special tokens as `AddedToken` objects

```python
# v5 - special tokens are AddedToken objects
print(type(tokenizer.bos_token))  # <class 'AddedToken'>
print(str(tokenizer.bos_token))   # "<s>"
```

## Deprecations and removals
| Feature | Status | Replacement |
|---|---|---|
| `encode_plus()` | removed | `__call__()` |
| `batch_encode_plus()` | removed | `__call__()` |
| `use_fast` parameter | removed | automatic backend selection |
| `special_tokens_map.json` | removed | `tokenizer_config.json` |
| `added_tokens.json` | removed | `tokenizer_config.json` |
| `use_auth_token` | removed | `token` |
| `resume_download` | removed | always enabled |
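For the `use_auth_token` → `token` change, only the argument name changes; a sketch with an illustrative private repo id (`token=True` reuses the credentials stored by `huggingface-cli login`):

```python
from transformers import AutoTokenizer

# v4.x (removed): use_auth_token="hf_..."
# v5: pass `token` instead
tokenizer = AutoTokenizer.from_pretrained("your-org/private-model", token=True)
```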
## Migration examples

### Basic tokenization
```python
# v4.x
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base", use_fast=True)
output = tokenizer.encode_plus("Hello", return_tensors="pt")

# v5
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base")  # no use_fast
output = tokenizer("Hello", return_tensors="pt")
```

### Chat templates
```python
# v4.x
input_ids = tokenizer.apply_chat_template(messages, tokenize=True)
output = {'input_ids': torch.tensor([input_ids])}

# v5
output = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    return_tensors="pt"
)
# direct use with model.generate(**output)
```
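Spelled out end to end, feeding the templated prompt into generation looks roughly like this; the model id is illustrative, and `add_generation_prompt=True` / `return_dict=True` are assumed to keep their v4.x behaviour so the output unpacks into `generate(**inputs)`:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen2.5-0.5B-Instruct"  # illustrative chat model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

messages = [{"role": "user", "content": "Hello!"}]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,   # dict output so it unpacks into generate(**inputs)
    return_tensors="pt",
)

generated = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(generated[0], skip_special_tokens=True))
```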
### Saving tokenizers

```python
# save in the new unified format
tokenizer.save_pretrained("./tokenizer", legacy_format=False)

# upload to the Hub
tokenizer.push_to_hub("username/model-name")
```

### Token type IDs
Not all models use token type IDs:

```python
# BERT uses token type IDs
output = bert_tokenizer("First.", "Second.", return_token_type_ids=True)
print(output['token_type_ids'])  # [0, 0, 0, 1, 1, 1, ...]

# T5 returns all zeros
output = t5_tokenizer("Hello", return_token_type_ids=True)
print(output['token_type_ids'])  # [0, 0, 0, ...]
```