# Mirror of https://github.com/kyegomez/OpenMythos.git
# Synced 2026-05-02 17:43:27 +02:00
# 65 lines, 1.8 KiB, Python
from transformers import AutoTokenizer
|
|
|
|
# Model identifier used when MythosTokenizer is constructed without an
# explicit ``model_id``; it is handed to AutoTokenizer.from_pretrained.
DEFAULT_MODEL_ID = "openai/gpt-oss-20b"
|
|
|
|
|
|
class MythosTokenizer:
    """
    Thin wrapper around a HuggingFace tokenizer for OpenMythos.

    The wrapped tokenizer is loaded once at construction time via
    ``AutoTokenizer.from_pretrained`` and every method delegates to it.

    Args:
        model_id (str): HuggingFace model ID or local path passed to
            ``AutoTokenizer.from_pretrained``. Defaults to
            ``DEFAULT_MODEL_ID``.

    Attributes:
        tokenizer: The object returned by ``AutoTokenizer.from_pretrained``.

    Example:
        >>> tok = MythosTokenizer()
        >>> ids = tok.encode("Hello world")
        >>> s = tok.decode(ids)
    """

    def __init__(self, model_id: str = DEFAULT_MODEL_ID):
        """
        Load the underlying tokenizer.

        Args:
            model_id (str): HuggingFace model identifier or path to
                tokenizer files.
        """
        loaded = AutoTokenizer.from_pretrained(model_id)
        self.tokenizer = loaded

    @property
    def vocab_size(self) -> int:
        """
        Report the vocabulary size of the wrapped tokenizer.

        This forwards the wrapped tokenizer's own ``vocab_size`` attribute
        unchanged.

        NOTE(review): HuggingFace documents ``vocab_size`` as the base
        vocabulary size, without added tokens -- confirm that is what
        callers expect.

        Returns:
            int: The value of ``self.tokenizer.vocab_size``.
        """
        backend = self.tokenizer
        return backend.vocab_size

    def encode(self, text: str) -> list[int]:
        """
        Convert a text string into token IDs.

        Special tokens are not added (``add_special_tokens=False``).

        Args:
            text (str): The text to tokenize.

        Returns:
            list[int]: Token IDs produced by the wrapped tokenizer.
        """
        ids = self.tokenizer.encode(text, add_special_tokens=False)
        return ids

    def decode(self, token_ids: list[int]) -> str:
        """
        Convert token IDs back into a text string.

        Special tokens are skipped in the output
        (``skip_special_tokens=True``).

        Args:
            token_ids (list[int]): Token IDs to decode.

        Returns:
            str: The decoded text.
        """
        decoded = self.tokenizer.decode(token_ids, skip_special_tokens=True)
        return decoded
|