tatm.tokenizer Tokenizer API Reference

class tatm.tokenizer.TokenizationEngine(data: List[str | TatmDataMetadata], tokenizer: str, output_dir: str, file_prefix: str, dtype: str = 'uint32', log_level: int = 20)

Bases: object

run_with_ray(num_workers: int | None = None)

tatm.tokenizer Subcomponents

class tatm.tokenizer.engine.ExampleMessage(data: dict, content_field: str)

Bases: object

content_field: str
data: dict

class tatm.tokenizer.engine.TokenizationEngine(data: List[str | TatmDataMetadata], tokenizer: str, output_dir: str, file_prefix: str, dtype: str = 'uint32', log_level: int = 20)

Bases: object

run_with_ray(num_workers: int | None = None)