from tessar_tokenizer import TessarTokenizer, load_tessar_tokenizer

# Example 1: Load the pretrained Tessar tokenizer
tokenizer = TessarTokenizer.from_pretrained("SVECTOR-CORPORATION/Tessar-largest")

# Example 2: Tokenize a simple text
text = "Hello, how are you doing today?"
encoded = tokenizer(text, return_tensors="pt")
print("Encoded Input:", encoded)

# Example 3: Batch tokenization with padding and truncation
texts = [
    "Hello, world!",
    "This is a test sentence.",
    "Tokenization is an important NLP task."
]
batch_encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
print("Batch Encoded Inputs:", batch_encoded)

# Example 4: Save and reload the tokenizer
save_directory = "./tessar_tokenizer"
tokenizer.save_pretrained(save_directory)

# Reload the saved tokenizer from disk
reloaded_tokenizer = load_tessar_tokenizer(save_directory)

# Example 5: Custom tokenizer with specific parameters
custom_tokenizer = TessarTokenizer(
    do_lower_case=True,
    max_cell_length=20,
    unk_token="[UNK]",
    pad_token="[PAD]",
)

# Tokenize with the custom settings
custom_text = "A custom tokenization example"
custom_encoded = custom_tokenizer(custom_text, return_tensors="pt")
print("Custom Tokenizer Encoded:", custom_encoded)
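
# Example 6 (illustrative sketch): decode token IDs back to text.
# This assumes TessarTokenizer follows the standard Hugging Face
# tokenizer API, so decode() and skip_special_tokens exist; neither
# is confirmed by the examples above.
decoded_text = tokenizer.decode(encoded["input_ids"][0], skip_special_tokens=True)
print("Decoded Text:", decoded_text)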
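
# Example 7 (illustrative sketch): sanity-check that save/reload preserved
# the vocabulary. Assumes the reloaded tokenizer is callable like the
# original and that the same input yields identical input_ids tensors.
reloaded_encoded = reloaded_tokenizer(text, return_tensors="pt")
assert (encoded["input_ids"] == reloaded_encoded["input_ids"]).all(), \
    "Reloaded tokenizer produced different token IDs"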