# utils.py
import re
import unicodedata
from typing import List

# Matches every character that is neither a Unicode word character
# (letters, digits, underscore) nor whitespace.
_NON_WORD_RE = re.compile(r'[^\w\s]')


def tokenize_vi_simple(text: str) -> List[str]:
    """
    Tokenizes Vietnamese text simply for tasks like BM25.

    The text is normalized to Unicode NFC, lowercased, stripped of every
    character that is not a word character or whitespace, and then split
    on runs of whitespace.

    Args:
        text (str): The input Vietnamese text.

    Returns:
        List[str]: A list of tokens. An empty list is returned for empty or
            whitespace-only input and for any input that is not a string.
    """
    if not isinstance(text, str):
        # Deliberately lenient: non-string input yields no tokens instead of
        # raising, so callers can pass through missing values.
        return []

    # Compose base letters with their combining marks first. In decomposed
    # (NFD) text the Vietnamese tone/vowel marks are separate combining code
    # points which `\w` does not match; without this step they would be
    # deleted below and "Việt" would silently become "viet", producing
    # different tokens than the same word in precomposed form.
    text = unicodedata.normalize('NFC', text)
    text = text.lower()

    # Punctuation is deleted (not replaced by a space), so "200.000đ"
    # becomes the single token "200000đ".
    text = _NON_WORD_RE.sub('', text)
    return text.split()


# You can add other general utility functions here as your project grows.
# For example:
# - Functions for logging
# - Functions for path manipulation if they are used across multiple modules
# - Simple data validation or cleaning routines not specific to law data or LLMs


def _run_self_test() -> None:
    """Print the tokenizer output for a handful of sample inputs."""
    print("Testing utils.py...")

    # Test tokenize_vi_simple
    print("\n--- Test tokenize_vi_simple ---")
    test_phrases = [
        "Luật Giao thông Đường bộ Việt Nam 2023!",
        "Xe ô tô con và xe máy.",
        "  Phạt tiền từ 200.000đ đến 400.000đ.  ",
        "",
        None,  # Test with None
        123,  # Test with non-string
    ]
    for phrase in test_phrases:
        print(f"Input: '{phrase}' (type: {type(phrase).__name__})")
        tokens = tokenize_vi_simple(phrase)
        print(f"Tokens: {tokens}")
        print("-" * 10)


if __name__ == '__main__':
    _run_self_test()