# utils.py
import re
from typing import List


def tokenize_vi_simple(text: str) -> List[str]:
    """
    Tokenize Vietnamese text in a simple way for tasks like BM25.

    Converts to lowercase, strips basic punctuation, and splits on whitespace.

    Args:
        text (str): The input Vietnamese text.

    Returns:
        List[str]: A list of tokens.
    """
    if not isinstance(text, str):
        # Alternatively: raise TypeError("Input must be a string")
        return []
    text = text.lower()
    # Drop any character that is not a word character or whitespace.
    # \w is Unicode-aware in Python 3, so Vietnamese diacritics are preserved.
    text = re.sub(r'[^\w\s]', '', text)
    return text.split()

# You can add other general utility functions here as your project grows.
# For example (one illustrative sketch follows):
# - Functions for logging
# - Functions for path manipulation, if they are used across multiple modules
# - Simple data validation or cleaning routines not specific to law data or LLMs
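
# Illustrative sketch only (not part of the original module): one example of the
# kind of small, general-purpose cleaning routine mentioned above. The name
# `normalize_whitespace` and its exact behavior are assumptions, not an existing API.
def normalize_whitespace(text: str) -> str:
    """Collapse runs of whitespace into single spaces and trim the ends."""
    if not isinstance(text, str):
        return ""
    return re.sub(r'\s+', ' ', text).strip()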

if __name__ == '__main__':
    print("Testing utils.py...")

    # Test tokenize_vi_simple
    print("\n--- Test tokenize_vi_simple ---")
    test_phrases = [
        "Luật Giao thông Đường bộ Việt Nam 2023!",
        "Xe ô tô con và xe máy.",
        "  Phạt tiền từ 200.000đ đến 400.000đ.  ",
        "",
        None,  # Test with None
        123    # Test with a non-string
    ]
    for phrase in test_phrases:
        print(f"Input: '{phrase}' (type: {type(phrase).__name__})")
        tokens = tokenize_vi_simple(phrase)
        print(f"Tokens: {tokens}")
        print("-" * 10)