chatbot_demo / utils.py
deddoggo's picture
update full version
c69c2f9
raw
history blame
1.5 kB
# utils.py
import re
import unicodedata
from typing import List
def tokenize_vi_simple(text: str) -> List[str]:
    """
    Tokenize Vietnamese text in a simple way, e.g. for BM25 indexing.

    The text is normalized to Unicode NFC, lowercased, stripped of every
    character that is neither a word character nor whitespace, and then
    split on runs of whitespace.

    Args:
        text (str): The input Vietnamese text.

    Returns:
        List[str]: The tokens in order of appearance. An empty list is
        returned for an empty/whitespace-only string and for any input
        that is not a ``str`` (no exception is raised).

    Notes:
        - Punctuation is deleted, not replaced by a space, so "200.000"
          becomes the single token "200000".
        - ``\\w`` also matches the underscore, so "_" is kept.
    """
    if not isinstance(text, str):
        # Deliberately lenient: callers may pass None or other non-strings.
        return []
    # Compose base letters with their combining marks first. In decomposed
    # (NFD) text the tone/vowel marks are separate code points that ``\w``
    # does not match, so the regex below would silently strip them and
    # "việt" would degrade to "viet", no longer matching NFC input.
    text = unicodedata.normalize("NFC", text)
    text = text.lower()
    # Drop everything that is not a word character or whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    # split() with no argument collapses whitespace runs and ignores
    # leading/trailing whitespace, so no empty tokens are produced.
    return text.split()
# You can add other general utility functions here as your project grows.
# For example:
# - Functions for logging
# - Functions for path manipulation if they are used across multiple modules
# - Simple data validation or cleaning routines not specific to law data or LLMs
if __name__ == '__main__':
    # Manual smoke test: run this module directly to eyeball the tokenizer's
    # output on a handful of representative inputs.
    print("Testing utils.py...")
    print("\n--- Test tokenize_vi_simple ---")

    separator = "-" * 10
    # Mix of ordinary sentences, padded text, an empty string and two
    # non-string values to exercise the tokenizer's lenient input handling.
    samples = (
        "Luật Giao thông Đường bộ Việt Nam 2023!",
        "Xe ô tô con và xe máy.",
        " Phạt tiền từ 200.000đ đến 400.000đ. ",
        "",
        None,
        123,
    )
    for sample in samples:
        type_name = type(sample).__name__
        print(f"Input: '{sample}' (type: {type_name})")
        result = tokenize_vi_simple(sample)
        print(f"Tokens: {result}")
        print(separator)