File size: 1,499 Bytes
c69c2f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# utils.py

import re
import unicodedata
from typing import List

def tokenize_vi_simple(text: str) -> List[str]:
    """
    Tokenize Vietnamese text simply for tasks like BM25.

    The text is lowercased, normalized to Unicode NFC, stripped of every
    character that is neither a word character nor whitespace, and finally
    split on whitespace.

    Punctuation is deleted rather than replaced by a space, so a token such
    as "200.000đ" becomes "200000đ". Underscores count as word characters
    and are kept.

    Args:
        text (str): The input Vietnamese text.

    Returns:
        List[str]: A list of tokens. The list is empty when ``text`` is not
        a string or contains no tokens.
    """
    if not isinstance(text, str):
        # Deliberately lenient: non-string input (e.g. None) yields no tokens
        # instead of raising a TypeError.
        return []
    # Compose base letters with their combining marks before stripping.
    # Python's \w does not match combining marks, so decomposed (NFD) input
    # would otherwise lose its Vietnamese tone/vowel marks and tokenize
    # differently from the same text in composed form.
    text = unicodedata.normalize('NFC', text.lower())
    # Remove characters that are not alphanumeric/underscore or whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    return text.split()

# You can add other general utility functions here as your project grows.
# For example:
# - Functions for logging
# - Functions for path manipulation if they are used across multiple modules
# - Simple data validation or cleaning routines not specific to law data or LLMs

if __name__ == '__main__':
    # Ad-hoc smoke check: runs only when this module is executed as a script.
    print("Testing utils.py...")

    print("\n--- Test tokenize_vi_simple ---")
    divider = "-" * 10
    # Ordinary sentences plus edge cases: padded text, the empty string,
    # and two non-string values (None and an int).
    samples = (
        "Luật Giao thông Đường bộ Việt Nam 2023!",
        "Xe ô tô con và xe máy.",
        "  Phạt tiền từ 200.000đ đến 400.000đ.  ",
        "",
        None,
        123,
    )
    for sample in samples:
        type_name = type(sample).__name__
        print(f"Input: '{sample}' (type: {type_name})")
        print(f"Tokens: {tokenize_vi_simple(sample)}")
        print(divider)