from transformers import PreTrainedTokenizerFast
import re
import unicodedata

REPLACEMENTS = {
    # whitespace
    "\n": " ",
    "\r": " ",
    "\t": " ",
    "\xa0": " ",
    "\u2009": " ",
    "\u202f": " ",
    "\u200b": "",
    # quotes
    "‘": "'",
    "’": "'",
    "‚": "'",
    "‛": "'",
    "“": '"',
    "”": '"',
    "„": '"',
    "«": '"',
    "»": '"',
    # dashes
    "–": "-",
    "—": "-",
    "−": "-",
    "-": "-",
    # ellipsis
    "…": "...",
    # bullets & symbols
    "•": ".",
    "∙": ".",
    "·": ".",
    # currencies
    "€": " euros",
    "$": " dollars",
    "£": " pounds",
    "¥": " yen",
    # misc
    "°": " degrees",
    "©": "",
    "®": "",
    "™": "",
}


def clean_text(text: str) -> str:
    """NFKC-normalize, apply the replacement table above, and collapse whitespace."""
    text = unicodedata.normalize("NFKC", text)
    for src, tgt in REPLACEMENTS.items():
        text = text.replace(src, tgt)
    text = re.sub(r"\s+", " ", text.strip())  # collapse runs of whitespace into single spaces
    return text


class BasicTextProcessor:
    """
    Basic text processor on top of a character level BPE model.
    """

    def __init__(self, tokenizer_file: str):
        self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file)

    def normalize(self, text: str) -> str:
        """Pre-normalization: Unicode NFKC, quote/dash/symbol replacement, whitespace cleanup."""
        return clean_text(text)

    def __call__(self, text: str, **tokenizer_kwargs):
        """Normalize then tokenize."""
        text = self.normalize(text)
        return self.tokenizer.encode(text, **tokenizer_kwargs)

    def detokenize(self, token_ids):
        """Optional: convert token ids back to a string."""
        out = self.tokenizer.decode(token_ids, skip_special_tokens=False)
        # The character-level tokenizer joins tokens with single spaces, so a real
        # space in the original text decodes as a run of three spaces. Protect the
        # real spaces with a placeholder, strip the joiner spaces, then restore.
        whitespace = "##[WHITESPACE]"
        return out.replace("   ", whitespace).replace(" ", "").replace(whitespace, " ")
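

# Minimal usage sketch. Assumptions not taken from this file: a trained
# character-level BPE tokenizer saved as "tokenizer.json" (the path is only
# illustrative), and an input string with typographic punctuation.
if __name__ == "__main__":
    sample = "He said \u2014 \u201cthanks\u201d\u2026\n\tSee you\u00a0soon."
    print(clean_text(sample))  # -> He said - "thanks"... See you soon.

    processor = BasicTextProcessor("tokenizer.json")  # hypothetical tokenizer path
    ids = processor(sample)
    print(ids)
    print(processor.detokenize(ids))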