lidp
committed on
Commit
·
f47b156
1
Parent(s):
dbcbb17
Fix tokenizer bug (#2573)
Browse files
### What problem does this PR solve?
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
- rag/nlp/rag_tokenizer.py +1 -1
rag/nlp/rag_tokenizer.py
CHANGED
@@ -64,7 +64,7 @@ class RagTokenizer:
|
|
64 |
self.stemmer = PorterStemmer()
|
65 |
self.lemmatizer = WordNetLemmatizer()
|
66 |
|
67 |
-
self.SPLIT_CHAR = r"([
|
68 |
try:
|
69 |
self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
|
70 |
return
|
|
|
64 |
self.stemmer = PorterStemmer()
|
65 |
self.lemmatizer = WordNetLemmatizer()
|
66 |
|
67 |
+
self.SPLIT_CHAR = r"([ ,\.<>/?;:'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)"
|
68 |
try:
|
69 |
self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
|
70 |
return
|