lidp committed on
Commit
f47b156
·
1 Parent(s): dbcbb17

Fix tokenizer bug (#2573)

Browse files

### What problem does this PR solve?

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Files changed (1) hide show
  1. rag/nlp/rag_tokenizer.py +1 -1
rag/nlp/rag_tokenizer.py CHANGED
@@ -64,7 +64,7 @@ class RagTokenizer:
64
  self.stemmer = PorterStemmer()
65
  self.lemmatizer = WordNetLemmatizer()
66
 
67
- self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=γ€Šγ€‹οΌŒγ€‚οΌŸγ€οΌ›β€˜β€™οΌšβ€œβ€γ€γ€‘~!οΏ₯%β€¦β€¦οΌˆοΌ‰β€”β€”-]+|[a-z\.-]+|[0-9,\.-]+)"
68
  try:
69
  self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
70
  return
 
64
  self.stemmer = PorterStemmer()
65
  self.lemmatizer = WordNetLemmatizer()
66
 
67
+ self.SPLIT_CHAR = r"([ ,\.<>/?;:'\[\]\\`!@#$%^&*\(\)\{\}\|_+=γ€Šγ€‹οΌŒγ€‚οΌŸγ€οΌ›β€˜β€™οΌšβ€œβ€γ€γ€‘~!οΏ₯%β€¦β€¦οΌˆοΌ‰β€”β€”-]+|[a-z\.-]+|[0-9,\.-]+)"
68
  try:
69
  self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
70
  return